You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by tq...@apache.org on 2022/06/01 21:39:24 UTC

[tvm-site] branch asf-site updated: deploying docs (apache/tvm@b9890dbbebeff95202a7dc65cbce3e808869cd33)

This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/tvm-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
     new 2b30652ff deploying docs (apache/tvm@b9890dbbebeff95202a7dc65cbce3e808869cd33)
2b30652ff is described below

commit 2b30652ff8719c2647e0d31bb81d703c5fc7306f
Author: tvm-bot <95...@users.noreply.github.com>
AuthorDate: Wed Jun 1 21:39:20 2022 +0000

    deploying docs (apache/tvm@b9890dbbebeff95202a7dc65cbce3e808869cd33)
---
 docs/_sources/contribute/ci.rst.txt                |  108 -
 .../how_to/compile_models/from_mxnet.rst.txt       |    2 +-
 .../how_to/compile_models/from_oneflow.rst.txt     |    2 +-
 .../how_to/compile_models/from_paddle.rst.txt      |    2 +-
 .../how_to/compile_models/from_pytorch.rst.txt     |    2 +-
 .../how_to/compile_models/from_tensorflow.rst.txt  |    5 -
 .../compile_models/sg_execution_times.rst.txt      |   20 +-
 .../deploy_models/deploy_model_on_android.rst.txt  |    2 +-
 .../deploy_object_detection_pytorch.rst.txt        |    4 +-
 .../deploy_models/deploy_prequantized.rst.txt      |    6 +-
 .../deploy_prequantized_tflite.rst.txt             |    4 +-
 .../how_to/deploy_models/deploy_quantized.rst.txt  |    2 +-
 .../deploy_models/deploy_ssd_gluoncv.rst.txt       |    4 +-
 .../deploy_models/sg_execution_times.rst.txt       |   18 +-
 .../extend_tvm/bring_your_own_datatypes.rst.txt    |    2 +-
 .../how_to/extend_tvm/sg_execution_times.rst.txt   |   10 +-
 .../how_to/extend_tvm/use_pass_instrument.rst.txt  |   16 +-
 .../optimize_operators/opt_conv_cuda.rst.txt       |    2 +-
 .../optimize_operators/opt_conv_tensorcore.rst.txt |    2 +-
 .../how_to/optimize_operators/opt_gemm.rst.txt     |   16 +-
 .../optimize_operators/sg_execution_times.rst.txt  |    8 +-
 .../sg_execution_times.rst.txt                     |   16 +-
 .../tune_conv2d_layer_cuda.rst.txt                 | 5832 +++++++-------------
 .../tune_network_cuda.rst.txt                      |    2 +-
 .../tune_network_x86.rst.txt                       |    4 +-
 .../tune_sparse_x86.rst.txt                        |  173 +-
 .../tune_with_autotvm/sg_execution_times.rst.txt   |   12 +-
 .../tune_with_autotvm/tune_conv2d_cuda.rst.txt     |   34 +-
 .../work_with_microtvm/micro_autotune.rst.txt      |   16 +-
 .../work_with_microtvm/sg_execution_times.rst.txt  |   10 +-
 .../work_with_relay/sg_execution_times.rst.txt     |    8 +-
 .../work_with_schedules/sg_execution_times.rst.txt |   18 +-
 .../how_to/work_with_schedules/tensorize.rst.txt   |    2 +-
 .../tutorials/autotvm/sg_execution_times.rst.txt   |    6 +-
 .../frontend/deploy_classification.rst.txt         |    2 +-
 .../tutorials/frontend/deploy_detection.rst.txt    |    2 +-
 .../tutorials/frontend/sg_execution_times.rst.txt  |    6 +-
 .../tutorials/optimize/sg_execution_times.rst.txt  |    6 +-
 .../topic/vta/tutorials/sg_execution_times.rst.txt |    6 +-
 .../tutorial/auto_scheduler_matmul_x86.rst.txt     |    2 +-
 docs/_sources/tutorial/autotvm_relay_x86.rst.txt   |   54 +-
 .../tutorial/cross_compilation_and_rpc.rst.txt     |    2 +-
 docs/_sources/tutorial/intro_topi.rst.txt          |    2 +-
 docs/_sources/tutorial/sg_execution_times.rst.txt  |   26 +-
 .../tutorial/tensor_expr_get_started.rst.txt       |   40 +-
 docs/commit_hash                                   |    2 +-
 docs/contribute/ci.html                            |  118 +-
 docs/contribute/index.html                         |    4 -
 docs/how_to/compile_models/from_mxnet.html         |    2 +-
 docs/how_to/compile_models/from_oneflow.html       |  131 +-
 docs/how_to/compile_models/from_paddle.html        |    2 +-
 docs/how_to/compile_models/from_pytorch.html       |    7 +-
 docs/how_to/compile_models/from_tensorflow.html    |    1 -
 docs/how_to/compile_models/sg_execution_times.html |   20 +-
 .../deploy_models/deploy_model_on_android.html     |    2 +-
 .../deploy_object_detection_pytorch.html           |   20 +-
 docs/how_to/deploy_models/deploy_prequantized.html |    9 +-
 .../deploy_models/deploy_prequantized_tflite.html  |    4 +-
 docs/how_to/deploy_models/deploy_quantized.html    |    2 +-
 docs/how_to/deploy_models/deploy_ssd_gluoncv.html  |   34 +-
 docs/how_to/deploy_models/sg_execution_times.html  |   18 +-
 .../extend_tvm/bring_your_own_datatypes.html       |    2 +-
 docs/how_to/extend_tvm/sg_execution_times.html     |   10 +-
 docs/how_to/extend_tvm/use_pass_instrument.html    |   16 +-
 docs/how_to/optimize_operators/opt_conv_cuda.html  |    2 +-
 .../optimize_operators/opt_conv_tensorcore.html    |    2 +-
 docs/how_to/optimize_operators/opt_gemm.html       |   16 +-
 .../optimize_operators/sg_execution_times.html     |    8 +-
 .../sg_execution_times.html                        |   14 +-
 .../tune_conv2d_layer_cuda.html                    | 5832 +++++++-------------
 .../tune_with_autoscheduler/tune_network_cuda.html |    2 +-
 .../tune_with_autoscheduler/tune_network_x86.html  |    4 +-
 .../tune_with_autoscheduler/tune_sparse_x86.html   |  173 +-
 .../tune_with_autotvm/sg_execution_times.html      |   12 +-
 .../how_to/tune_with_autotvm/tune_conv2d_cuda.html |   34 +-
 docs/how_to/work_with_microtvm/micro_autotune.html |   16 +-
 .../work_with_microtvm/sg_execution_times.html     |   10 +-
 .../how_to/work_with_relay/sg_execution_times.html |    8 +-
 .../work_with_schedules/sg_execution_times.html    |   18 +-
 docs/how_to/work_with_schedules/tensorize.html     |    2 +-
 docs/objects.inv                                   |  Bin 22422 -> 22411 bytes
 docs/reference/api/python/auto_scheduler.html      |    4 +-
 .../api/typedoc/classes/bytestreamreader.html      |   12 +-
 .../api/typedoc/classes/cachedcallstack.html       |   34 +-
 docs/reference/api/typedoc/classes/dldatatype.html |   12 +-
 docs/reference/api/typedoc/classes/dldevice.html   |   10 +-
 .../reference/api/typedoc/classes/environment.html |   12 +-
 docs/reference/api/typedoc/classes/ffilibrary.html |   20 +-
 .../api/typedoc/classes/graphexecutor.html         |   16 +-
 docs/reference/api/typedoc/classes/instance.html   |   40 +-
 docs/reference/api/typedoc/classes/memory.html     |   34 +-
 docs/reference/api/typedoc/classes/module.html     |   10 +-
 docs/reference/api/typedoc/classes/ndarray.html    |   22 +-
 .../api/typedoc/classes/packedfunccell.html        |    6 +-
 docs/reference/api/typedoc/classes/rpcserver.html  |   14 +-
 docs/reference/api/typedoc/classes/scalar.html     |    6 +-
 .../api/typedoc/classes/webgpucontext.html         |   12 +-
 docs/reference/api/typedoc/enums/argtypecode.html  |   30 +-
 .../api/typedoc/enums/aynccallbackcode.html        |    4 +-
 .../api/typedoc/enums/dldatatypecode.html          |    8 +-
 .../api/typedoc/enums/rpcserverstate.html          |   12 +-
 docs/reference/api/typedoc/enums/sizeof.html       |   18 +-
 docs/reference/api/typedoc/index.html              |  112 +-
 .../api/typedoc/interfaces/disposable.html         |    2 +-
 .../api/typedoc/interfaces/functioninfo.html       |    6 +-
 .../api/typedoc/interfaces/libraryprovider.html    |    4 +-
 docs/searchindex.js                                |    2 +-
 .../vta/tutorials/autotvm/sg_execution_times.html  |    6 +-
 .../tutorials/frontend/deploy_classification.html  |    2 +-
 .../vta/tutorials/frontend/deploy_detection.html   |    2 +-
 .../vta/tutorials/frontend/sg_execution_times.html |    6 +-
 .../vta/tutorials/optimize/sg_execution_times.html |    6 +-
 docs/topic/vta/tutorials/sg_execution_times.html   |    6 +-
 docs/tutorial/auto_scheduler_matmul_x86.html       |    2 +-
 docs/tutorial/autotvm_relay_x86.html               |  268 +-
 docs/tutorial/cross_compilation_and_rpc.html       |    2 +-
 docs/tutorial/intro_topi.html                      |    2 +-
 docs/tutorial/sg_execution_times.html              |   26 +-
 docs/tutorial/tensor_expr_get_started.html         |   40 +-
 119 files changed, 4981 insertions(+), 8924 deletions(-)

diff --git a/docs/_sources/contribute/ci.rst.txt b/docs/_sources/contribute/ci.rst.txt
index d40e4d5ab..0cc1bf9dd 100644
--- a/docs/_sources/contribute/ci.rst.txt
+++ b/docs/_sources/contribute/ci.rst.txt
@@ -63,114 +63,6 @@ Reproduce Failures
 
 Most TVM Python tests run under |pytest|_ and can be run as described in :ref:`pr-testing`.
 
-Keeping CI Green
-****************
-
-Developers rely on the TVM CI to get signal on their PRs before merging.
-Occasionally breakages slip through and break ``main``, which in turn causes
-the same error to show up on an PR that is based on the broken commit(s). Broken
-commits can be identified `through GitHub <https://github.com/apache/tvm/commits/main>`_
-via the commit status icon or via `Jenkins <https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/activity?branch=main>`_.
-In these situations it is possible to either revert the offending commit or
-submit a forward fix to address the issue. It is up to the committer and commit
-author which option to choose, keeping in mind that a broken CI affects all TVM
-developers and should be fixed as soon as possible.
-
-Skip CI for Reverts
--------------------
-
-For reverts and trivial forward fixes, adding ``[skip ci]`` to the revert's
-PR title will cause CI to shortcut and only run lint. Committers should
-take care that they only merge CI-skipped PRs to fix a failure on ``main`` and
-not in cases where the submitter wants to shortcut CI to merge a change faster.
-The PR title is checked when the build is first run (specifically during the lint
-step, so changes after that has run do not affect CI and will require the job to
-be re-triggered by another ``git push``).
-
-.. code:: bash
-
-  # Revert HEAD commit, make sure to insert '[skip ci]' at the beginning of
-  # the commit subject
-  git revert HEAD
-  git checkout -b my_fix
-  # After you have pushed your branch, create a PR as usual.
-  git push my_repo
-  # Example: Skip CI on a branch with an existing PR
-  # Adding this commit to an existing branch will cause a new CI run where
-  # Jenkins is skipped
-  git commit --allow-empty --message "[skip ci] Trigger skipped CI"
-  git push my_repo
-
-Handling Flaky Failures
-***********************
-
-.. https://stackoverflow.com/questions/4743845/format-text-in-a-link-in-restructuredtext/4836544#4836544
-.. |pytest's @xfail decorator| replace:: pytest's ``@xfail`` decorator
-.. _pytest's @xfail decorator: https://docs.pytest.org/en/6.2.x/skipping.html#xfail-mark-test-functions-as-expected-to-fail
-.. |strict=True| replace:: ``strict=True``
-.. _strict=True: https://docs.pytest.org/en/6.2.x/skipping.html#strict-parameter
-
-If you notice a failure on your PR that seems unrelated to your change, you should
-search `recent GitHub issues related to flaky tests <https://github.com/apache/tvm/issues?q=is%3Aissue+%5BCI+Problem%5D+Flaky+>`_ and
-`file a new issue <https://github.com/apache/tvm/issues/new?assignees=&labels=&template=ci-problem.md&title=%5BCI+Problem%5D+>`_
-if you don't see any reports of the failure. If a certain test or class of tests affects
-several PRs or commits on ``main`` with flaky failures, the test should be disabled via
-|pytest's @xfail decorator|_ with |strict=True|_ and the relevant issue linked in the
-disabling PR.
-
-.. code:: python
-
-    @pytest.mark.xfail(strict=False, reason="Flaky test: https://github.com/apache/tvm/issues/1234")
-    def test_something_flaky():
-        pass
-
-``ci-docker-staging``
-*********************
-
-The `ci-docker-staging <https://github.com/apache/tvm/tree/ci-docker-staging>`_
-branch is used to test updates to Docker images and ``Jenkinsfile`` changes. When
-running a build for a normal PR from a forked repository, Jenkins uses the code
-from the PR except for the ``Jenkinsfile`` itself, which comes from the base branch.
-When branches are built, the ``Jenkinsfile`` in the branch is used, so a committer
-with write access must push PRs to a branch in apache/tvm to properly test
-``Jenkinsfile`` changes. If your PR makes changes to the ``Jenkinsfile``, make sure
-to @ a `committer <https://github.com/apache/tvm/blob/main/CONTRIBUTORS.md>`_
-and ask them to push your PR as a branch to test the changes.
-
-.. _docker_images:
-
-Docker Images
-*************
-
-.. |top_of_the_Jenkinsfile| replace:: top of the ``Jenkinsfile``
-.. _top_of_the_Jenkinsfile: https://github.com/apache/tvm/blob/7481a297740f073b193a3f09b3e27f056e8c7f2e/Jenkinsfile#L48-L54
-
-Each CI job runs most of its work inside a Docker container, built from files
-in the `docker/ <https://github.com/apache/tvm/tree/main/docker>`_ folder. These
-files are built nightly in Jenkins via the `docker-images-ci <https://ci.tlcpack.ai/job/docker-images-ci/>`_ job.
-The images for these containers are hosted in the `tlcpack Docker Hub <https://hub.docker.com/u/tlcpack>`_
-and referenced at the |top_of_the_Jenkinsfile|_. These can be inspected and run
-locally via standard Docker commands.
-
-.. code:: bash
-
-    # Beware: CI images can be several GB in size
-    # Get a bare docker shell in the ci-gpu container
-    docker run -it tlcpack/ci-gpu:v0.78 /bin/bash
-
-``docker/bash.sh`` will automatically grab the latest image from the ``Jenkinsfile``
-and help in mounting your current directory.
-
-.. code:: bash
-
-    # Run the ci_cpu image specified in Jenkinsfile
-    cd tvm
-    bash docker/bash.sh ci_cpu
-    # the tvm directory is automatically mounted
-    # example: build tvm (note: this will overrwrite build/)
-    $ ./tests/scripts/task_config_build_cpu.sh
-    $ ./tests/scripts/task_build.sh build -j32
-
 
 Reporting Issues
 ****************
diff --git a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
index 84bc7260f..406bcb165 100644
--- a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
@@ -98,7 +98,7 @@ In this section, we download a pretrained imagenet model and classify an image.
 
  .. code-block:: none
 
-    Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip01b8b8fa-eb13-4bfa-b333-c520e5c61568 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+    Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip674fcbf0-c9e1-47b7-be5a-bdc8ccacdf0b from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
     x (1, 3, 224, 224)
 
 
diff --git a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
index 2cdf1c4e4..ecbeddf22 100644
--- a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
@@ -100,7 +100,7 @@ Load a pretrained OneFlow model and save model
  .. code-block:: none
 
     Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
-
      0%|          | 0.00/41.5M [00:00<?, ?B/s]
      0%|          | 16.0k/41.5M [00:00<07:51, 92.3kB/s]
      0%|          | 48.0k/41.5M [00:00<04:57, 146kB/s] 
      0%|          | 72.0k/41.5M [00:00<05:05, 142kB/s]
      0%|          | 144k/41.5M [00:00<02:53, 249kB/s] 
      1%|          | 288k/41.5M [00:00<01:34, 457kB/s]
      1%|1         | 584k/41.5M [00:01<00:48, 879kB/s]
      2%|2         | 992k/41.5M [00:01<00:31, 1.36MB/s]
      3%|3         | 1.36M/41.5M [00:01<00:25, 1.66MB/s]
      4%|4         | 1.76M/41.5M [00:01<00:22, 1.87MB/s]
      5%|5         | 2.16M/41.5M [00:01<00:20, 2.03MB/s]
      6%|6         | 2.60M/41.5M [00:01<00:18, 2.19MB/s]
      7%|7         | 3.05M/41.5M [00:02<00:17, 2.32MB/s]
      8%|8         | 3.51M/41.5M [00:02<00:16, 2.44MB/s]
     10%|9         | 3.98M/41.5M [00:02<00:15, 2.53MB/s]
     11%|#         | 4.48M/41.5M [00:02<00:14, 2.66MB/s]
     12%|#2        | 5.00M/41.5M [00:02<00:13, 2.78MB/s]
     13%|#3        | 5.55M/41.5M [00:03<00:12, 2.91MB/s]
     15%|#4        | 6.12M/41.5M [00:03<00:12, 3.04MB/s]
     16%|#6        | 6.70M/41.5M [00:03<00:11, 3.16MB/s]
     18%|#7        | 7.30M/41.5M [00:03<00:10, 3.27MB/s]
     19%|#9        | 7.93M/41.5M [00:03<00:10, 3.39MB/s]
     21%|##        | 8.57M/41.5M [00:03<00:09, 3.51MB/s]
     22%|##2       | 9.23M/41.5M [00:04<00:09, 3.62MB/s]
     24%|##3       | 9.93M/41.5M [00:04<00:08, 3.76MB/s]
     26%|##5       | 10.7M/41.5M [00:04<00:08, 3.93MB/s]
     28%|##7       | 11.4M/41.5M [00:04<00:07, 4.10MB/s]
     29%|##9       | 12.2M/41.5M [00:04<00:07, 4.29MB/s]
     32%|###1      | 13.1M/41.5M [00:04<00:06, 4.47MB/s]
     34%|###3      | 14.0M/41.5M [00:05<00:06, 4.69MB/s]
     36%|###5      | 14.9M/41.5M [00:05<00:05, 4.91MB/s]
     38%|###8      | 15.8M/41.5M [00:05<00:05, 5.14MB/s]
     41%|####      | 16.9M/41.5M [00:05<00:04, 5.38MB/s]
     43%|####3     | 17.9M/41.5M [00:05<00:04, 5.62MB/s]
     46%|####5     | 19.0M/41.5M [00:06<00:04, 5.89MB/s]
     49%|####8     | 20.2M/41.5M [00:06<00:03, 6.18MB/s]
     52%|#####1    | 21.4M/41.5M [00:06<00:03, 6.48MB/s]
     55%|#####4    | 22.7M/41.5M [00:06<00:02, 6.80MB/s]
     58%|#####7    | 24.0M/41.5M [00:06<00:02, 7.13MB/s]
     61%|######1   | 25.5M/41.5M [00:06<00:02, 7.49MB/s]
     65%|######4   | 26.9M/41.5M [00:07<00:01, 7.84MB/s]
     68%|######8   | 28.4M/41.5M [00:07<00:01, 8.09MB/s]
     72%|#######2  | 29.9M/41.5M [00:07<00:01, 8.27MB/s]
     76%|#######5  | 31.4M/41.5M [00:07<00:01, 8.39MB/s]
     79%|#######9  | 32.8M/41.5M [00:07<00:01, 8.48MB/s]
     83%|########2 | 34.3M/41.5M [00:08<00:00, 8.52MB/s]
     86%|########6 | 35.8M/41.5M [00:08<00:00, 8.58MB/s]
     90%|########9 | 37.3M/41.5M [00:08<00:00, 8.60MB/s]
     93%|#########3| 38.7M/41.5M [00:08<00:00, 8.61MB/s]
     97%|#########6| 40.2M/41.5M [00:08<00:00, 8.63MB/s]
    100%|##########| 41.5M/41.5M [00:08<00:00, 4.97MB/s]
+
      0%|          | 0.00/41.5M [00:00<?, ?B/s]
      0%|          | 16.0k/41.5M [00:00<07:47, 93.1kB/s]
      0%|          | 48.0k/41.5M [00:00<04:55, 147kB/s] 
      0%|          | 96.0k/41.5M [00:00<03:26, 211kB/s]
      0%|          | 184k/41.5M [00:00<02:13, 325kB/s] 
      1%|          | 272k/41.5M [00:00<01:50, 391kB/s]
      1%|          | 368k/41.5M [00:01<01:36, 447kB/s]
      1%|1         | 464k/41.5M [00:01<01:29, 482kB/s]
      1%|1         | 568k/41.5M [00:01<01:22, 520kB/s]
      2%|1         | 672k/41.5M [00:01<01:17, 552kB/s]
      2%|1         | 784k/41.5M [00:01<01:13, 582kB/s]
      2%|2         | 904k/41.5M [00:01<01:09, 616kB/s]
      2%|2         | 1.01M/41.5M [00:02<01:05, 647kB/s]
      3%|2         | 1.14M/41.5M [00:02<01:01, 688kB/s]
      3%|3         | 1.27M/41.5M [00:02<00:58, 718kB/s]
      3%|3         | 1.41M/41.5M [00:02<00:55, 761kB/s]
      4%|3         | 1.57M/41.5M [00:02<00:52, 801kB/s]
      4%|4         | 1.73M/41.5M [00:03<00:49, 838kB/s]
      5%|4         | 1.88M/41.5M [00:03<00:47, 875kB/s]
      5%|4         | 2.05M/41.5M [00:03<00:45, 918kB/s]
      5%|5         | 2.23M/41.5M [00:03<00:43, 948kB/s]
      6%|5         | 2.41M/41.5M [00:03<00:41, 983kB/s]
      6%|6         | 2.59M/41.5M [00:03<00:39, 1.02MB/s]
      7%|6         | 2.80M/41.5M [00:04<00:38, 1.06MB/s]
      7%|7         | 3.00M/41.5M [00:04<00:36, 1.11MB/s]
      8%|7         | 3.21M/41.5M [00:04<00:34, 1.15MB/s]
      8%|8         | 3.43M/41.5M [00:04<00:33, 1.21MB/s]
      9%|8         | 3.66M/41.5M [00:04<00:31, 1.26MB/s]
      9%|9         | 3.91M/41.5M [00:04<00:30, 1.31MB/s]
     10%|#         | 4.17M/41.5M [00:05<00:28, 1.38MB/s]
     11%|#         | 4.44M/41.5M [00:05<00:26, 1.45MB/s]
     11%|#1        | 4.73M/41.5M [00:05<00:25, 1.51MB/s]
     12%|#2        | 5.02M/41.5M [00:05<00:23, 1.60MB/s]
     13%|#2        | 5.34M/41.5M [00:05<00:22, 1.68MB/s]
     14%|#3        | 5.66M/41.5M [00:06<00:21, 1.76MB/s]
     14%|#4        | 6.01M/41.5M [00:06<00:18, 1.99MB/s]
     15%|#5        | 6.36M/41.5M [00:06<00:16, 2.18MB/s]
     16%|#6        | 6.65M/41.5M [00:06<00:16, 2.20MB/s]
     17%|#6        | 6.87M/41.5M [00:06<00:17, 2.04MB/s]
     17%|#7        | 7.11M/41.5M [00:06<00:19, 1.87MB/s]
     18%|#8        | 7.52M/41.5M [00:06<00:17, 2.05MB/s]
     19%|#9        | 7.94M/41.5M [00:07<00:16, 2.19MB/s]
     20%|##        | 8.38M/41.5M [00:07<00:14, 2.32MB/s]
     21%|##1       | 8.83M/41.5M [00:07<00:14, 2.43MB/s]
     22%|##2       | 9.30M/41.5M [00:07<00:13, 2.55MB/s]
     24%|##3       | 9.80M/41.5M [00:07<00:12, 2.68MB/s]
     25%|##4       | 10.3M/41.5M [00:07<00:11, 2.79MB/s]
     26%|##6       | 10.9M/41.5M [00:08<00:10, 2.93MB/s]
     28%|##7       | 11.4M/41.5M [00:08<00:09, 3.33MB/s]
     29%|##8       | 12.0M/41.5M [00:08<00:08, 3.63MB/s]
     30%|##9       | 12.4M/41.5M [00:08<00:08, 3.66MB/s]
     31%|###       | 12.7M/41.5M [00:08<00:09, 3.14MB/s]
     32%|###1      | 13.3M/41.5M [00:08<00:09, 3.17MB/s]
     34%|###3      | 13.9M/41.5M [00:08<00:07, 3.73MB/s]
     35%|###5      | 14.6M/41.5M [00:09<00:07, 3.85MB/s]
     37%|###6      | 15.3M/41.5M [00:09<00:07, 3.72MB/s]
     39%|###8      | 16.1M/41.5M [00:09<00:06, 4.24MB/s]
     41%|####      | 16.9M/41.5M [00:09<00:05, 4.68MB/s]
     42%|####2     | 17.5M/41.5M [00:09<00:04, 5.09MB/s]
     43%|####3     | 18.0M/41.5M [00:09<00:05, 4.46MB/s]
     45%|####4     | 18.6M/41.5M [00:10<00:05, 4.52MB/s]
     47%|####6     | 19.5M/41.5M [00:10<00:04, 5.20MB/s]
     48%|####8     | 20.1M/41.5M [00:10<00:04, 5.57MB/s]
     50%|####9     | 20.7M/41.5M [00:10<00:04, 4.77MB/s]
     52%|#####1    | 21.4M/41.5M [00:10<00:04, 5.25MB/s]
     54%|#####4    | 22.5M/41.5M [00:10<00:03, 6.51MB/s]
     56%|#####5    | 23.1M/41.5M [00:10<00:03, 6.11MB/s]
     57%|#####7    | 23.7M/41.5M [00:10<00:03, 5.26MB/s]
     60%|#####9    | 24.7M/41.5M [00:11<00:03, 5.61MB/s]
     62%|######2   | 25.9M/41.5M [00:11<00:02, 6.65MB/s]
     65%|######5   | 27.2M/41.5M [00:11<00:01, 8.09MB/s]
     67%|######7   | 28.0M/41.5M [00:11<00:01, 7.52MB/s]
     69%|######9   | 28.8M/41.5M [00:11<00:02, 6.51MB/s]
     72%|#######2  | 29.9M/41.5M [00:11<00:01, 6.86MB/s]
     76%|#######5  | 31.4M/41.5M [00:12<00:01, 7.41MB/s]
     79%|#######9  | 32.8M/41.5M [00:12<00:01, 7.83MB/s]
     83%|########2 | 34.3M/41.5M [00:12<00:00, 7.93MB/s]
     86%|########6 | 35.8M/41.5M [00:12<00:00, 8.17MB/s]
     90%|########9 | 37.2M/41.5M [00:12<00:00, 8.33MB/s]
     93%|#########3| 38.7M/41.5M [00:12<00:00, 8.44MB/s]
     97%|#########6| 40.2M/41.5M [00:13<00:00, 8.52MB/s]
    100%|##########| 41.5M/41.5M [00:13<00:00, 3.31MB/s]
 
 
 
diff --git a/docs/_sources/how_to/compile_models/from_paddle.rst.txt b/docs/_sources/how_to/compile_models/from_paddle.rst.txt
index a9fd6a034..109e288bb 100644
--- a/docs/_sources/how_to/compile_models/from_paddle.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_paddle.rst.txt
@@ -210,7 +210,7 @@ Look up prediction top 1 index in 1000 class synset.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  7.419 seconds)
+   **Total running time of the script:** ( 1 minutes  7.170 seconds)
 
 
 .. _sphx_glr_download_how_to_compile_models_from_paddle.py:
diff --git a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
index 3801e100f..a41cf29be 100644
--- a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
@@ -79,7 +79,7 @@ Load a pretrained PyTorch model
  .. code-block:: none
 
     Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
-
      0%|          | 0.00/44.7M [00:00<?, ?B/s]
     19%|#9        | 8.62M/44.7M [00:00<00:00, 90.3MB/s]
     71%|#######   | 31.6M/44.7M [00:00<00:00, 179MB/s] 
    100%|##########| 44.7M/44.7M [00:00<00:00, 183MB/s]
+
      0%|          | 0.00/44.7M [00:00<?, ?B/s]
      7%|7         | 3.27M/44.7M [00:00<00:01, 33.9MB/s]
     15%|#4        | 6.70M/44.7M [00:00<00:01, 35.0MB/s]
     73%|#######2  | 32.6M/44.7M [00:00<00:00, 142MB/s] 
    100%|##########| 44.7M/44.7M [00:00<00:00, 135MB/s]
 
 
 
diff --git a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
index 800ee7c46..fa7a92713 100644
--- a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
@@ -379,11 +379,6 @@ Run the corresponding model on tensorflow
 
 
 
-.. rst-class:: sphx-glr-timing
-
-   **Total running time of the script:** ( 1 minutes  3.030 seconds)
-
-
 .. _sphx_glr_download_how_to_compile_models_from_tensorflow.py:
 
 
diff --git a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
index 7ec30a4b3..e48e211d1 100644
--- a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
@@ -5,15 +5,15 @@
 
 Computation times
 =================
-**05:38.347** total execution time for **how_to_compile_models** files:
+**05:21.764** total execution time for **how_to_compile_models** files:
 
-- **01:07.419**: :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``)
-- **01:03.030**: :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``)
-- **00:59.272**: :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``)
-- **00:33.312**: :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``)
-- **00:27.046**: :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``)
-- **00:24.126**: :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``)
-- **00:21.479**: :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``)
-- **00:21.297**: :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``)
-- **00:18.867**: :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``)
+- **01:07.170**: :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``)
+- **00:59.772**: :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``)
+- **00:55.447**: :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``)
+- **00:37.304**: :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``)
+- **00:23.793**: :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``)
+- **00:21.971**: :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``)
+- **00:21.093**: :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``)
+- **00:19.394**: :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``)
+- **00:13.320**: :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``)
 - **00:02.499**: :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``)
diff --git a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
index ee3649f92..3079c6c89 100644
--- a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
@@ -402,7 +402,7 @@ Execute on TVM
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      16.0851      15.9495      16.5632      15.8696       0.2554   
+      16.0734      15.9684      16.5769      15.6808       0.3410   
                
 
 
diff --git a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
index c5dc1a0ef..dea483bee 100644
--- a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
@@ -108,7 +108,7 @@ Load pre-trained maskrcnn from torchvision and do tracing
  .. code-block:: none
 
     Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
-
      0%|          | 0.00/170M [00:00<?, ?B/s]
      0%|          | 496k/170M [00:00<00:36, 4.93MB/s]
      1%|          | 984k/170M [00:00<00:39, 4.54MB/s]
     13%|#3        | 22.9M/170M [00:00<00:01, 102MB/s]
     23%|##2       | 38.9M/170M [00:00<00:01, 127MB/s]
     36%|###6      | 61.8M/170M [00:00<00:00, 167MB/s]
     50%|#####     | 85.2M/170M [00:00<00:00, 193MB/s]
     65%|######5   | 110M/170M [00:00<00:00, 216MB/s] 
     79%|#######9  | 134M/170M [00:00<00:00, 227MB/s]
     92%|#########1| 156M/170M [00:00<00:00, 209MB/s]
    100%|##########| 170M/170M [00:01<00:00, 177MB/s]
+
      0%|          | 0.00/170M [00:00<?, ?B/s]
      9%|8         | 14.4M/170M [00:00<00:01, 151MB/s]
     22%|##1       | 37.0M/170M [00:00<00:00, 201MB/s]
     35%|###5      | 59.8M/170M [00:00<00:00, 219MB/s]
     47%|####7     | 80.6M/170M [00:00<00:00, 217MB/s]
     62%|######2   | 106M/170M [00:00<00:00, 235MB/s] 
     78%|#######7  | 132M/170M [00:00<00:00, 246MB/s]
     91%|#########1| 155M/170M [00:00<00:00, 247MB/s]
    100%|##########| 170M/170M [00:00<00:00, 234MB/s]
     /usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:3878: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
       for i in range(dim)
     /usr/local/lib/python3.7/dist-packages/torchvision/models/detection/anchor_utils.py:127: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
@@ -262,7 +262,7 @@ Get boxes with score larger than 0.9
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 3 minutes  4.298 seconds)
+   **Total running time of the script:** ( 3 minutes  1.705 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_object_detection_pytorch.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
index 4455935ab..34cdee61e 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
@@ -187,7 +187,7 @@ training. Other models require a full post training calibration.
  .. code-block:: none
 
     Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
-
      0%|          | 0.00/13.6M [00:00<?, ?B/s]
     71%|#######1  | 9.62M/13.6M [00:00<00:00, 101MB/s]
    100%|##########| 13.6M/13.6M [00:00<00:00, 122MB/s]
+
      0%|          | 0.00/13.6M [00:00<?, ?B/s]
     30%|##9       | 4.04M/13.6M [00:00<00:00, 42.3MB/s]
     61%|######1   | 8.31M/13.6M [00:00<00:00, 43.8MB/s]
    100%|##########| 13.6M/13.6M [00:00<00:00, 61.3MB/s]
 
 
 
@@ -353,7 +353,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      90.2391      90.1584      91.7401      90.0002       0.2421   
+      90.3358      90.2240      95.5686      90.0991       0.5598   
                
 
 
@@ -393,7 +393,7 @@ TODO
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  5.584 seconds)
+   **Total running time of the script:** ( 1 minutes  5.752 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_prequantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
index 03787675c..acff076cc 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
@@ -360,7 +360,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      118.6064     118.5160     125.6847     117.7654      0.7983   
+      118.8476     118.7190     128.0987     117.9396      1.0735   
                
 
 
@@ -394,7 +394,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 2 minutes  4.454 seconds)
+   **Total running time of the script:** ( 1 minutes  56.256 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_prequantized_tflite.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
index d9c5a0c14..7bc68d688 100644
--- a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
@@ -223,7 +223,7 @@ We create a Relay VM to build and execute the model.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  31.885 seconds)
+   **Total running time of the script:** ( 1 minutes  51.334 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_quantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
index 5f0a223ec..0465361a0 100644
--- a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
@@ -137,7 +137,7 @@ Convert and compile model for CPU.
             data: None
       input_sym_arg_type = in_param.infer_type()[0]
     Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
-
      0%|          | 0/132723 [00:00<?, ?KB/s]
      5%|4         | 6345/132723 [00:00<00:01, 63435.84KB/s]
     11%|#1        | 15063/132723 [00:00<00:01, 77394.05KB/s]
     18%|#8        | 23903/132723 [00:00<00:01, 82416.06KB/s]
     25%|##4       | 32725/132723 [00:00<00:01, 84702.37KB/s]
     31%|###1      | 41509/132723 [00:00<00:01, 85828.95KB/s]
     38%|###7      | 50347/132723 [00:00<00:00, 86692.81KB/s]
     45%|####4     | 59254/132723 [00:00<00:00, 87468.15KB/s]
     51%|#####1    | 68159/132723 [00:00<00:00, 87968.46KB/s]
     58%|#####8    | 77041/132723 [00:00<00:00, 88231.64KB/s]
     65%|######4   | 85921/132723 [00:01<00:00, 88404.39KB/s]
     71%|#######1  | 94775/132723 [00:01<00:00, 88443.59KB/s]
     78%|#######8  | 103620/132723 [00:01<00:00, 88359.89KB/s]
     85%|########4 | 112529/132723 [00:01<00:00, 88578.24KB/s]
     91%|#########1| 121436/132723 [00:01<00:00, 88717.79KB/s]
     98%|#########8| 130337/132723 [00:01<00:00, 88802.67KB/s]
    100%|#######
 ###| 132723/132723 [00:01<00:00, 86789.35KB/s]
+
      0%|          | 0/132723 [00:00<?, ?KB/s]
      5%|4         | 6594/132723 [00:00<00:01, 65922.46KB/s]
     11%|#1        | 15107/132723 [00:00<00:01, 77217.54KB/s]
     18%|#7        | 23884/132723 [00:00<00:01, 82032.74KB/s]
     24%|##4       | 32088/132723 [00:00<00:01, 77000.78KB/s]
     30%|###       | 39832/132723 [00:00<00:01, 68793.41KB/s]
     37%|###6      | 48487/132723 [00:00<00:01, 74225.49KB/s]
     43%|####2     | 56691/132723 [00:00<00:00, 76600.03KB/s]
     49%|####9     | 65442/132723 [00:00<00:00, 79902.42KB/s]
     56%|#####5    | 74129/132723 [00:00<00:00, 82004.80KB/s]
     62%|######2   | 82867/132723 [00:01<00:00, 83621.97KB/s]
     69%|######8   | 91504/132723 [00:01<00:00, 84438.72KB/s]
     76%|#######5  | 100269/132723 [00:01<00:00, 85400.43KB/s]
     82%|########2 | 109030/132723 [00:01<00:00, 86063.02KB/s]
     89%|########8 | 117744/132723 [00:01<00:00, 86376.21KB/s]
     95%|#########5| 126445/132723 [00:01<00:00, 86564.76KB/s]
    100%|#######
 ###| 132723/132723 [00:01<00:00, 81643.73KB/s]
 
 
 
@@ -211,7 +211,7 @@ Display result
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 2 minutes  39.070 seconds)
+   **Total running time of the script:** ( 2 minutes  38.624 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_ssd_gluoncv.py:
diff --git a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
index ba45f1a03..c4b84afe1 100644
--- a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
@@ -5,13 +5,13 @@
 
 Computation times
 =================
-**11:16.413** total execution time for **how_to_deploy_models** files:
+**11:23.667** total execution time for **how_to_deploy_models** files:
 
-- **03:04.298**: :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``)
-- **02:39.070**: :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``)
-- **02:04.454**: :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``)
-- **01:31.885**: :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``)
-- **01:05.584**: :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``)
-- **00:28.683**: :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``)
-- **00:22.231**: :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``)
-- **00:00.207**: :ref:`sphx_glr_how_to_deploy_models_deploy_sparse.py` (``deploy_sparse.py``)
+- **03:01.705**: :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``)
+- **02:38.624**: :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``)
+- **01:56.256**: :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``)
+- **01:51.334**: :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``)
+- **01:05.752**: :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``)
+- **00:27.760**: :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``)
+- **00:22.042**: :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``)
+- **00:00.193**: :ref:`sphx_glr_how_to_deploy_models_deploy_sparse.py` (``deploy_sparse.py``)
diff --git a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
index 53c2b39d4..7eda15a9b 100644
--- a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
@@ -425,7 +425,7 @@ First let us define two helper functions to get the mobilenet model and a cat im
 
  .. code-block:: none
 
-    Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip56f6cf58-0e13-4247-9e1a-4d304384b2ba from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+    Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip70153db1-eac3-460b-9858-61569c71e123 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
 
 
 
diff --git a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
index 84b102e44..1ef286fc6 100644
--- a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
@@ -5,9 +5,9 @@
 
 Computation times
 =================
-**00:38.488** total execution time for **how_to_extend_tvm** files:
+**00:38.041** total execution time for **how_to_extend_tvm** files:
 
-- **00:34.938**: :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``)
-- **00:02.285**: :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``)
-- **00:01.055**: :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``)
-- **00:00.210**: :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``)
+- **00:34.545**: :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``)
+- **00:02.256**: :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``)
+- **00:01.037**: :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``)
+- **00:00.203**: :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``)
diff --git a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
index da55d9fdf..644c7a55b 100644
--- a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
@@ -199,10 +199,10 @@ profile the execution time of each passes.
  .. code-block:: none
 
     Printing results of timing profile...
-    InferType: 6042us [6042us] (44.79%; 44.79%)
-    FoldScaleAxis: 7446us [5us] (55.21%; 55.21%)
-            FoldConstant: 7441us [1494us] (55.17%; 99.93%)
-                    InferType: 5947us [5947us] (44.09%; 79.92%)
+    InferType: 5991us [5991us] (45.53%; 45.53%)
+    FoldScaleAxis: 7168us [5us] (54.47%; 54.47%)
+            FoldConstant: 7162us [1462us] (54.43%; 99.92%)
+                    InferType: 5700us [5700us] (43.31%; 79.58%)
 
 
 
@@ -239,10 +239,10 @@ Refer to following sections and :py:func:`tvm.instrument.pass_instrument` for th
  .. code-block:: none
 
     Printing results of timing profile...
-    InferType: 5874us [5874us] (43.91%; 43.91%)
-    FoldScaleAxis: 7504us [5us] (56.09%; 56.09%)
-            FoldConstant: 7499us [1758us] (56.06%; 99.93%)
-                    InferType: 5741us [5741us] (42.92%; 76.56%)
+    InferType: 5778us [5778us] (44.81%; 44.81%)
+    FoldScaleAxis: 7117us [4us] (55.19%; 55.19%)
+            FoldConstant: 7113us [1474us] (55.16%; 99.94%)
+                    InferType: 5639us [5639us] (43.73%; 79.28%)
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
index e68465b59..2287f2f9f 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
@@ -295,7 +295,7 @@ latency of convolution.
 
  .. code-block:: none
 
-    Convolution: 54.120575 ms
+    Convolution: 34.780645 ms
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
index fdc599dbf..7db8072a7 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
@@ -628,7 +628,7 @@ be able to run on our build server
 
  .. code-block:: none
 
-    conv2d with tensor core: 7.868202 ms
+    conv2d with tensor core: 8.070889 ms
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
index bd74dd28b..17f205232 100644
--- a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
@@ -118,8 +118,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
 
  .. code-block:: none
 
-    Numpy running time: 0.018608
-    Baseline: 3.227036
+    Numpy running time: 0.017973
+    Baseline: 3.184344
 
 
 
@@ -210,7 +210,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
 
  .. code-block:: none
 
-    Opt1: 0.301383
+    Opt1: 0.295863
 
 
 
@@ -309,7 +309,7 @@ In this tutorial, we chose to vectorize the inner loop row data since it is cach
 
  .. code-block:: none
 
-    Opt2: 0.337461
+    Opt2: 0.334231
 
 
 
@@ -401,7 +401,7 @@ the access pattern for A matrix is more cache friendly.
 
  .. code-block:: none
 
-    Opt3: 0.117425
+    Opt3: 0.116509
 
 
 
@@ -520,7 +520,7 @@ flattening.
 
  .. code-block:: none
 
-    Opt4: 0.110596
+    Opt4: 0.113170
 
 
 
@@ -638,7 +638,7 @@ write to C when all the block results are ready.
 
  .. code-block:: none
 
-    Opt5: 0.110941
+    Opt5: 0.109905
 
 
 
@@ -759,7 +759,7 @@ Futhermore, we can also utilize multi-core processors to do the thread-level par
 
  .. code-block:: none
 
-    Opt6: 0.145143
+    Opt6: 0.144260
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
index 99bde5346..66dedfaa2 100644
--- a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
@@ -5,8 +5,8 @@
 
 Computation times
 =================
-**00:34.552** total execution time for **how_to_optimize_operators** files:
+**00:34.160** total execution time for **how_to_optimize_operators** files:
 
-- **00:31.889**: :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``)
-- **00:01.444**: :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``)
-- **00:01.219**: :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``)
+- **00:31.505**: :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``)
+- **00:01.451**: :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``)
+- **00:01.204**: :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``)
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
index 3ac3cbc7c..fa8596aaa 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
@@ -5,11 +5,11 @@
 
 Computation times
 =================
-**05:01.083** total execution time for **how_to_tune_with_autoscheduler** files:
-
-- **02:26.752**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``)
-- **01:19.468**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``)
-- **00:41.019**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``)
-- **00:16.416**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``)
-- **00:08.850**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``)
-- **00:08.577**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``)
+**05:01.349** total execution time for **how_to_tune_with_autoscheduler** files:
+
+- **02:27.893**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``)
+- **01:18.366**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``)
+- **00:40.456**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``)
+- **00:17.473**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``)
+- **00:08.884**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``)
+- **00:08.277**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``)
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
index 81bb0dfac..97b140604 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
@@ -222,166 +222,1035 @@ cooperative fetching, unrolling and operator fusion.
                  compute: Buffer(compute_2: Pointer(float32), float32, [25088], [])}
       buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute}
       preflattened_buffer_map = {data_1: data_3: Buffer(data_2, float32, [1, 512, 7, 7], []), kernel_1: kernel_3: Buffer(kernel_2, float32, [512, 512, 3, 3], []), bias_1: bias_3: Buffer(bias_2, float32, [1, 512, 1, 1], []), compute_1: compute_3: Buffer(compute_2, float32, [1, 512, 7, 7], [])} {
-      attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 56;
-      allocate(conv2d_nchw: Pointer(local float32), float32, [7]), storage_scope = local;
-      allocate(pad_temp.shared: Pointer(shared float32), float32, [216]), storage_scope = shared;
-      allocate(kernel.shared: Pointer(shared float32), float32, [4608]), storage_scope = shared;
-      attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
-        conv2d_nchw_1: Buffer(conv2d_nchw, float32, [1], [], scope="local", align=4)[0] = 0f32
-        conv2d_nchw_1[1] = 0f32
+      attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 64;
+      allocate(conv2d_nchw: Pointer(local float32), float32, [8]), storage_scope = local;
+      allocate(pad_temp.shared: Pointer(shared float32), float32, [1568]), storage_scope = shared;
+      allocate(kernel.shared: Pointer(shared float32), float32, [256]), storage_scope = shared;
+      attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49 {
+        conv2d_nchw_1: Buffer(conv2d_nchw, float32, [4], [], scope="local", align=8)[0] = 0f32
         conv2d_nchw_1[2] = 0f32
-        conv2d_nchw_1[3] = 0f32
         conv2d_nchw_1[4] = 0f32
-        conv2d_nchw_1[5] = 0f32
         conv2d_nchw_1[6] = 0f32
-        for (rc.outer.outer: int32, 0, 64) {
-          let cse_var_2: int32 = (rc.outer.outer*392)
-          let cse_var_1: int32 = (rc.outer.outer*72)
-           {
-            attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            pad_temp.shared_1: Buffer(pad_temp.shared, float32, [216], [], scope="shared")[threadIdx.x_1] = @tir.if_then_else(((((3 <= floormod(threadIdx.x_1, 27)) && (floormod(threadIdx.x_1, 27) < 24)) && (1 <= (floormod(blockIdx.x, 7) + floormod(threadIdx.x_1, 3)))) && ((floormod(blockIdx.x, 7) + floormod(threadIdx.x_1, 3)) < 8)), data[(((((cse_var_2 + (floordiv(threadIdx.x_1, 27)*49)) + (floordiv(floormod(threadIdx.x_1, 27), 3)*7)) + floormod(blockIdx.x, 7)) + floormod(threadIdx.x_1,  [...]
-            attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            pad_temp.shared_1[(threadIdx.x_1 + 64)] = @tir.if_then_else(((((3 <= floormod((threadIdx.x_1 + 64), 27)) && (floormod((threadIdx.x_1 + 10), 27) < 24)) && (1 <= (floormod(blockIdx.x, 7) + floormod((threadIdx.x_1 + 1), 3)))) && ((floormod(blockIdx.x, 7) + floormod((threadIdx.x_1 + 1), 3)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 64), 27)*49)) + (floordiv(floormod((threadIdx.x_1 + 10), 27), 3)*7)) + floormod(blockIdx.x, 7)) + floormod((threadIdx.x_1 + 1), 3)) - 8) [...]
-            attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            pad_temp.shared_1[(threadIdx.x_1 + 128)] = @tir.if_then_else(((((3 <= floormod((threadIdx.x_1 + 128), 27)) && (floormod((threadIdx.x_1 + 20), 27) < 24)) && (1 <= (floormod(blockIdx.x, 7) + floormod((threadIdx.x_1 + 2), 3)))) && ((floormod(blockIdx.x, 7) + floormod((threadIdx.x_1 + 2), 3)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 128), 27)*49)) + (floordiv(floormod((threadIdx.x_1 + 20), 27), 3)*7)) + floormod(blockIdx.x, 7)) + floormod((threadIdx.x_1 + 2), 3)) - [...]
-            attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            if @tir.likely((threadIdx.x_1 < 24), dtype=bool) {
-              pad_temp.shared_1[(threadIdx.x_1 + 192)] = @tir.if_then_else((((floormod((threadIdx.x_1 + 3), 27) < 24) && (1 <= (floormod(blockIdx.x, 7) + floormod(threadIdx.x_1, 3)))) && ((floormod(blockIdx.x, 7) + floormod(threadIdx.x_1, 3)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 192), 27)*49)) + (floormod((floordiv(threadIdx.x_1, 3) + 1), 9)*7)) + floormod(blockIdx.x, 7)) + floormod(threadIdx.x_1, 3)) - 8)], 0f32, dtype=float32)
-            }
-            attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared_1: Buffer(kernel.shared, float32, [4608], [], scope="shared")[ramp((threadIdx.x_2*4), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 18)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp((threadIdx.x_2*4), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp(threadIdx.x_2, 1, 4), broadcast(3, 4)))]
-            attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared_1[ramp(((threadIdx.x_2*4) + 256), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 256), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 256), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 64), 1, 4), broadcast(3, 4)))]
-            attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared_1[ramp(((threadIdx.x_2*4) + 512), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 512), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 512), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 128), 1, 4), broadcast(3, 4)))]
-            attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared_1[ramp(((threadIdx.x_2*4) + 768), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 768), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 768), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 192), 1, 4), broadcast(3, 4)))]
-            attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared_1[ramp(((threadIdx.x_2*4) + 1024), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 1024), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1024), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 256), 1, 4), broadcast(3, 4)))]
-            attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared_1[ramp(((threadIdx.x_2*4) + 1280), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 1280), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1280), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 320), 1, 4), broadcast(3, 4)))]
-            attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared_1[ramp(((threadIdx.x_2*4) + 1536), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 1536), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1536), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 384), 1, 4), broadcast(3, 4)))]
-            attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared_1[ramp(((threadIdx.x_2*4) + 1792), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 1792), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1792), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 448), 1, 4), broadcast(3, 4)))]
-            attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared_1[ramp(((threadIdx.x_2*4) + 2048), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 2048), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 2048), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 512), 1, 4), broadcast(3, 4)))]
-            attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared_1[ramp(((threadIdx.x_2*4) + 2304), 1, 4)] = kernel[((broadcast(((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 18)*4608)) + cse_var_1) + 147456), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 2304), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 576), 1, 4), broadcast(3, 4)))]
-            attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared_1[ramp(((threadIdx.x_2*4) + 2560), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 2560), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 2560), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 640), 1, 4), broadcast(3, 4)))]
-            attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared_1[ramp(((threadIdx.x_2*4) + 2816), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 2816), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 2816), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 704), 1, 4), broadcast(3, 4)))]
-            attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared_1[ramp(((threadIdx.x_2*4) + 3072), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 3072), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 3072), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 768), 1, 4), broadcast(3, 4)))]
-            attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared_1[ramp(((threadIdx.x_2*4) + 3328), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 3328), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 3328), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 832), 1, 4), broadcast(3, 4)))]
-            attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared_1[ramp(((threadIdx.x_2*4) + 3584), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 3584), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 3584), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 896), 1, 4), broadcast(3, 4)))]
-            attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared_1[ramp(((threadIdx.x_2*4) + 3840), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 3840), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 3840), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 960), 1, 4), broadcast(3, 4)))]
-            attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared_1[ramp(((threadIdx.x_2*4) + 4096), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 4096), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 4096), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 1024), 1, 4), broadcast(3, 4)))]
-            attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared_1[ramp(((threadIdx.x_2*4) + 4352), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 4352), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 4352), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 1088), 1, 4), broadcast(3, 4)))]
-            for (rc.inner: int32, 0, 8) {
-              let cse_var_24: int32 = (rc.inner*27)
-              let cse_var_23: int32 = (cse_var_24 + 10)
-              let cse_var_22: int32 = (cse_var_24 + 11)
-              let cse_var_21: int32 = (cse_var_24 + 12)
-              let cse_var_20: int32 = (cse_var_24 + 13)
-              let cse_var_19: int32 = (cse_var_24 + 14)
-              let cse_var_18: int32 = (cse_var_24 + 15)
-              let cse_var_17: int32 = (cse_var_24 + 16)
-              let cse_var_16: int32 = (cse_var_24 + 17)
-              let cse_var_15: int32 = (cse_var_24 + 18)
-              let cse_var_14: int32 = (cse_var_24 + 20)
-              let cse_var_13: int32 = (cse_var_24 + 21)
-              let cse_var_12: int32 = (cse_var_24 + 9)
-              let cse_var_11: int32 = (cse_var_24 + 8)
-              let cse_var_10: int32 = (cse_var_24 + 7)
-              let cse_var_9: int32 = (cse_var_24 + 6)
-              let cse_var_8: int32 = (cse_var_24 + 5)
-              let cse_var_7: int32 = (cse_var_24 + 4)
-              let cse_var_6: int32 = (cse_var_24 + 3)
-              let cse_var_5: int32 = (cse_var_24 + 23)
-              let cse_var_4: int32 = (cse_var_24 + 19)
-              let cse_var_3: int32 = (cse_var_24 + 22)
-               {
-                conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_24]*kernel.shared_1[((threadIdx.x*72) + (rc.inner*9))]))
-                conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_6]*kernel.shared_1[((threadIdx.x*72) + (rc.inner*9))]))
-                conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_9]*kernel.shared_1[((threadIdx.x*72) + (rc.inner*9))]))
-                conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_12]*kernel.shared_1[((threadIdx.x*72) + (rc.inner*9))]))
-                conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_21]*kernel.shared_1[((threadIdx.x*72) + (rc.inner*9))]))
-                conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_18]*kernel.shared_1[((threadIdx.x*72) + (rc.inner*9))]))
-                conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_15]*kernel.shared_1[((threadIdx.x*72) + (rc.inner*9))]))
-                conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(cse_var_24 + 1)]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 1)]))
-                conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_7]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 1)]))
-                conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_10]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 1)]))
-                conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_23]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 1)]))
-                conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_20]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 1)]))
-                conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_17]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 1)]))
-                conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_4]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 1)]))
-                conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(cse_var_24 + 2)]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 2)]))
-                conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_8]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 2)]))
-                conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_11]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 2)]))
-                conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_22]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 2)]))
-                conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_19]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 2)]))
-                conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_16]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 2)]))
-                conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_14]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 2)]))
-                conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_6]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 3)]))
-                conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_9]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 3)]))
-                conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_12]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 3)]))
-                conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_21]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 3)]))
-                conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_18]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 3)]))
-                conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_15]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 3)]))
-                conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_13]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 3)]))
-                conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_7]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 4)]))
-                conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_10]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 4)]))
-                conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_23]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 4)]))
-                conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_20]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 4)]))
-                conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_17]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 4)]))
-                conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_4]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 4)]))
-                conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_3]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 4)]))
-                conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_8]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 5)]))
-                conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_11]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 5)]))
-                conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_22]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 5)]))
-                conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_19]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 5)]))
-                conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_16]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 5)]))
-                conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_14]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 5)]))
-                conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_5]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 5)]))
-                conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_9]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 6)]))
-                conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_12]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 6)]))
-                conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_21]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 6)]))
-                conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_18]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 6)]))
-                conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_15]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 6)]))
-                conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_13]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 6)]))
-                conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(cse_var_24 + 24)]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 6)]))
-                conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_10]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 7)]))
-                conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_23]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 7)]))
-                conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_20]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 7)]))
-                conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_17]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 7)]))
-                conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_4]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 7)]))
-                conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_3]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 7)]))
-                conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(cse_var_24 + 25)]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 7)]))
-                conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_11]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 8)]))
-                conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_22]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 8)]))
-                conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_19]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 8)]))
-                conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_16]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 8)]))
-                conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_14]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 8)]))
-                conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_5]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 8)]))
-                conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(cse_var_24 + 26)]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 8)]))
+        conv2d_nchw_1[1] = 0f32
+        conv2d_nchw_1[3] = 0f32
+        conv2d_nchw_1[5] = 0f32
+        conv2d_nchw_1[7] = 0f32
+        for (rc.outer.outer: int32, 0, 16) {
+          for (ry.outer.outer: int32, 0, 3) {
+            let cse_var_2: int32 = (rc.outer.outer*288)
+            let cse_var_1: int32 = (ry.outer.outer*3)
+             {
+              attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1: Buffer(pad_temp.shared, float32, [1568], [], scope="shared")[threadIdx.x_1] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 49)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 41)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 98)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 90)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 147)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 139)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 196)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 188)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 245)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 237)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 294)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 286)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 343)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 335)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 392)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 384)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 441)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 433)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 490)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 482)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 539)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 531)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 588)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 580)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 637)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 629)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 686)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 678)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 735)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 727)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 784)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 776)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 833)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 825)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 882)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 874)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 931)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 923)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 980)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 972)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1029)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1021)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1078)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1070)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1127)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1119)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1176)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1168)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1225)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1217)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1274)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1266)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1323)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1315)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1372)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1364)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1421)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1413)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1470)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1462)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1519)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1511)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              kernel.shared_1: Buffer(kernel.shared, float32, [256], [], scope="shared")[threadIdx.x_2] = kernel[(((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 32)*4608)) + cse_var_2) + (floormod(threadIdx.x_2, 32)*9)) + cse_var_1)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              kernel.shared_1[(threadIdx.x_2 + 49)] = kernel[(((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 49), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 17), 32)*9)) + cse_var_1)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              kernel.shared_1[(threadIdx.x_2 + 98)] = kernel[(((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 98), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 2), 32)*9)) + cse_var_1)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              kernel.shared_1[(threadIdx.x_2 + 147)] = kernel[(((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 147), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 19), 32)*9)) + cse_var_1)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              kernel.shared_1[(threadIdx.x_2 + 196)] = kernel[(((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 196), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 4), 32)*9)) + cse_var_1)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              if @tir.likely((threadIdx.x_2 < 11), dtype=bool) {
+                kernel.shared_1[(threadIdx.x_2 + 245)] = kernel[(((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 245), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 21), 32)*9)) + cse_var_1)]
+              }
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[0]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[64]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[128]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[192]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[32]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[96]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[160]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[224]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[1]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[65]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[129]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[193]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[33]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[97]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[161]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[225]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[2]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[66]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[130]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[194]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[34]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[98]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[162]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[226]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[3]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[67]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[131]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[195]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[35]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[99]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[163]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[227]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[4]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[68]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[132]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[196]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[36]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[100]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[164]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[228]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[5]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[69]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[133]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[197]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[37]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[101]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[165]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[229]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[6]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[70]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[134]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[198]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[38]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[102]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[166]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[230]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[7]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[71]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[135]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[199]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[39]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[103]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[167]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[231]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[8]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[72]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[136]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[200]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[40]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[104]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[168]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[232]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[9]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[73]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[137]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[201]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[41]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[105]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[169]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[233]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[10]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[74]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[138]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[202]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[42]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[106]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[170]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[234]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[11]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[75]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[139]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[203]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[43]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[107]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[171]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[235]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[12]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[76]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[140]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[204]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[44]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[108]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[172]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[236]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[13]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[77]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[141]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[205]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[45]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[109]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[173]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[237]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[14]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[78]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[142]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[206]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[46]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[110]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[174]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[238]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[15]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[79]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[143]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[207]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[47]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[111]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[175]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[239]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[16]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[80]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[144]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[208]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[48]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[112]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[176]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[240]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[17]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[81]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[145]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[209]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[49]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[113]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[177]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[241]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[18]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[82]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[146]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[210]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[50]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[114]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[178]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[242]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[19]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[83]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[147]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[211]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[51]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[115]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[179]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[243]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[20]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[84]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[148]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[212]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[52]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[116]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[180]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[244]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[21]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[85]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[149]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[213]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[53]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[117]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[181]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[245]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[22]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[86]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[150]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[214]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[54]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[118]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[182]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[246]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[23]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[87]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[151]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[215]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[55]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[119]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[183]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[247]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[24]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[88]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[152]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[216]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[56]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[120]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[184]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[248]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[25]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[89]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[153]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[217]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[57]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[121]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[185]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[249]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[26]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[90]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[154]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[218]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[58]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[122]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[186]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[250]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[27]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[91]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[155]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[219]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[59]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[123]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[187]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[251]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[28]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[92]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[156]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[220]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[60]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[124]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[188]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[252]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[29]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[93]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[157]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[221]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[61]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[125]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[189]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[253]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[30]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[94]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[158]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[222]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[62]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[126]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[190]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[254]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[31]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[95]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[159]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[223]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[63]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[127]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[191]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[255]))
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[threadIdx.x_1] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) - 7)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 49)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 42)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 98)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 91)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 147)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 140)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 196)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 189)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 245)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 238)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 294)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 287)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 343)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 336)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 392)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 385)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 441)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 434)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 490)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 483)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 539)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 532)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 588)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 581)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 637)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 630)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 686)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 679)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 735)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 728)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 784)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 777)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 833)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 826)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 882)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 875)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 931)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 924)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 980)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 973)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1029)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1022)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1078)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1071)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1127)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1120)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1176)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1169)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1225)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1218)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1274)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1267)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1323)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1316)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1372)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1365)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1421)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1414)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1470)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1463)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1519)] = @tir.if_then_else(((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1512)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              kernel.shared_1[threadIdx.x_2] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 32)*4608)) + cse_var_2) + (floormod(threadIdx.x_2, 32)*9)) + cse_var_1) + 1)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              kernel.shared_1[(threadIdx.x_2 + 49)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 49), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 17), 32)*9)) + cse_var_1) + 1)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              kernel.shared_1[(threadIdx.x_2 + 98)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 98), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 2), 32)*9)) + cse_var_1) + 1)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              kernel.shared_1[(threadIdx.x_2 + 147)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 147), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 19), 32)*9)) + cse_var_1) + 1)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              kernel.shared_1[(threadIdx.x_2 + 196)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 196), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 4), 32)*9)) + cse_var_1) + 1)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              if @tir.likely((threadIdx.x_2 < 11), dtype=bool) {
+                kernel.shared_1[(threadIdx.x_2 + 245)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 245), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 21), 32)*9)) + cse_var_1) + 1)]
+              }
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[0]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[64]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[128]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[192]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[32]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[96]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[160]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[224]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[1]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[65]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[129]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[193]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[33]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[97]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[161]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[225]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[2]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[66]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[130]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[194]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[34]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[98]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[162]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[226]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[3]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[67]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[131]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[195]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[35]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[99]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[163]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[227]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[4]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[68]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[132]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[196]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[36]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[100]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[164]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[228]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[5]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[69]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[133]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[197]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[37]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[101]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[165]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[229]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[6]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[70]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[134]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[198]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[38]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[102]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[166]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[230]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[7]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[71]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[135]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[199]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[39]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[103]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[167]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[231]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[8]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[72]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[136]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[200]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[40]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[104]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[168]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[232]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[9]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[73]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[137]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[201]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[41]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[105]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[169]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[233]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[10]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[74]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[138]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[202]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[42]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[106]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[170]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[234]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[11]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[75]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[139]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[203]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[43]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[107]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[171]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[235]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[12]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[76]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[140]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[204]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[44]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[108]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[172]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[236]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[13]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[77]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[141]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[205]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[45]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[109]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[173]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[237]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[14]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[78]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[142]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[206]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[46]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[110]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[174]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[238]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[15]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[79]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[143]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[207]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[47]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[111]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[175]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[239]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[16]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[80]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[144]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[208]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[48]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[112]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[176]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[240]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[17]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[81]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[145]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[209]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[49]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[113]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[177]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[241]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[18]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[82]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[146]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[210]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[50]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[114]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[178]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[242]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[19]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[83]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[147]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[211]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[51]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[115]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[179]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[243]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[20]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[84]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[148]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[212]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[52]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[116]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[180]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[244]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[21]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[85]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[149]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[213]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[53]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[117]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[181]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[245]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[22]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[86]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[150]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[214]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[54]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[118]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[182]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[246]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[23]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[87]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[151]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[215]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[55]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[119]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[183]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[247]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[24]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[88]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[152]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[216]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[56]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[120]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[184]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[248]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[25]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[89]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[153]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[217]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[57]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[121]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[185]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[249]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[26]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[90]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[154]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[218]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[58]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[122]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[186]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[250]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[27]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[91]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[155]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[219]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[59]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[123]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[187]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[251]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[28]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[92]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[156]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[220]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[60]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[124]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[188]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[252]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[29]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[93]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[157]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[221]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[61]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[125]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[189]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[253]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[30]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[94]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[158]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[222]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[62]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[126]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[190]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[254]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[31]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[95]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[159]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[223]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[63]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[127]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[191]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[255]))
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[threadIdx.x_1] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) - 6)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 49)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 43)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 98)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 92)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 147)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 141)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 196)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 190)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 245)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 239)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 294)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 288)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 343)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 337)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 392)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 386)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 441)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 435)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 490)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 484)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 539)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 533)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 588)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 582)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 637)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 631)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 686)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 680)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 735)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 729)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 784)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 778)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 833)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 827)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 882)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 876)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 931)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 925)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 980)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 974)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1029)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1023)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1078)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1072)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1127)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1121)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1176)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1170)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1225)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1219)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1274)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1268)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1323)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1317)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1372)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1366)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1421)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1415)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1470)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1464)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              pad_temp.shared_1[(threadIdx.x_1 + 1519)] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (floormod(threadIdx.x_1, 7) < 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1513)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              kernel.shared_1[threadIdx.x_2] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 32)*4608)) + cse_var_2) + (floormod(threadIdx.x_2, 32)*9)) + cse_var_1) + 2)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              kernel.shared_1[(threadIdx.x_2 + 49)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 49), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 17), 32)*9)) + cse_var_1) + 2)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              kernel.shared_1[(threadIdx.x_2 + 98)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 98), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 2), 32)*9)) + cse_var_1) + 2)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              kernel.shared_1[(threadIdx.x_2 + 147)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 147), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 19), 32)*9)) + cse_var_1) + 2)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              kernel.shared_1[(threadIdx.x_2 + 196)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 196), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 4), 32)*9)) + cse_var_1) + 2)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
+              if @tir.likely((threadIdx.x_2 < 11), dtype=bool) {
+                kernel.shared_1[(threadIdx.x_2 + 245)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 245), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 21), 32)*9)) + cse_var_1) + 2)]
               }
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[0]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[64]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[128]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[192]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[32]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[96]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[160]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[224]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[1]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[65]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[129]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[193]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[33]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[97]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[161]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[225]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[2]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[66]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[130]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[194]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[34]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[98]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[162]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[226]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[3]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[67]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[131]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[195]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[35]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[99]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[163]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[227]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[4]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[68]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[132]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[196]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[36]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[100]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[164]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[228]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[5]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[69]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[133]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[197]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[37]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[101]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[165]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[229]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[6]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[70]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[134]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[198]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[38]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[102]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[166]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[230]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[7]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[71]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[135]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[199]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[39]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[103]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[167]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[231]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[8]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[72]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[136]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[200]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[40]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[104]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[168]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[232]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[9]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[73]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[137]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[201]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[41]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[105]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[169]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[233]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[10]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[74]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[138]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[202]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[42]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[106]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[170]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[234]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[11]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[75]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[139]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[203]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[43]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[107]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[171]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[235]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[12]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[76]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[140]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[204]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[44]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[108]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[172]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[236]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[13]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[77]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[141]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[205]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[45]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[109]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[173]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[237]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[14]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[78]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[142]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[206]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[46]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[110]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[174]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[238]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[15]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[79]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[143]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[207]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[47]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[111]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[175]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[239]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[16]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[80]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[144]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[208]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[48]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[112]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[176]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[240]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[17]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[81]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[145]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[209]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[49]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[113]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[177]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[241]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[18]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[82]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[146]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[210]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[50]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[114]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[178]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[242]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[19]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[83]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[147]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[211]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[51]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[115]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[179]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[243]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[20]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[84]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[148]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[212]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[52]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[116]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[180]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[244]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[21]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[85]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[149]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[213]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[53]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[117]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[181]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[245]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[22]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[86]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[150]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[214]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[54]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[118]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[182]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[246]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[23]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[87]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[151]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[215]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[55]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[119]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[183]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[247]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[24]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[88]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[152]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[216]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[56]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[120]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[184]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[248]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[25]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[89]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[153]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[217]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[57]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[121]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[185]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[249]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[26]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[90]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[154]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[218]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[58]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[122]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[186]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[250]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[27]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[91]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[155]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[219]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[59]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[123]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[187]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[251]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[28]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[92]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[156]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[220]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[60]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[124]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[188]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[252]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[29]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[93]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[157]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[221]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[61]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[125]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[189]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[253]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[30]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[94]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[158]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[222]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[62]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[126]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[190]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[254]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[31]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[95]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[159]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[223]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[63]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[127]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[191]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[255]))
             }
           }
         }
-        compute[(((floordiv(blockIdx.x, 7)*3136) + (threadIdx.x*49)) + floormod(blockIdx.x, 7))] = max((conv2d_nchw_1[0] + bias[((floordiv(blockIdx.x, 7)*64) + threadIdx.x)]), 0f32)
-        compute[((((floordiv(blockIdx.x, 7)*3136) + (threadIdx.x*49)) + floormod(blockIdx.x, 7)) + 7)] = max((conv2d_nchw_1[1] + bias[((floordiv(blockIdx.x, 7)*64) + threadIdx.x)]), 0f32)
-        compute[((((floordiv(blockIdx.x, 7)*3136) + (threadIdx.x*49)) + floormod(blockIdx.x, 7)) + 14)] = max((conv2d_nchw_1[2] + bias[((floordiv(blockIdx.x, 7)*64) + threadIdx.x)]), 0f32)
-        compute[((((floordiv(blockIdx.x, 7)*3136) + (threadIdx.x*49)) + floormod(blockIdx.x, 7)) + 21)] = max((conv2d_nchw_1[3] + bias[((floordiv(blockIdx.x, 7)*64) + threadIdx.x)]), 0f32)
-        compute[((((floordiv(blockIdx.x, 7)*3136) + (threadIdx.x*49)) + floormod(blockIdx.x, 7)) + 28)] = max((conv2d_nchw_1[4] + bias[((floordiv(blockIdx.x, 7)*64) + threadIdx.x)]), 0f32)
-        compute[((((floordiv(blockIdx.x, 7)*3136) + (threadIdx.x*49)) + floormod(blockIdx.x, 7)) + 35)] = max((conv2d_nchw_1[5] + bias[((floordiv(blockIdx.x, 7)*64) + threadIdx.x)]), 0f32)
-        compute[((((floordiv(blockIdx.x, 7)*3136) + (threadIdx.x*49)) + floormod(blockIdx.x, 7)) + 42)] = max((conv2d_nchw_1[6] + bias[((floordiv(blockIdx.x, 7)*64) + threadIdx.x)]), 0f32)
+        for (i1.inner: int32, 0, 2) {
+          compute[(((blockIdx.x*392) + (i1.inner*49)) + threadIdx.x)] = max((conv2d_nchw_1[i1.inner] + bias[((blockIdx.x*8) + i1.inner)]), 0f32)
+          compute[((((blockIdx.x*392) + (i1.inner*49)) + threadIdx.x) + 98)] = max((conv2d_nchw_1[(i1.inner + 2)] + bias[(((blockIdx.x*8) + i1.inner) + 2)]), 0f32)
+          compute[((((blockIdx.x*392) + (i1.inner*49)) + threadIdx.x) + 196)] = max((conv2d_nchw_1[(i1.inner + 4)] + bias[(((blockIdx.x*8) + i1.inner) + 4)]), 0f32)
+          compute[((((blockIdx.x*392) + (i1.inner*49)) + threadIdx.x) + 294)] = max((conv2d_nchw_1[(i1.inner + 6)] + bias[(((blockIdx.x*8) + i1.inner) + 6)]), 0f32)
+        }
       }
     }
 
@@ -433,7 +1302,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 0.243 ms
+    Execution time of this operator: 0.288 ms
 
 
 
@@ -477,36 +1346,36 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     conv2d_nchw_nn_o_o_i, conv2d_nchw_nn_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_i, factor=1)
     conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1)
     conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
-    conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
+    conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=2)
     conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
-    conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=64)
-    conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
+    conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=1)
+    conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=4)
     conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
     conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
-    conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
-    conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=7)
+    conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7)
+    conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
     conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
     conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
-    conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
+    conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
     conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
-    conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=8)
+    conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=32)
     conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=1)
-    conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=3)
+    conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
     conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
-    conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=3)
+    conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
     conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=1)
     s[conv2d_nchw].reorder(conv2d_nchw_nn_o_o_o_o, conv2d_nchw_ff_o_o_o_o, conv2d_nchw_yy_o_o_o_o, conv2d_nchw_xx_o_o_o_o, conv2d_nchw_nn_o_o_o_i, conv2d_nchw_ff_o_o_o_i, conv2d_nchw_yy_o_o_o_i, conv2d_nchw_xx_o_o_o_i, conv2d_nchw_nn_o_o_i, conv2d_nchw_ff_o_o_i, conv2d_nchw_yy_o_o_i, conv2d_nchw_xx_o_o_i, conv2d_nchw_rc_o_o, conv2d_nchw_ry_o_o, conv2d_nchw_rx_o_o, conv2d_nchw_rc_o_i, conv2d_nchw_ry_o_i, conv2d_nchw_rx_o_i, conv2d_nchw_nn_o_i, conv2d_nchw_ff_o_i, conv2d_nchw_yy_o_i, conv2 [...]
     compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
     compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
     compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
-    compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=1)
-    compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=64)
-    compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
+    compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
+    compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=1)
+    compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=4)
     compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
-    compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
-    compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=7)
+    compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7)
+    compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
     compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
-    compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
+    compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
     compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
     s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
     s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -524,16 +1393,16 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused = s[compute].fuse(compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i)
     s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread_axis("threadIdx.x"))
     kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
-    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
+    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
     s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=49)
     s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
     pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
     pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
     s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=49)
     s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
-    s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 64)
+    s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 1024)
     s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
 
     CUDA source code:
@@ -551,3717 +1420,922 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
       #define int64_t long long
       #define uint64_t unsigned long long
     #endif
-    extern "C" __global__ void __launch_bounds__(64) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
-      float conv2d_nchw[7];
-      __shared__ float pad_temp_shared[216];
-      __shared__ float kernel_shared[4608];
+    extern "C" __global__ void __launch_bounds__(49) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+      float conv2d_nchw[8];
+      __shared__ float pad_temp_shared[1568];
+      __shared__ float kernel_shared[256];
       conv2d_nchw[0] = 0.000000e+00f;
-      conv2d_nchw[1] = 0.000000e+00f;
       conv2d_nchw[2] = 0.000000e+00f;
-      conv2d_nchw[3] = 0.000000e+00f;
       conv2d_nchw[4] = 0.000000e+00f;
-      conv2d_nchw[5] = 0.000000e+00f;
       conv2d_nchw[6] = 0.000000e+00f;
-      for (int rc_outer_outer = 0; rc_outer_outer < 64; ++rc_outer_outer) {
-        __syncthreads();
-        pad_temp_shared[((int)threadIdx.x)] = (((((3 <= (((int)threadIdx.x) % 27)) && ((((int)threadIdx.x) % 27) < 24)) && (1 <= ((((int)blockIdx.x) % 7) + (((int)threadIdx.x) % 3)))) && (((((int)blockIdx.x) % 7) + (((int)threadIdx.x) % 3)) < 8)) ? data[((((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 27) * 49)) + (((((int)threadIdx.x) % 27) / 3) * 7)) + (((int)blockIdx.x) % 7)) + (((int)threadIdx.x) % 3)) - 8)] : 0.000000e+00f);
-        pad_temp_shared[(((int)threadIdx.x) + 64)] = (((((3 <= ((((int)threadIdx.x) + 10) % 27)) && (((((int)threadIdx.x) + 10) % 27) < 24)) && (1 <= ((((int)blockIdx.x) % 7) + ((((int)threadIdx.x) + 1) % 3)))) && (((((int)blockIdx.x) % 7) + ((((int)threadIdx.x) + 1) % 3)) < 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) + 64) / 27) * 49)) + ((((((int)threadIdx.x) + 10) % 27) / 3) * 7)) + (((int)blockIdx.x) % 7)) + ((((int)threadIdx.x) + 1) % 3)) - 8)] : 0.000000e+00f);
-        pad_temp_shared[(((int)threadIdx.x) + 128)] = (((((3 <= ((((int)threadIdx.x) + 20) % 27)) && (((((int)threadIdx.x) + 20) % 27) < 24)) && (1 <= ((((int)blockIdx.x) % 7) + ((((int)threadIdx.x) + 2) % 3)))) && (((((int)blockIdx.x) % 7) + ((((int)threadIdx.x) + 2) % 3)) < 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) + 128) / 27) * 49)) + ((((((int)threadIdx.x) + 20) % 27) / 3) * 7)) + (((int)blockIdx.x) % 7)) + ((((int)threadIdx.x) + 2) % 3)) - 8)] : 0.000000e+00f);
-        if (((int)threadIdx.x) < 24) {
-          pad_temp_shared[(((int)threadIdx.x) + 192)] = ((((((int)threadIdx.x) < 21) && (1 <= ((((int)blockIdx.x) % 7) + (((int)threadIdx.x) % 3)))) && (((((int)blockIdx.x) % 7) + (((int)threadIdx.x) % 3)) < 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) + 192) / 27) * 49)) + (((((int)threadIdx.x) / 3) + 1) * 7)) + (((int)blockIdx.x) % 7)) + (((int)threadIdx.x) % 3)) - 8)] : 0.000000e+00f);
-        }
-        int4 _1;
-          int4 _2;
-            int4 _3 = make_int4(((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 18) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 18) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 18) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 18) * 4608)) + (rc_outer_outer * 72)));
-            int4 _4;
-              int4 _5;
-                int4 _6;
-                  int4 _7 = make_int4(((((int)threadIdx.x) * 4))+(1*0), ((((int)threadIdx.x) * 4))+(1*1), ((((int)threadIdx.x) * 4))+(1*2), ((((int)threadIdx.x) * 4))+(1*3));
-                  int4 _8 = make_int4(3, 3, 3, 3);
-                  _6.x = (_7.x%_8.x);
-                  _6.y = (_7.y%_8.y);
-                  _6.z = (_7.z%_8.z);
-                  _6.w = (_7.w%_8.w);
-                int4 _9;
-                  int4 _10 = make_int4(((((int)threadIdx.x) * 4))+(1*0), ((((int)threadIdx.x) * 4))+(1*1), ((((int)threadIdx.x) * 4))+(1*2), ((((int)threadIdx.x) * 4))+(1*3));
-                  int4 _11 = make_int4(3, 3, 3, 3);
-                  _9.x = (_10.x/_11.x);
-                  _9.y = (_10.y/_11.y);
-                  _9.z = (_10.z/_11.z);
-                  _9.w = (_10.w/_11.w);
-                int4 _12;
-                ushort4 _13;
-                  ushort4 _14;
-                    ushort4 _15;
-                      int4 _16 = make_int4(3, 3, 3, 3);
-                      int4 _17 = make_int4(0, 0, 0, 0);
-                      _15.x = (_16.x>=_17.x);
-                      _15.y = (_16.y>=_17.y);
-                      _15.z = (_16.z>=_17.z);
-                      _15.w = (_16.w>=_17.w);
-                    ushort4 _18;
-                      int4 _19 = make_int4(0, 0, 0, 0);
-                      _18.x = (_6.x>=_19.x);
-                      _18.y = (_6.y>=_19.y);
-                      _18.z = (_6.z>=_19.z);
-                      _18.w = (_6.w>=_19.w);
-                    _14.x = (_15.x&&_18.x);
-                    _14.y = (_15.y&&_18.y);
-                    _14.z = (_15.z&&_18.z);
-                    _14.w = (_15.w&&_18.w);
-                  ushort4 _20;
-                    ushort4 _21;
-                      int4 _22 = make_int4(3, 3, 3, 3);
-                      int4 _23 = make_int4(0, 0, 0, 0);
-                      _21.x = (_22.x<_23.x);
-                      _21.y = (_22.y<_23.y);
-                      _21.z = (_22.z<_23.z);
-                      _21.w = (_22.w<_23.w);
-                    ushort4 _24;
-                      int4 _25 = make_int4(0, 0, 0, 0);
-                      _24.x = (_6.x<=_25.x);
-                      _24.y = (_6.y<=_25.y);
-                      _24.z = (_6.z<=_25.z);
-                      _24.w = (_6.w<=_25.w);
-                    _20.x = (_21.x&&_24.x);
-                    _20.y = (_21.y&&_24.y);
-                    _20.z = (_21.z&&_24.z);
-                    _20.w = (_21.w&&_24.w);
-                  _13.x = (_14.x||_20.x);
-                  _13.y = (_14.y||_20.y);
-                  _13.z = (_14.z||_20.z);
-                  _13.w = (_14.w||_20.w);
-                int4 _26;
-                  int4 _27 = make_int4(1, 1, 1, 1);
-                  _26.x = (_9.x-_27.x);
-                  _26.y = (_9.y-_27.y);
-                  _26.z = (_9.z-_27.z);
-                  _26.w = (_9.w-_27.w);
-                _12.x = (bool(_13.x)?_9.x:_26.x);
-                _12.y = (bool(_13.y)?_9.y:_26.y);
-                _12.z = (bool(_13.z)?_9.z:_26.z);
-                _12.w = (bool(_13.w)?_9.w:_26.w);
-                int4 _28 = make_int4(24, 24, 24, 24);
-                _5.x = (_12.x%_28.x);
-                _5.y = (_12.y%_28.y);
-                _5.z = (_12.z%_28.z);
-                _5.w = (_12.w%_28.w);
-              int4 _29;
-              ushort4 _30;
-                ushort4 _31;
-                  ushort4 _32;
-                    int4 _33 = make_int4(24, 24, 24, 24);
-                    int4 _34 = make_int4(0, 0, 0, 0);
-                    _32.x = (_33.x>=_34.x);
-                    _32.y = (_33.y>=_34.y);
-                    _32.z = (_33.z>=_34.z);
-                    _32.w = (_33.w>=_34.w);
-                  ushort4 _35;
-                    int4 _36 = make_int4(0, 0, 0, 0);
-                    _35.x = (_5.x>=_36.x);
-                    _35.y = (_5.y>=_36.y);
-                    _35.z = (_5.z>=_36.z);
-                    _35.w = (_5.w>=_36.w);
-                  _31.x = (_32.x&&_35.x);
-                  _31.y = (_32.y&&_35.y);
-                  _31.z = (_32.z&&_35.z);
-                  _31.w = (_32.w&&_35.w);
-                ushort4 _37;
-                  ushort4 _38;
-                    int4 _39 = make_int4(24, 24, 24, 24);
-                    int4 _40 = make_int4(0, 0, 0, 0);
-                    _38.x = (_39.x<_40.x);
-                    _38.y = (_39.y<_40.y);
-                    _38.z = (_39.z<_40.z);
-                    _38.w = (_39.w<_40.w);
-                  ushort4 _41;
-                    int4 _42 = make_int4(0, 0, 0, 0);
-                    _41.x = (_5.x<=_42.x);
-                    _41.y = (_5.y<=_42.y);
-                    _41.z = (_5.z<=_42.z);
-                    _41.w = (_5.w<=_42.w);
-                  _37.x = (_38.x&&_41.x);
-                  _37.y = (_38.y&&_41.y);
-                  _37.z = (_38.z&&_41.z);
-                  _37.w = (_38.w&&_41.w);
-                _30.x = (_31.x||_37.x);
-                _30.y = (_31.y||_37.y);
-                _30.z = (_31.z||_37.z);
-                _30.w = (_31.w||_37.w);
-              int4 _43;
-                int4 _44 = make_int4(24, 24, 24, 24);
-                _43.x = (_5.x+_44.x);
-                _43.y = (_5.y+_44.y);
-                _43.z = (_5.z+_44.z);
-                _43.w = (_5.w+_44.w);
-              _29.x = (bool(_30.x)?_5.x:_43.x);
-              _29.y = (bool(_30.y)?_5.y:_43.y);
-              _29.z = (bool(_30.z)?_5.z:_43.z);
-              _29.w = (bool(_30.w)?_5.w:_43.w);
-              int4 _45 = make_int4(3, 3, 3, 3);
-              _4.x = (_29.x*_45.x);
-              _4.y = (_29.y*_45.y);
-              _4.z = (_29.z*_45.z);
-              _4.w = (_29.w*_45.w);
-            _2.x = (_3.x+_4.x);
-            _2.y = (_3.y+_4.y);
-            _2.z = (_3.z+_4.z);
-            _2.w = (_3.w+_4.w);
-          int4 _46;
-            int4 _47 = make_int4((((int)threadIdx.x))+(1*0), (((int)threadIdx.x))+(1*1), (((int)threadIdx.x))+(1*2), (((int)threadIdx.x))+(1*3));
-            int4 _48 = make_int4(3, 3, 3, 3);
-            _46.x = (_47.x%_48.x);
-            _46.y = (_47.y%_48.y);
-            _46.z = (_47.z%_48.z);
-            _46.w = (_47.w%_48.w);
-          int4 _49;
-          ushort4 _50;
-            ushort4 _51;
-              ushort4 _52;
-                int4 _53 = make_int4(3, 3, 3, 3);
-                int4 _54 = make_int4(0, 0, 0, 0);
-                _52.x = (_53.x>=_54.x);
-                _52.y = (_53.y>=_54.y);
-                _52.z = (_53.z>=_54.z);
-                _52.w = (_53.w>=_54.w);
-              ushort4 _55;
-                int4 _56 = make_int4(0, 0, 0, 0);
-                _55.x = (_46.x>=_56.x);
-                _55.y = (_46.y>=_56.y);
-                _55.z = (_46.z>=_56.z);
-                _55.w = (_46.w>=_56.w);
-              _51.x = (_52.x&&_55.x);
-              _51.y = (_52.y&&_55.y);
-              _51.z = (_52.z&&_55.z);
-              _51.w = (_52.w&&_55.w);
-            ushort4 _57;
-              ushort4 _58;
-                int4 _59 = make_int4(3, 3, 3, 3);
-                int4 _60 = make_int4(0, 0, 0, 0);
-                _58.x = (_59.x<_60.x);
-                _58.y = (_59.y<_60.y);
-                _58.z = (_59.z<_60.z);
-                _58.w = (_59.w<_60.w);
-              ushort4 _61;
-                int4 _62 = make_int4(0, 0, 0, 0);
-                _61.x = (_46.x<=_62.x);
-                _61.y = (_46.y<=_62.y);
-                _61.z = (_46.z<=_62.z);
-                _61.w = (_46.w<=_62.w);
-              _57.x = (_58.x&&_61.x);
-              _57.y = (_58.y&&_61.y);
-              _57.z = (_58.z&&_61.z);
-              _57.w = (_58.w&&_61.w);
-            _50.x = (_51.x||_57.x);
-            _50.y = (_51.y||_57.y);
-            _50.z = (_51.z||_57.z);
-            _50.w = (_51.w||_57.w);
-          int4 _63;
-            int4 _64 = make_int4(3, 3, 3, 3);
-            _63.x = (_46.x+_64.x);
-            _63.y = (_46.y+_64.y);
-            _63.z = (_46.z+_64.z);
-            _63.w = (_46.w+_64.w);
-          _49.x = (bool(_50.x)?_46.x:_63.x);
-          _49.y = (bool(_50.y)?_46.y:_63.y);
-          _49.z = (bool(_50.z)?_46.z:_63.z);
-          _49.w = (bool(_50.w)?_46.w:_63.w);
-          _1.x = (_2.x+_49.x);
-          _1.y = (_2.y+_49.y);
-          _1.z = (_2.z+_49.z);
-          _1.w = (_2.w+_49.w);
-        *(float4*)(kernel_shared + (((int)threadIdx.x) * 4)) = make_float4(kernel[_1.x],kernel[_1.y],kernel[_1.z],kernel[_1.w]);
-        int4 _65;
-          int4 _66;
-            int4 _67 = make_int4(((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 256) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 256) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 256) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 256) / 72) * 4608)) + (rc_outer_outer * 72)));
-            int4 _68;
-              int4 _69;
-                int4 _70;
-                  int4 _71 = make_int4((((((int)threadIdx.x) * 4) + 256))+(1*0), (((((int)threadIdx.x) * 4) + 256))+(1*1), (((((int)threadIdx.x) * 4) + 256))+(1*2), (((((int)threadIdx.x) * 4) + 256))+(1*3));
-                  int4 _72 = make_int4(3, 3, 3, 3);
-                  _70.x = (_71.x%_72.x);
-                  _70.y = (_71.y%_72.y);
-                  _70.z = (_71.z%_72.z);
-                  _70.w = (_71.w%_72.w);
-                int4 _73;
-                  int4 _74 = make_int4((((((int)threadIdx.x) * 4) + 256))+(1*0), (((((int)threadIdx.x) * 4) + 256))+(1*1), (((((int)threadIdx.x) * 4) + 256))+(1*2), (((((int)threadIdx.x) * 4) + 256))+(1*3));
-                  int4 _75 = make_int4(3, 3, 3, 3);
-                  _73.x = (_74.x/_75.x);
-                  _73.y = (_74.y/_75.y);
-                  _73.z = (_74.z/_75.z);
-                  _73.w = (_74.w/_75.w);
-                int4 _76;
-                ushort4 _77;
-                  ushort4 _78;
-                    ushort4 _79;
-                      int4 _80 = make_int4(3, 3, 3, 3);
-                      int4 _81 = make_int4(0, 0, 0, 0);
-                      _79.x = (_80.x>=_81.x);
-                      _79.y = (_80.y>=_81.y);
-                      _79.z = (_80.z>=_81.z);
-                      _79.w = (_80.w>=_81.w);
-                    ushort4 _82;
-                      int4 _83 = make_int4(0, 0, 0, 0);
-                      _82.x = (_70.x>=_83.x);
-                      _82.y = (_70.y>=_83.y);
-                      _82.z = (_70.z>=_83.z);
-                      _82.w = (_70.w>=_83.w);
-                    _78.x = (_79.x&&_82.x);
-                    _78.y = (_79.y&&_82.y);
-                    _78.z = (_79.z&&_82.z);
-                    _78.w = (_79.w&&_82.w);
-                  ushort4 _84;
-                    ushort4 _85;
-                      int4 _86 = make_int4(3, 3, 3, 3);
-                      int4 _87 = make_int4(0, 0, 0, 0);
-                      _85.x = (_86.x<_87.x);
-                      _85.y = (_86.y<_87.y);
-                      _85.z = (_86.z<_87.z);
-                      _85.w = (_86.w<_87.w);
-                    ushort4 _88;
-                      int4 _89 = make_int4(0, 0, 0, 0);
-                      _88.x = (_70.x<=_89.x);
-                      _88.y = (_70.y<=_89.y);
-                      _88.z = (_70.z<=_89.z);
-                      _88.w = (_70.w<=_89.w);
-                    _84.x = (_85.x&&_88.x);
-                    _84.y = (_85.y&&_88.y);
-                    _84.z = (_85.z&&_88.z);
-                    _84.w = (_85.w&&_88.w);
-                  _77.x = (_78.x||_84.x);
-                  _77.y = (_78.y||_84.y);
-                  _77.z = (_78.z||_84.z);
-                  _77.w = (_78.w||_84.w);
-                int4 _90;
-                  int4 _91 = make_int4(1, 1, 1, 1);
-                  _90.x = (_73.x-_91.x);
-                  _90.y = (_73.y-_91.y);
-                  _90.z = (_73.z-_91.z);
-                  _90.w = (_73.w-_91.w);
-                _76.x = (bool(_77.x)?_73.x:_90.x);
-                _76.y = (bool(_77.y)?_73.y:_90.y);
-                _76.z = (bool(_77.z)?_73.z:_90.z);
-                _76.w = (bool(_77.w)?_73.w:_90.w);
-                int4 _92 = make_int4(24, 24, 24, 24);
-                _69.x = (_76.x%_92.x);
-                _69.y = (_76.y%_92.y);
-                _69.z = (_76.z%_92.z);
-                _69.w = (_76.w%_92.w);
-              int4 _93;
-              ushort4 _94;
-                ushort4 _95;
-                  ushort4 _96;
-                    int4 _97 = make_int4(24, 24, 24, 24);
-                    int4 _98 = make_int4(0, 0, 0, 0);
-                    _96.x = (_97.x>=_98.x);
-                    _96.y = (_97.y>=_98.y);
-                    _96.z = (_97.z>=_98.z);
-                    _96.w = (_97.w>=_98.w);
-                  ushort4 _99;
-                    int4 _100 = make_int4(0, 0, 0, 0);
-                    _99.x = (_69.x>=_100.x);
-                    _99.y = (_69.y>=_100.y);
-                    _99.z = (_69.z>=_100.z);
-                    _99.w = (_69.w>=_100.w);
-                  _95.x = (_96.x&&_99.x);
-                  _95.y = (_96.y&&_99.y);
-                  _95.z = (_96.z&&_99.z);
-                  _95.w = (_96.w&&_99.w);
-                ushort4 _101;
-                  ushort4 _102;
-                    int4 _103 = make_int4(24, 24, 24, 24);
-                    int4 _104 = make_int4(0, 0, 0, 0);
-                    _102.x = (_103.x<_104.x);
-                    _102.y = (_103.y<_104.y);
-                    _102.z = (_103.z<_104.z);
-                    _102.w = (_103.w<_104.w);
-                  ushort4 _105;
-                    int4 _106 = make_int4(0, 0, 0, 0);
-                    _105.x = (_69.x<=_106.x);
-                    _105.y = (_69.y<=_106.y);
-                    _105.z = (_69.z<=_106.z);
-                    _105.w = (_69.w<=_106.w);
-                  _101.x = (_102.x&&_105.x);
-                  _101.y = (_102.y&&_105.y);
-                  _101.z = (_102.z&&_105.z);
-                  _101.w = (_102.w&&_105.w);
-                _94.x = (_95.x||_101.x);
-                _94.y = (_95.y||_101.y);
-                _94.z = (_95.z||_101.z);
-                _94.w = (_95.w||_101.w);
-              int4 _107;
-                int4 _108 = make_int4(24, 24, 24, 24);
-                _107.x = (_69.x+_108.x);
-                _107.y = (_69.y+_108.y);
-                _107.z = (_69.z+_108.z);
-                _107.w = (_69.w+_108.w);
-              _93.x = (bool(_94.x)?_69.x:_107.x);
-              _93.y = (bool(_94.y)?_69.y:_107.y);
-              _93.z = (bool(_94.z)?_69.z:_107.z);
-              _93.w = (bool(_94.w)?_69.w:_107.w);
-              int4 _109 = make_int4(3, 3, 3, 3);
-              _68.x = (_93.x*_109.x);
-              _68.y = (_93.y*_109.y);
-              _68.z = (_93.z*_109.z);
-              _68.w = (_93.w*_109.w);
-            _66.x = (_67.x+_68.x);
-            _66.y = (_67.y+_68.y);
-            _66.z = (_67.z+_68.z);
-            _66.w = (_67.w+_68.w);
-          int4 _110;
-            int4 _111 = make_int4(((((int)threadIdx.x) + 64))+(1*0), ((((int)threadIdx.x) + 64))+(1*1), ((((int)threadIdx.x) + 64))+(1*2), ((((int)threadIdx.x) + 64))+(1*3));
-            int4 _112 = make_int4(3, 3, 3, 3);
-            _110.x = (_111.x%_112.x);
-            _110.y = (_111.y%_112.y);
-            _110.z = (_111.z%_112.z);
-            _110.w = (_111.w%_112.w);
-          int4 _113;
-          ushort4 _114;
-            ushort4 _115;
-              ushort4 _116;
-                int4 _117 = make_int4(3, 3, 3, 3);
-                int4 _118 = make_int4(0, 0, 0, 0);
-                _116.x = (_117.x>=_118.x);
-                _116.y = (_117.y>=_118.y);
-                _116.z = (_117.z>=_118.z);
-                _116.w = (_117.w>=_118.w);
-              ushort4 _119;
-                int4 _120 = make_int4(0, 0, 0, 0);
-                _119.x = (_110.x>=_120.x);
-                _119.y = (_110.y>=_120.y);
-                _119.z = (_110.z>=_120.z);
-                _119.w = (_110.w>=_120.w);
-              _115.x = (_116.x&&_119.x);
-              _115.y = (_116.y&&_119.y);
-              _115.z = (_116.z&&_119.z);
-              _115.w = (_116.w&&_119.w);
-            ushort4 _121;
-              ushort4 _122;
-                int4 _123 = make_int4(3, 3, 3, 3);
-                int4 _124 = make_int4(0, 0, 0, 0);
-                _122.x = (_123.x<_124.x);
-                _122.y = (_123.y<_124.y);
-                _122.z = (_123.z<_124.z);
-                _122.w = (_123.w<_124.w);
-              ushort4 _125;
-                int4 _126 = make_int4(0, 0, 0, 0);
-                _125.x = (_110.x<=_126.x);
-                _125.y = (_110.y<=_126.y);
-                _125.z = (_110.z<=_126.z);
-                _125.w = (_110.w<=_126.w);
-              _121.x = (_122.x&&_125.x);
-              _121.y = (_122.y&&_125.y);
-              _121.z = (_122.z&&_125.z);
-              _121.w = (_122.w&&_125.w);
-            _114.x = (_115.x||_121.x);
-            _114.y = (_115.y||_121.y);
-            _114.z = (_115.z||_121.z);
-            _114.w = (_115.w||_121.w);
-          int4 _127;
-            int4 _128 = make_int4(3, 3, 3, 3);
-            _127.x = (_110.x+_128.x);
-            _127.y = (_110.y+_128.y);
-            _127.z = (_110.z+_128.z);
-            _127.w = (_110.w+_128.w);
-          _113.x = (bool(_114.x)?_110.x:_127.x);
-          _113.y = (bool(_114.y)?_110.y:_127.y);
-          _113.z = (bool(_114.z)?_110.z:_127.z);
-          _113.w = (bool(_114.w)?_110.w:_127.w);
-          _65.x = (_66.x+_113.x);
-          _65.y = (_66.y+_113.y);
-          _65.z = (_66.z+_113.z);
-          _65.w = (_66.w+_113.w);
-        *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 256)) = make_float4(kernel[_65.x],kernel[_65.y],kernel[_65.z],kernel[_65.w]);
-        int4 _129;
-          int4 _130;
-            int4 _131 = make_int4(((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 512) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 512) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 512) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 512) / 72) * 4608)) + (rc_outer_outer * 72)));
-            int4 _132;
-              int4 _133;
-                int4 _134;
-                  int4 _135 = make_int4((((((int)threadIdx.x) * 4) + 512))+(1*0), (((((int)threadIdx.x) * 4) + 512))+(1*1), (((((int)threadIdx.x) * 4) + 512))+(1*2), (((((int)threadIdx.x) * 4) + 512))+(1*3));
-                  int4 _136 = make_int4(3, 3, 3, 3);
-                  _134.x = (_135.x%_136.x);
-                  _134.y = (_135.y%_136.y);
-                  _134.z = (_135.z%_136.z);
-                  _134.w = (_135.w%_136.w);
-                int4 _137;
-                  int4 _138 = make_int4((((((int)threadIdx.x) * 4) + 512))+(1*0), (((((int)threadIdx.x) * 4) + 512))+(1*1), (((((int)threadIdx.x) * 4) + 512))+(1*2), (((((int)threadIdx.x) * 4) + 512))+(1*3));
-                  int4 _139 = make_int4(3, 3, 3, 3);
-                  _137.x = (_138.x/_139.x);
-                  _137.y = (_138.y/_139.y);
-                  _137.z = (_138.z/_139.z);
-                  _137.w = (_138.w/_139.w);
-                int4 _140;
-                ushort4 _141;
-                  ushort4 _142;
-                    ushort4 _143;
-                      int4 _144 = make_int4(3, 3, 3, 3);
-                      int4 _145 = make_int4(0, 0, 0, 0);
-                      _143.x = (_144.x>=_145.x);
-                      _143.y = (_144.y>=_145.y);
-                      _143.z = (_144.z>=_145.z);
-                      _143.w = (_144.w>=_145.w);
-                    ushort4 _146;
-                      int4 _147 = make_int4(0, 0, 0, 0);
-                      _146.x = (_134.x>=_147.x);
-                      _146.y = (_134.y>=_147.y);
-                      _146.z = (_134.z>=_147.z);
-                      _146.w = (_134.w>=_147.w);
-                    _142.x = (_143.x&&_146.x);
-                    _142.y = (_143.y&&_146.y);
-                    _142.z = (_143.z&&_146.z);
-                    _142.w = (_143.w&&_146.w);
-                  ushort4 _148;
-                    ushort4 _149;
-                      int4 _150 = make_int4(3, 3, 3, 3);
-                      int4 _151 = make_int4(0, 0, 0, 0);
-                      _149.x = (_150.x<_151.x);
-                      _149.y = (_150.y<_151.y);
-                      _149.z = (_150.z<_151.z);
-                      _149.w = (_150.w<_151.w);
-                    ushort4 _152;
-                      int4 _153 = make_int4(0, 0, 0, 0);
-                      _152.x = (_134.x<=_153.x);
-                      _152.y = (_134.y<=_153.y);
-                      _152.z = (_134.z<=_153.z);
-                      _152.w = (_134.w<=_153.w);
-                    _148.x = (_149.x&&_152.x);
-                    _148.y = (_149.y&&_152.y);
-                    _148.z = (_149.z&&_152.z);
-                    _148.w = (_149.w&&_152.w);
-                  _141.x = (_142.x||_148.x);
-                  _141.y = (_142.y||_148.y);
-                  _141.z = (_142.z||_148.z);
-                  _141.w = (_142.w||_148.w);
-                int4 _154;
-                  int4 _155 = make_int4(1, 1, 1, 1);
-                  _154.x = (_137.x-_155.x);
-                  _154.y = (_137.y-_155.y);
-                  _154.z = (_137.z-_155.z);
-                  _154.w = (_137.w-_155.w);
-                _140.x = (bool(_141.x)?_137.x:_154.x);
-                _140.y = (bool(_141.y)?_137.y:_154.y);
-                _140.z = (bool(_141.z)?_137.z:_154.z);
-                _140.w = (bool(_141.w)?_137.w:_154.w);
-                int4 _156 = make_int4(24, 24, 24, 24);
-                _133.x = (_140.x%_156.x);
-                _133.y = (_140.y%_156.y);
-                _133.z = (_140.z%_156.z);
-                _133.w = (_140.w%_156.w);
-              int4 _157;
-              ushort4 _158;
-                ushort4 _159;
-                  ushort4 _160;
-                    int4 _161 = make_int4(24, 24, 24, 24);
-                    int4 _162 = make_int4(0, 0, 0, 0);
-                    _160.x = (_161.x>=_162.x);
-                    _160.y = (_161.y>=_162.y);
-                    _160.z = (_161.z>=_162.z);
-                    _160.w = (_161.w>=_162.w);
-                  ushort4 _163;
-                    int4 _164 = make_int4(0, 0, 0, 0);
-                    _163.x = (_133.x>=_164.x);
-                    _163.y = (_133.y>=_164.y);
-                    _163.z = (_133.z>=_164.z);
-                    _163.w = (_133.w>=_164.w);
-                  _159.x = (_160.x&&_163.x);
-                  _159.y = (_160.y&&_163.y);
-                  _159.z = (_160.z&&_163.z);
-                  _159.w = (_160.w&&_163.w);
-                ushort4 _165;
-                  ushort4 _166;
-                    int4 _167 = make_int4(24, 24, 24, 24);
-                    int4 _168 = make_int4(0, 0, 0, 0);
-                    _166.x = (_167.x<_168.x);
-                    _166.y = (_167.y<_168.y);
-                    _166.z = (_167.z<_168.z);
-                    _166.w = (_167.w<_168.w);
-                  ushort4 _169;
-                    int4 _170 = make_int4(0, 0, 0, 0);
-                    _169.x = (_133.x<=_170.x);
-                    _169.y = (_133.y<=_170.y);
-                    _169.z = (_133.z<=_170.z);
-                    _169.w = (_133.w<=_170.w);
-                  _165.x = (_166.x&&_169.x);
-                  _165.y = (_166.y&&_169.y);
-                  _165.z = (_166.z&&_169.z);
-                  _165.w = (_166.w&&_169.w);
-                _158.x = (_159.x||_165.x);
-                _158.y = (_159.y||_165.y);
-                _158.z = (_159.z||_165.z);
-                _158.w = (_159.w||_165.w);
-              int4 _171;
-                int4 _172 = make_int4(24, 24, 24, 24);
-                _171.x = (_133.x+_172.x);
-                _171.y = (_133.y+_172.y);
-                _171.z = (_133.z+_172.z);
-                _171.w = (_133.w+_172.w);
-              _157.x = (bool(_158.x)?_133.x:_171.x);
-              _157.y = (bool(_158.y)?_133.y:_171.y);
-              _157.z = (bool(_158.z)?_133.z:_171.z);
-              _157.w = (bool(_158.w)?_133.w:_171.w);
-              int4 _173 = make_int4(3, 3, 3, 3);
-              _132.x = (_157.x*_173.x);
-              _132.y = (_157.y*_173.y);
-              _132.z = (_157.z*_173.z);
-              _132.w = (_157.w*_173.w);
-            _130.x = (_131.x+_132.x);
-            _130.y = (_131.y+_132.y);
-            _130.z = (_131.z+_132.z);
-            _130.w = (_131.w+_132.w);
-          int4 _174;
-            int4 _175 = make_int4(((((int)threadIdx.x) + 128))+(1*0), ((((int)threadIdx.x) + 128))+(1*1), ((((int)threadIdx.x) + 128))+(1*2), ((((int)threadIdx.x) + 128))+(1*3));
-            int4 _176 = make_int4(3, 3, 3, 3);
-            _174.x = (_175.x%_176.x);
-            _174.y = (_175.y%_176.y);
-            _174.z = (_175.z%_176.z);
-            _174.w = (_175.w%_176.w);
-          int4 _177;
-          ushort4 _178;
-            ushort4 _179;
-              ushort4 _180;
-                int4 _181 = make_int4(3, 3, 3, 3);
-                int4 _182 = make_int4(0, 0, 0, 0);
-                _180.x = (_181.x>=_182.x);
-                _180.y = (_181.y>=_182.y);
-                _180.z = (_181.z>=_182.z);
-                _180.w = (_181.w>=_182.w);
-              ushort4 _183;
-                int4 _184 = make_int4(0, 0, 0, 0);
-                _183.x = (_174.x>=_184.x);
-                _183.y = (_174.y>=_184.y);
-                _183.z = (_174.z>=_184.z);
-                _183.w = (_174.w>=_184.w);
-              _179.x = (_180.x&&_183.x);
-              _179.y = (_180.y&&_183.y);
-              _179.z = (_180.z&&_183.z);
-              _179.w = (_180.w&&_183.w);
-            ushort4 _185;
-              ushort4 _186;
-                int4 _187 = make_int4(3, 3, 3, 3);
-                int4 _188 = make_int4(0, 0, 0, 0);
-                _186.x = (_187.x<_188.x);
-                _186.y = (_187.y<_188.y);
-                _186.z = (_187.z<_188.z);
-                _186.w = (_187.w<_188.w);
-              ushort4 _189;
-                int4 _190 = make_int4(0, 0, 0, 0);
-                _189.x = (_174.x<=_190.x);
-                _189.y = (_174.y<=_190.y);
-                _189.z = (_174.z<=_190.z);
-                _189.w = (_174.w<=_190.w);
-              _185.x = (_186.x&&_189.x);
-              _185.y = (_186.y&&_189.y);
-              _185.z = (_186.z&&_189.z);
-              _185.w = (_186.w&&_189.w);
-            _178.x = (_179.x||_185.x);
-            _178.y = (_179.y||_185.y);
-            _178.z = (_179.z||_185.z);
-            _178.w = (_179.w||_185.w);
-          int4 _191;
-            int4 _192 = make_int4(3, 3, 3, 3);
-            _191.x = (_174.x+_192.x);
-            _191.y = (_174.y+_192.y);
-            _191.z = (_174.z+_192.z);
-            _191.w = (_174.w+_192.w);
-          _177.x = (bool(_178.x)?_174.x:_191.x);
-          _177.y = (bool(_178.y)?_174.y:_191.y);
-          _177.z = (bool(_178.z)?_174.z:_191.z);
-          _177.w = (bool(_178.w)?_174.w:_191.w);
-          _129.x = (_130.x+_177.x);
-          _129.y = (_130.y+_177.y);
-          _129.z = (_130.z+_177.z);
-          _129.w = (_130.w+_177.w);
-        *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 512)) = make_float4(kernel[_129.x],kernel[_129.y],kernel[_129.z],kernel[_129.w]);
-        int4 _193;
-          int4 _194;
-            int4 _195 = make_int4(((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 768) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 768) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 768) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 768) / 72) * 4608)) + (rc_outer_outer * 72)));
-            int4 _196;
-              int4 _197;
-                int4 _198;
-                  int4 _199 = make_int4((((((int)threadIdx.x) * 4) + 768))+(1*0), (((((int)threadIdx.x) * 4) + 768))+(1*1), (((((int)threadIdx.x) * 4) + 768))+(1*2), (((((int)threadIdx.x) * 4) + 768))+(1*3));
-                  int4 _200 = make_int4(3, 3, 3, 3);
-                  _198.x = (_199.x%_200.x);
-                  _198.y = (_199.y%_200.y);
-                  _198.z = (_199.z%_200.z);
-                  _198.w = (_199.w%_200.w);
-                int4 _201;
-                  int4 _202 = make_int4((((((int)threadIdx.x) * 4) + 768))+(1*0), (((((int)threadIdx.x) * 4) + 768))+(1*1), (((((int)threadIdx.x) * 4) + 768))+(1*2), (((((int)threadIdx.x) * 4) + 768))+(1*3));
-                  int4 _203 = make_int4(3, 3, 3, 3);
-                  _201.x = (_202.x/_203.x);
-                  _201.y = (_202.y/_203.y);
-                  _201.z = (_202.z/_203.z);
-                  _201.w = (_202.w/_203.w);
-                int4 _204;
-                ushort4 _205;
-                  ushort4 _206;
-                    ushort4 _207;
-                      int4 _208 = make_int4(3, 3, 3, 3);
-                      int4 _209 = make_int4(0, 0, 0, 0);
-                      _207.x = (_208.x>=_209.x);
-                      _207.y = (_208.y>=_209.y);
-                      _207.z = (_208.z>=_209.z);
-                      _207.w = (_208.w>=_209.w);
-                    ushort4 _210;
-                      int4 _211 = make_int4(0, 0, 0, 0);
-                      _210.x = (_198.x>=_211.x);
-                      _210.y = (_198.y>=_211.y);
-                      _210.z = (_198.z>=_211.z);
-                      _210.w = (_198.w>=_211.w);
-                    _206.x = (_207.x&&_210.x);
-                    _206.y = (_207.y&&_210.y);
-                    _206.z = (_207.z&&_210.z);
-                    _206.w = (_207.w&&_210.w);
-                  ushort4 _212;
-                    ushort4 _213;
-                      int4 _214 = make_int4(3, 3, 3, 3);
-                      int4 _215 = make_int4(0, 0, 0, 0);
-                      _213.x = (_214.x<_215.x);
-                      _213.y = (_214.y<_215.y);
-                      _213.z = (_214.z<_215.z);
-                      _213.w = (_214.w<_215.w);
-                    ushort4 _216;
-                      int4 _217 = make_int4(0, 0, 0, 0);
-                      _216.x = (_198.x<=_217.x);
-                      _216.y = (_198.y<=_217.y);
-                      _216.z = (_198.z<=_217.z);
-                      _216.w = (_198.w<=_217.w);
-                    _212.x = (_213.x&&_216.x);
-                    _212.y = (_213.y&&_216.y);
-                    _212.z = (_213.z&&_216.z);
-                    _212.w = (_213.w&&_216.w);
-                  _205.x = (_206.x||_212.x);
-                  _205.y = (_206.y||_212.y);
-                  _205.z = (_206.z||_212.z);
-                  _205.w = (_206.w||_212.w);
-                int4 _218;
-                  int4 _219 = make_int4(1, 1, 1, 1);
-                  _218.x = (_201.x-_219.x);
-                  _218.y = (_201.y-_219.y);
-                  _218.z = (_201.z-_219.z);
-                  _218.w = (_201.w-_219.w);
-                _204.x = (bool(_205.x)?_201.x:_218.x);
-                _204.y = (bool(_205.y)?_201.y:_218.y);
-                _204.z = (bool(_205.z)?_201.z:_218.z);
-                _204.w = (bool(_205.w)?_201.w:_218.w);
-                int4 _220 = make_int4(24, 24, 24, 24);
-                _197.x = (_204.x%_220.x);
-                _197.y = (_204.y%_220.y);
-                _197.z = (_204.z%_220.z);
-                _197.w = (_204.w%_220.w);
-              int4 _221;
-              ushort4 _222;
-                ushort4 _223;
-                  ushort4 _224;
-                    int4 _225 = make_int4(24, 24, 24, 24);
-                    int4 _226 = make_int4(0, 0, 0, 0);
-                    _224.x = (_225.x>=_226.x);
-                    _224.y = (_225.y>=_226.y);
-                    _224.z = (_225.z>=_226.z);
-                    _224.w = (_225.w>=_226.w);
-                  ushort4 _227;
-                    int4 _228 = make_int4(0, 0, 0, 0);
-                    _227.x = (_197.x>=_228.x);
-                    _227.y = (_197.y>=_228.y);
-                    _227.z = (_197.z>=_228.z);
-                    _227.w = (_197.w>=_228.w);
-                  _223.x = (_224.x&&_227.x);
-                  _223.y = (_224.y&&_227.y);
-                  _223.z = (_224.z&&_227.z);
-                  _223.w = (_224.w&&_227.w);
-                ushort4 _229;
-                  ushort4 _230;
-                    int4 _231 = make_int4(24, 24, 24, 24);
-                    int4 _232 = make_int4(0, 0, 0, 0);
-                    _230.x = (_231.x<_232.x);
-                    _230.y = (_231.y<_232.y);
-                    _230.z = (_231.z<_232.z);
-                    _230.w = (_231.w<_232.w);
-                  ushort4 _233;
-                    int4 _234 = make_int4(0, 0, 0, 0);
-                    _233.x = (_197.x<=_234.x);
-                    _233.y = (_197.y<=_234.y);
-                    _233.z = (_197.z<=_234.z);
-                    _233.w = (_197.w<=_234.w);
-                  _229.x = (_230.x&&_233.x);
-                  _229.y = (_230.y&&_233.y);
-                  _229.z = (_230.z&&_233.z);
-                  _229.w = (_230.w&&_233.w);
-                _222.x = (_223.x||_229.x);
-                _222.y = (_223.y||_229.y);
-                _222.z = (_223.z||_229.z);
-                _222.w = (_223.w||_229.w);
-              int4 _235;
-                int4 _236 = make_int4(24, 24, 24, 24);
-                _235.x = (_197.x+_236.x);
-                _235.y = (_197.y+_236.y);
-                _235.z = (_197.z+_236.z);
-                _235.w = (_197.w+_236.w);
-              _221.x = (bool(_222.x)?_197.x:_235.x);
-              _221.y = (bool(_222.y)?_197.y:_235.y);
-              _221.z = (bool(_222.z)?_197.z:_235.z);
-              _221.w = (bool(_222.w)?_197.w:_235.w);
-              int4 _237 = make_int4(3, 3, 3, 3);
-              _196.x = (_221.x*_237.x);
-              _196.y = (_221.y*_237.y);
-              _196.z = (_221.z*_237.z);
-              _196.w = (_221.w*_237.w);
-            _194.x = (_195.x+_196.x);
-            _194.y = (_195.y+_196.y);
-            _194.z = (_195.z+_196.z);
-            _194.w = (_195.w+_196.w);
-          int4 _238;
-            int4 _239 = make_int4(((((int)threadIdx.x) + 192))+(1*0), ((((int)threadIdx.x) + 192))+(1*1), ((((int)threadIdx.x) + 192))+(1*2), ((((int)threadIdx.x) + 192))+(1*3));
-            int4 _240 = make_int4(3, 3, 3, 3);
-            _238.x = (_239.x%_240.x);
-            _238.y = (_239.y%_240.y);
-            _238.z = (_239.z%_240.z);
-            _238.w = (_239.w%_240.w);
-          int4 _241;
-          ushort4 _242;
-            ushort4 _243;
-              ushort4 _244;
-                int4 _245 = make_int4(3, 3, 3, 3);
-                int4 _246 = make_int4(0, 0, 0, 0);
-                _244.x = (_245.x>=_246.x);
-                _244.y = (_245.y>=_246.y);
-                _244.z = (_245.z>=_246.z);
-                _244.w = (_245.w>=_246.w);
-              ushort4 _247;
-                int4 _248 = make_int4(0, 0, 0, 0);
-                _247.x = (_238.x>=_248.x);
-                _247.y = (_238.y>=_248.y);
-                _247.z = (_238.z>=_248.z);
-                _247.w = (_238.w>=_248.w);
-              _243.x = (_244.x&&_247.x);
-              _243.y = (_244.y&&_247.y);
-              _243.z = (_244.z&&_247.z);
-              _243.w = (_244.w&&_247.w);
-            ushort4 _249;
-              ushort4 _250;
-                int4 _251 = make_int4(3, 3, 3, 3);
-                int4 _252 = make_int4(0, 0, 0, 0);
-                _250.x = (_251.x<_252.x);
-                _250.y = (_251.y<_252.y);
-                _250.z = (_251.z<_252.z);
-                _250.w = (_251.w<_252.w);
-              ushort4 _253;
-                int4 _254 = make_int4(0, 0, 0, 0);
-                _253.x = (_238.x<=_254.x);
-                _253.y = (_238.y<=_254.y);
-                _253.z = (_238.z<=_254.z);
-                _253.w = (_238.w<=_254.w);
-              _249.x = (_250.x&&_253.x);
-              _249.y = (_250.y&&_253.y);
-              _249.z = (_250.z&&_253.z);
-              _249.w = (_250.w&&_253.w);
-            _242.x = (_243.x||_249.x);
-            _242.y = (_243.y||_249.y);
-            _242.z = (_243.z||_249.z);
-            _242.w = (_243.w||_249.w);
-          int4 _255;
-            int4 _256 = make_int4(3, 3, 3, 3);
-            _255.x = (_238.x+_256.x);
-            _255.y = (_238.y+_256.y);
-            _255.z = (_238.z+_256.z);
-            _255.w = (_238.w+_256.w);
-          _241.x = (bool(_242.x)?_238.x:_255.x);
-          _241.y = (bool(_242.y)?_238.y:_255.y);
-          _241.z = (bool(_242.z)?_238.z:_255.z);
-          _241.w = (bool(_242.w)?_238.w:_255.w);
-          _193.x = (_194.x+_241.x);
-          _193.y = (_194.y+_241.y);
-          _193.z = (_194.z+_241.z);
-          _193.w = (_194.w+_241.w);
-        *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 768)) = make_float4(kernel[_193.x],kernel[_193.y],kernel[_193.z],kernel[_193.w]);
-        int4 _257;
-          int4 _258;
-            int4 _259 = make_int4(((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 1024) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 1024) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 1024) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 1024) / 72) * 4608)) + (rc_outer_outer * 72)));
-            int4 _260;
-              int4 _261;
-                int4 _262;
-                  int4 _263 = make_int4((((((int)threadIdx.x) * 4) + 1024))+(1*0), (((((int)threadIdx.x) * 4) + 1024))+(1*1), (((((int)threadIdx.x) * 4) + 1024))+(1*2), (((((int)threadIdx.x) * 4) + 1024))+(1*3));
-                  int4 _264 = make_int4(3, 3, 3, 3);
-                  _262.x = (_263.x%_264.x);
-                  _262.y = (_263.y%_264.y);
-                  _262.z = (_263.z%_264.z);
-                  _262.w = (_263.w%_264.w);
-                int4 _265;
-                  int4 _266 = make_int4((((((int)threadIdx.x) * 4) + 1024))+(1*0), (((((int)threadIdx.x) * 4) + 1024))+(1*1), (((((int)threadIdx.x) * 4) + 1024))+(1*2), (((((int)threadIdx.x) * 4) + 1024))+(1*3));
-                  int4 _267 = make_int4(3, 3, 3, 3);
-                  _265.x = (_266.x/_267.x);
-                  _265.y = (_266.y/_267.y);
-                  _265.z = (_266.z/_267.z);
-                  _265.w = (_266.w/_267.w);
-                int4 _268;
-                ushort4 _269;
-                  ushort4 _270;
-                    ushort4 _271;
-                      int4 _272 = make_int4(3, 3, 3, 3);
-                      int4 _273 = make_int4(0, 0, 0, 0);
-                      _271.x = (_272.x>=_273.x);
-                      _271.y = (_272.y>=_273.y);
-                      _271.z = (_272.z>=_273.z);
-                      _271.w = (_272.w>=_273.w);
-                    ushort4 _274;
-                      int4 _275 = make_int4(0, 0, 0, 0);
-                      _274.x = (_262.x>=_275.x);
-                      _274.y = (_262.y>=_275.y);
-                      _274.z = (_262.z>=_275.z);
-                      _274.w = (_262.w>=_275.w);
-                    _270.x = (_271.x&&_274.x);
-                    _270.y = (_271.y&&_274.y);
-                    _270.z = (_271.z&&_274.z);
-                    _270.w = (_271.w&&_274.w);
-                  ushort4 _276;
-                    ushort4 _277;
-                      int4 _278 = make_int4(3, 3, 3, 3);
-                      int4 _279 = make_int4(0, 0, 0, 0);
-                      _277.x = (_278.x<_279.x);
-                      _277.y = (_278.y<_279.y);
-                      _277.z = (_278.z<_279.z);
-                      _277.w = (_278.w<_279.w);
-                    ushort4 _280;
-                      int4 _281 = make_int4(0, 0, 0, 0);
-                      _280.x = (_262.x<=_281.x);
-                      _280.y = (_262.y<=_281.y);
-                      _280.z = (_262.z<=_281.z);
-                      _280.w = (_262.w<=_281.w);
-                    _276.x = (_277.x&&_280.x);
-                    _276.y = (_277.y&&_280.y);
-                    _276.z = (_277.z&&_280.z);
-                    _276.w = (_277.w&&_280.w);
-                  _269.x = (_270.x||_276.x);
-                  _269.y = (_270.y||_276.y);
-                  _269.z = (_270.z||_276.z);
-                  _269.w = (_270.w||_276.w);
-                int4 _282;
-                  int4 _283 = make_int4(1, 1, 1, 1);
-                  _282.x = (_265.x-_283.x);
-                  _282.y = (_265.y-_283.y);
-                  _282.z = (_265.z-_283.z);
-                  _282.w = (_265.w-_283.w);
-                _268.x = (bool(_269.x)?_265.x:_282.x);
-                _268.y = (bool(_269.y)?_265.y:_282.y);
-                _268.z = (bool(_269.z)?_265.z:_282.z);
-                _268.w = (bool(_269.w)?_265.w:_282.w);
-                int4 _284 = make_int4(24, 24, 24, 24);
-                _261.x = (_268.x%_284.x);
-                _261.y = (_268.y%_284.y);
-                _261.z = (_268.z%_284.z);
-                _261.w = (_268.w%_284.w);
-              int4 _285;
-              ushort4 _286;
-                ushort4 _287;
-                  ushort4 _288;
-                    int4 _289 = make_int4(24, 24, 24, 24);
-                    int4 _290 = make_int4(0, 0, 0, 0);
-                    _288.x = (_289.x>=_290.x);
-                    _288.y = (_289.y>=_290.y);
-                    _288.z = (_289.z>=_290.z);
-                    _288.w = (_289.w>=_290.w);
-                  ushort4 _291;
-                    int4 _292 = make_int4(0, 0, 0, 0);
-                    _291.x = (_261.x>=_292.x);
-                    _291.y = (_261.y>=_292.y);
-                    _291.z = (_261.z>=_292.z);
-                    _291.w = (_261.w>=_292.w);
-                  _287.x = (_288.x&&_291.x);
-                  _287.y = (_288.y&&_291.y);
-                  _287.z = (_288.z&&_291.z);
-                  _287.w = (_288.w&&_291.w);
-                ushort4 _293;
-                  ushort4 _294;
-                    int4 _295 = make_int4(24, 24, 24, 24);
-                    int4 _296 = make_int4(0, 0, 0, 0);
-                    _294.x = (_295.x<_296.x);
-                    _294.y = (_295.y<_296.y);
-                    _294.z = (_295.z<_296.z);
-                    _294.w = (_295.w<_296.w);
-                  ushort4 _297;
-                    int4 _298 = make_int4(0, 0, 0, 0);
-                    _297.x = (_261.x<=_298.x);
-                    _297.y = (_261.y<=_298.y);
-                    _297.z = (_261.z<=_298.z);
-                    _297.w = (_261.w<=_298.w);
-                  _293.x = (_294.x&&_297.x);
-                  _293.y = (_294.y&&_297.y);
-                  _293.z = (_294.z&&_297.z);
-                  _293.w = (_294.w&&_297.w);
-                _286.x = (_287.x||_293.x);
-                _286.y = (_287.y||_293.y);
-                _286.z = (_287.z||_293.z);
-                _286.w = (_287.w||_293.w);
-              int4 _299;
-                int4 _300 = make_int4(24, 24, 24, 24);
-                _299.x = (_261.x+_300.x);
-                _299.y = (_261.y+_300.y);
-                _299.z = (_261.z+_300.z);
-                _299.w = (_261.w+_300.w);
-              _285.x = (bool(_286.x)?_261.x:_299.x);
-              _285.y = (bool(_286.y)?_261.y:_299.y);
-              _285.z = (bool(_286.z)?_261.z:_299.z);
-              _285.w = (bool(_286.w)?_261.w:_299.w);
-              int4 _301 = make_int4(3, 3, 3, 3);
-              _260.x = (_285.x*_301.x);
-              _260.y = (_285.y*_301.y);
-              _260.z = (_285.z*_301.z);
-              _260.w = (_285.w*_301.w);
-            _258.x = (_259.x+_260.x);
-            _258.y = (_259.y+_260.y);
-            _258.z = (_259.z+_260.z);
-            _258.w = (_259.w+_260.w);
-          int4 _302;
-            int4 _303 = make_int4(((((int)threadIdx.x) + 256))+(1*0), ((((int)threadIdx.x) + 256))+(1*1), ((((int)threadIdx.x) + 256))+(1*2), ((((int)threadIdx.x) + 256))+(1*3));
-            int4 _304 = make_int4(3, 3, 3, 3);
-            _302.x = (_303.x%_304.x);
-            _302.y = (_303.y%_304.y);
-            _302.z = (_303.z%_304.z);
-            _302.w = (_303.w%_304.w);
-          int4 _305;
-          ushort4 _306;
-            ushort4 _307;
-              ushort4 _308;
-                int4 _309 = make_int4(3, 3, 3, 3);
-                int4 _310 = make_int4(0, 0, 0, 0);
-                _308.x = (_309.x>=_310.x);
-                _308.y = (_309.y>=_310.y);
-                _308.z = (_309.z>=_310.z);
-                _308.w = (_309.w>=_310.w);
-              ushort4 _311;
-                int4 _312 = make_int4(0, 0, 0, 0);
-                _311.x = (_302.x>=_312.x);
-                _311.y = (_302.y>=_312.y);
-                _311.z = (_302.z>=_312.z);
-                _311.w = (_302.w>=_312.w);
-              _307.x = (_308.x&&_311.x);
-              _307.y = (_308.y&&_311.y);
-              _307.z = (_308.z&&_311.z);
-              _307.w = (_308.w&&_311.w);
-            ushort4 _313;
-              ushort4 _314;
-                int4 _315 = make_int4(3, 3, 3, 3);
-                int4 _316 = make_int4(0, 0, 0, 0);
-                _314.x = (_315.x<_316.x);
-                _314.y = (_315.y<_316.y);
-                _314.z = (_315.z<_316.z);
-                _314.w = (_315.w<_316.w);
-              ushort4 _317;
-                int4 _318 = make_int4(0, 0, 0, 0);
-                _317.x = (_302.x<=_318.x);
-                _317.y = (_302.y<=_318.y);
-                _317.z = (_302.z<=_318.z);
-                _317.w = (_302.w<=_318.w);
-              _313.x = (_314.x&&_317.x);
-              _313.y = (_314.y&&_317.y);
-              _313.z = (_314.z&&_317.z);
-              _313.w = (_314.w&&_317.w);
-            _306.x = (_307.x||_313.x);
-            _306.y = (_307.y||_313.y);
-            _306.z = (_307.z||_313.z);
-            _306.w = (_307.w||_313.w);
-          int4 _319;
-            int4 _320 = make_int4(3, 3, 3, 3);
-            _319.x = (_302.x+_320.x);
-            _319.y = (_302.y+_320.y);
-            _319.z = (_302.z+_320.z);
-            _319.w = (_302.w+_320.w);
-          _305.x = (bool(_306.x)?_302.x:_319.x);
-          _305.y = (bool(_306.y)?_302.y:_319.y);
-          _305.z = (bool(_306.z)?_302.z:_319.z);
-          _305.w = (bool(_306.w)?_302.w:_319.w);
-          _257.x = (_258.x+_305.x);
-          _257.y = (_258.y+_305.y);
-          _257.z = (_258.z+_305.z);
-          _257.w = (_258.w+_305.w);
-        *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1024)) = make_float4(kernel[_257.x],kernel[_257.y],kernel[_257.z],kernel[_257.w]);
-        int4 _321;
-          int4 _322;
-            int4 _323 = make_int4(((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 1280) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 1280) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 1280) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 1280) / 72) * 4608)) + (rc_outer_outer * 72)));
-            int4 _324;
-              int4 _325;
-                int4 _326;
-                  int4 _327 = make_int4((((((int)threadIdx.x) * 4) + 1280))+(1*0), (((((int)threadIdx.x) * 4) + 1280))+(1*1), (((((int)threadIdx.x) * 4) + 1280))+(1*2), (((((int)threadIdx.x) * 4) + 1280))+(1*3));
-                  int4 _328 = make_int4(3, 3, 3, 3);
-                  _326.x = (_327.x%_328.x);
-                  _326.y = (_327.y%_328.y);
-                  _326.z = (_327.z%_328.z);
-                  _326.w = (_327.w%_328.w);
-                int4 _329;
-                  int4 _330 = make_int4((((((int)threadIdx.x) * 4) + 1280))+(1*0), (((((int)threadIdx.x) * 4) + 1280))+(1*1), (((((int)threadIdx.x) * 4) + 1280))+(1*2), (((((int)threadIdx.x) * 4) + 1280))+(1*3));
-                  int4 _331 = make_int4(3, 3, 3, 3);
-                  _329.x = (_330.x/_331.x);
-                  _329.y = (_330.y/_331.y);
-                  _329.z = (_330.z/_331.z);
-                  _329.w = (_330.w/_331.w);
-                int4 _332;
-                ushort4 _333;
-                  ushort4 _334;
-                    ushort4 _335;
-                      int4 _336 = make_int4(3, 3, 3, 3);
-                      int4 _337 = make_int4(0, 0, 0, 0);
-                      _335.x = (_336.x>=_337.x);
-                      _335.y = (_336.y>=_337.y);
-                      _335.z = (_336.z>=_337.z);
-                      _335.w = (_336.w>=_337.w);
-                    ushort4 _338;
-                      int4 _339 = make_int4(0, 0, 0, 0);
-                      _338.x = (_326.x>=_339.x);
-                      _338.y = (_326.y>=_339.y);
-                      _338.z = (_326.z>=_339.z);
-                      _338.w = (_326.w>=_339.w);
-                    _334.x = (_335.x&&_338.x);
-                    _334.y = (_335.y&&_338.y);
-                    _334.z = (_335.z&&_338.z);
-                    _334.w = (_335.w&&_338.w);
-                  ushort4 _340;
-                    ushort4 _341;
-                      int4 _342 = make_int4(3, 3, 3, 3);
-                      int4 _343 = make_int4(0, 0, 0, 0);
-                      _341.x = (_342.x<_343.x);
-                      _341.y = (_342.y<_343.y);
-                      _341.z = (_342.z<_343.z);
-                      _341.w = (_342.w<_343.w);
-                    ushort4 _344;
-                      int4 _345 = make_int4(0, 0, 0, 0);
-                      _344.x = (_326.x<=_345.x);
-                      _344.y = (_326.y<=_345.y);
-                      _344.z = (_326.z<=_345.z);
-                      _344.w = (_326.w<=_345.w);
-                    _340.x = (_341.x&&_344.x);
-                    _340.y = (_341.y&&_344.y);
-                    _340.z = (_341.z&&_344.z);
-                    _340.w = (_341.w&&_344.w);
-                  _333.x = (_334.x||_340.x);
-                  _333.y = (_334.y||_340.y);
-                  _333.z = (_334.z||_340.z);
-                  _333.w = (_334.w||_340.w);
-                int4 _346;
-                  int4 _347 = make_int4(1, 1, 1, 1);
-                  _346.x = (_329.x-_347.x);
-                  _346.y = (_329.y-_347.y);
-                  _346.z = (_329.z-_347.z);
-                  _346.w = (_329.w-_347.w);
-                _332.x = (bool(_333.x)?_329.x:_346.x);
-                _332.y = (bool(_333.y)?_329.y:_346.y);
-                _332.z = (bool(_333.z)?_329.z:_346.z);
-                _332.w = (bool(_333.w)?_329.w:_346.w);
-                int4 _348 = make_int4(24, 24, 24, 24);
-                _325.x = (_332.x%_348.x);
-                _325.y = (_332.y%_348.y);
-                _325.z = (_332.z%_348.z);
-                _325.w = (_332.w%_348.w);
-              int4 _349;
-              ushort4 _350;
-                ushort4 _351;
-                  ushort4 _352;
-                    int4 _353 = make_int4(24, 24, 24, 24);
-                    int4 _354 = make_int4(0, 0, 0, 0);
-                    _352.x = (_353.x>=_354.x);
-                    _352.y = (_353.y>=_354.y);
-                    _352.z = (_353.z>=_354.z);
-                    _352.w = (_353.w>=_354.w);
-                  ushort4 _355;
-                    int4 _356 = make_int4(0, 0, 0, 0);
-                    _355.x = (_325.x>=_356.x);
-                    _355.y = (_325.y>=_356.y);
-                    _355.z = (_325.z>=_356.z);
-                    _355.w = (_325.w>=_356.w);
-                  _351.x = (_352.x&&_355.x);
-                  _351.y = (_352.y&&_355.y);
-                  _351.z = (_352.z&&_355.z);
-                  _351.w = (_352.w&&_355.w);
-                ushort4 _357;
-                  ushort4 _358;
-                    int4 _359 = make_int4(24, 24, 24, 24);
-                    int4 _360 = make_int4(0, 0, 0, 0);
-                    _358.x = (_359.x<_360.x);
-                    _358.y = (_359.y<_360.y);
-                    _358.z = (_359.z<_360.z);
-                    _358.w = (_359.w<_360.w);
-                  ushort4 _361;
-                    int4 _362 = make_int4(0, 0, 0, 0);
-                    _361.x = (_325.x<=_362.x);
-                    _361.y = (_325.y<=_362.y);
-                    _361.z = (_325.z<=_362.z);
-                    _361.w = (_325.w<=_362.w);
-                  _357.x = (_358.x&&_361.x);
-                  _357.y = (_358.y&&_361.y);
-                  _357.z = (_358.z&&_361.z);
-                  _357.w = (_358.w&&_361.w);
-                _350.x = (_351.x||_357.x);
-                _350.y = (_351.y||_357.y);
-                _350.z = (_351.z||_357.z);
-                _350.w = (_351.w||_357.w);
-              int4 _363;
-                int4 _364 = make_int4(24, 24, 24, 24);
-                _363.x = (_325.x+_364.x);
-                _363.y = (_325.y+_364.y);
-                _363.z = (_325.z+_364.z);
-                _363.w = (_325.w+_364.w);
-              _349.x = (bool(_350.x)?_325.x:_363.x);
-              _349.y = (bool(_350.y)?_325.y:_363.y);
-              _349.z = (bool(_350.z)?_325.z:_363.z);
-              _349.w = (bool(_350.w)?_325.w:_363.w);
-              int4 _365 = make_int4(3, 3, 3, 3);
-              _324.x = (_349.x*_365.x);
-              _324.y = (_349.y*_365.y);
-              _324.z = (_349.z*_365.z);
-              _324.w = (_349.w*_365.w);
-            _322.x = (_323.x+_324.x);
-            _322.y = (_323.y+_324.y);
-            _322.z = (_323.z+_324.z);
-            _322.w = (_323.w+_324.w);
-          int4 _366;
-            int4 _367 = make_int4(((((int)threadIdx.x) + 320))+(1*0), ((((int)threadIdx.x) + 320))+(1*1), ((((int)threadIdx.x) + 320))+(1*2), ((((int)threadIdx.x) + 320))+(1*3));
-            int4 _368 = make_int4(3, 3, 3, 3);
-            _366.x = (_367.x%_368.x);
-            _366.y = (_367.y%_368.y);
-            _366.z = (_367.z%_368.z);
-            _366.w = (_367.w%_368.w);
-          int4 _369;
-          ushort4 _370;
-            ushort4 _371;
-              ushort4 _372;
-                int4 _373 = make_int4(3, 3, 3, 3);
-                int4 _374 = make_int4(0, 0, 0, 0);
-                _372.x = (_373.x>=_374.x);
-                _372.y = (_373.y>=_374.y);
-                _372.z = (_373.z>=_374.z);
-                _372.w = (_373.w>=_374.w);
-              ushort4 _375;
-                int4 _376 = make_int4(0, 0, 0, 0);
-                _375.x = (_366.x>=_376.x);
-                _375.y = (_366.y>=_376.y);
-                _375.z = (_366.z>=_376.z);
-                _375.w = (_366.w>=_376.w);
-              _371.x = (_372.x&&_375.x);
-              _371.y = (_372.y&&_375.y);
-              _371.z = (_372.z&&_375.z);
-              _371.w = (_372.w&&_375.w);
-            ushort4 _377;
-              ushort4 _378;
-                int4 _379 = make_int4(3, 3, 3, 3);
-                int4 _380 = make_int4(0, 0, 0, 0);
-                _378.x = (_379.x<_380.x);
-                _378.y = (_379.y<_380.y);
-                _378.z = (_379.z<_380.z);
-                _378.w = (_379.w<_380.w);
-              ushort4 _381;
-                int4 _382 = make_int4(0, 0, 0, 0);
-                _381.x = (_366.x<=_382.x);
-                _381.y = (_366.y<=_382.y);
-                _381.z = (_366.z<=_382.z);
-                _381.w = (_366.w<=_382.w);
-              _377.x = (_378.x&&_381.x);
-              _377.y = (_378.y&&_381.y);
-              _377.z = (_378.z&&_381.z);
-              _377.w = (_378.w&&_381.w);
-            _370.x = (_371.x||_377.x);
-            _370.y = (_371.y||_377.y);
-            _370.z = (_371.z||_377.z);
-            _370.w = (_371.w||_377.w);
-          int4 _383;
-            int4 _384 = make_int4(3, 3, 3, 3);
-            _383.x = (_366.x+_384.x);
-            _383.y = (_366.y+_384.y);
-            _383.z = (_366.z+_384.z);
-            _383.w = (_366.w+_384.w);
-          _369.x = (bool(_370.x)?_366.x:_383.x);
-          _369.y = (bool(_370.y)?_366.y:_383.y);
-          _369.z = (bool(_370.z)?_366.z:_383.z);
-          _369.w = (bool(_370.w)?_366.w:_383.w);
-          _321.x = (_322.x+_369.x);
-          _321.y = (_322.y+_369.y);
-          _321.z = (_322.z+_369.z);
-          _321.w = (_322.w+_369.w);
-        *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1280)) = make_float4(kernel[_321.x],kernel[_321.y],kernel[_321.z],kernel[_321.w]);
-        int4 _385;
-          int4 _386;
-            int4 _387 = make_int4(((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 1536) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 1536) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 1536) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 1536) / 72) * 4608)) + (rc_outer_outer * 72)));
-            int4 _388;
-              int4 _389;
-                int4 _390;
-                  int4 _391 = make_int4((((((int)threadIdx.x) * 4) + 1536))+(1*0), (((((int)threadIdx.x) * 4) + 1536))+(1*1), (((((int)threadIdx.x) * 4) + 1536))+(1*2), (((((int)threadIdx.x) * 4) + 1536))+(1*3));
-                  int4 _392 = make_int4(3, 3, 3, 3);
-                  _390.x = (_391.x%_392.x);
-                  _390.y = (_391.y%_392.y);
-                  _390.z = (_391.z%_392.z);
-                  _390.w = (_391.w%_392.w);
-                int4 _393;
-                  int4 _394 = make_int4((((((int)threadIdx.x) * 4) + 1536))+(1*0), (((((int)threadIdx.x) * 4) + 1536))+(1*1), (((((int)threadIdx.x) * 4) + 1536))+(1*2), (((((int)threadIdx.x) * 4) + 1536))+(1*3));
-                  int4 _395 = make_int4(3, 3, 3, 3);
-                  _393.x = (_394.x/_395.x);
-                  _393.y = (_394.y/_395.y);
-                  _393.z = (_394.z/_395.z);
-                  _393.w = (_394.w/_395.w);
-                int4 _396;
-                ushort4 _397;
-                  ushort4 _398;
-                    ushort4 _399;
-                      int4 _400 = make_int4(3, 3, 3, 3);
-                      int4 _401 = make_int4(0, 0, 0, 0);
-                      _399.x = (_400.x>=_401.x);
-                      _399.y = (_400.y>=_401.y);
-                      _399.z = (_400.z>=_401.z);
-                      _399.w = (_400.w>=_401.w);
-                    ushort4 _402;
-                      int4 _403 = make_int4(0, 0, 0, 0);
-                      _402.x = (_390.x>=_403.x);
-                      _402.y = (_390.y>=_403.y);
-                      _402.z = (_390.z>=_403.z);
-                      _402.w = (_390.w>=_403.w);
-                    _398.x = (_399.x&&_402.x);
-                    _398.y = (_399.y&&_402.y);
-                    _398.z = (_399.z&&_402.z);
-                    _398.w = (_399.w&&_402.w);
-                  ushort4 _404;
-                    ushort4 _405;
-                      int4 _406 = make_int4(3, 3, 3, 3);
-                      int4 _407 = make_int4(0, 0, 0, 0);
-                      _405.x = (_406.x<_407.x);
-                      _405.y = (_406.y<_407.y);
-                      _405.z = (_406.z<_407.z);
-                      _405.w = (_406.w<_407.w);
-                    ushort4 _408;
-                      int4 _409 = make_int4(0, 0, 0, 0);
-                      _408.x = (_390.x<=_409.x);
-                      _408.y = (_390.y<=_409.y);
-                      _408.z = (_390.z<=_409.z);
-                      _408.w = (_390.w<=_409.w);
-                    _404.x = (_405.x&&_408.x);
-                    _404.y = (_405.y&&_408.y);
-                    _404.z = (_405.z&&_408.z);
-                    _404.w = (_405.w&&_408.w);
-                  _397.x = (_398.x||_404.x);
-                  _397.y = (_398.y||_404.y);
-                  _397.z = (_398.z||_404.z);
-                  _397.w = (_398.w||_404.w);
-                int4 _410;
-                  int4 _411 = make_int4(1, 1, 1, 1);
-                  _410.x = (_393.x-_411.x);
-                  _410.y = (_393.y-_411.y);
-                  _410.z = (_393.z-_411.z);
-                  _410.w = (_393.w-_411.w);
-                _396.x = (bool(_397.x)?_393.x:_410.x);
-                _396.y = (bool(_397.y)?_393.y:_410.y);
-                _396.z = (bool(_397.z)?_393.z:_410.z);
-                _396.w = (bool(_397.w)?_393.w:_410.w);
-                int4 _412 = make_int4(24, 24, 24, 24);
-                _389.x = (_396.x%_412.x);
-                _389.y = (_396.y%_412.y);
-                _389.z = (_396.z%_412.z);
-                _389.w = (_396.w%_412.w);
-              int4 _413;
-              ushort4 _414;
-                ushort4 _415;
-                  ushort4 _416;
-                    int4 _417 = make_int4(24, 24, 24, 24);
-                    int4 _418 = make_int4(0, 0, 0, 0);
-                    _416.x = (_417.x>=_418.x);
-                    _416.y = (_417.y>=_418.y);
-                    _416.z = (_417.z>=_418.z);
-                    _416.w = (_417.w>=_418.w);
-                  ushort4 _419;
-                    int4 _420 = make_int4(0, 0, 0, 0);
-                    _419.x = (_389.x>=_420.x);
-                    _419.y = (_389.y>=_420.y);
-                    _419.z = (_389.z>=_420.z);
-                    _419.w = (_389.w>=_420.w);
-                  _415.x = (_416.x&&_419.x);
-                  _415.y = (_416.y&&_419.y);
-                  _415.z = (_416.z&&_419.z);
-                  _415.w = (_416.w&&_419.w);
-                ushort4 _421;
-                  ushort4 _422;
-                    int4 _423 = make_int4(24, 24, 24, 24);
-                    int4 _424 = make_int4(0, 0, 0, 0);
-                    _422.x = (_423.x<_424.x);
-                    _422.y = (_423.y<_424.y);
-                    _422.z = (_423.z<_424.z);
-                    _422.w = (_423.w<_424.w);
-                  ushort4 _425;
-                    int4 _426 = make_int4(0, 0, 0, 0);
-                    _425.x = (_389.x<=_426.x);
-                    _425.y = (_389.y<=_426.y);
-                    _425.z = (_389.z<=_426.z);
-                    _425.w = (_389.w<=_426.w);
-                  _421.x = (_422.x&&_425.x);
-                  _421.y = (_422.y&&_425.y);
-                  _421.z = (_422.z&&_425.z);
-                  _421.w = (_422.w&&_425.w);
-                _414.x = (_415.x||_421.x);
-                _414.y = (_415.y||_421.y);
-                _414.z = (_415.z||_421.z);
-                _414.w = (_415.w||_421.w);
-              int4 _427;
-                int4 _428 = make_int4(24, 24, 24, 24);
-                _427.x = (_389.x+_428.x);
-                _427.y = (_389.y+_428.y);
-                _427.z = (_389.z+_428.z);
-                _427.w = (_389.w+_428.w);
-              _413.x = (bool(_414.x)?_389.x:_427.x);
-              _413.y = (bool(_414.y)?_389.y:_427.y);
-              _413.z = (bool(_414.z)?_389.z:_427.z);
-              _413.w = (bool(_414.w)?_389.w:_427.w);
-              int4 _429 = make_int4(3, 3, 3, 3);
-              _388.x = (_413.x*_429.x);
-              _388.y = (_413.y*_429.y);
-              _388.z = (_413.z*_429.z);
-              _388.w = (_413.w*_429.w);
-            _386.x = (_387.x+_388.x);
-            _386.y = (_387.y+_388.y);
-            _386.z = (_387.z+_388.z);
-            _386.w = (_387.w+_388.w);
-          int4 _430;
-            int4 _431 = make_int4(((((int)threadIdx.x) + 384))+(1*0), ((((int)threadIdx.x) + 384))+(1*1), ((((int)threadIdx.x) + 384))+(1*2), ((((int)threadIdx.x) + 384))+(1*3));
-            int4 _432 = make_int4(3, 3, 3, 3);
-            _430.x = (_431.x%_432.x);
-            _430.y = (_431.y%_432.y);
-            _430.z = (_431.z%_432.z);
-            _430.w = (_431.w%_432.w);
-          int4 _433;
-          ushort4 _434;
-            ushort4 _435;
-              ushort4 _436;
-                int4 _437 = make_int4(3, 3, 3, 3);
-                int4 _438 = make_int4(0, 0, 0, 0);
-                _436.x = (_437.x>=_438.x);
-                _436.y = (_437.y>=_438.y);
-                _436.z = (_437.z>=_438.z);
-                _436.w = (_437.w>=_438.w);
-              ushort4 _439;
-                int4 _440 = make_int4(0, 0, 0, 0);
-                _439.x = (_430.x>=_440.x);
-                _439.y = (_430.y>=_440.y);
-                _439.z = (_430.z>=_440.z);
-                _439.w = (_430.w>=_440.w);
-              _435.x = (_436.x&&_439.x);
-              _435.y = (_436.y&&_439.y);
-              _435.z = (_436.z&&_439.z);
-              _435.w = (_436.w&&_439.w);
-            ushort4 _441;
-              ushort4 _442;
-                int4 _443 = make_int4(3, 3, 3, 3);
-                int4 _444 = make_int4(0, 0, 0, 0);
-                _442.x = (_443.x<_444.x);
-                _442.y = (_443.y<_444.y);
-                _442.z = (_443.z<_444.z);
-                _442.w = (_443.w<_444.w);
-              ushort4 _445;
-                int4 _446 = make_int4(0, 0, 0, 0);
-                _445.x = (_430.x<=_446.x);
-                _445.y = (_430.y<=_446.y);
-                _445.z = (_430.z<=_446.z);
-                _445.w = (_430.w<=_446.w);
-              _441.x = (_442.x&&_445.x);
-              _441.y = (_442.y&&_445.y);
-              _441.z = (_442.z&&_445.z);
-              _441.w = (_442.w&&_445.w);
-            _434.x = (_435.x||_441.x);
-            _434.y = (_435.y||_441.y);
-            _434.z = (_435.z||_441.z);
-            _434.w = (_435.w||_441.w);
-          int4 _447;
-            int4 _448 = make_int4(3, 3, 3, 3);
-            _447.x = (_430.x+_448.x);
-            _447.y = (_430.y+_448.y);
-            _447.z = (_430.z+_448.z);
-            _447.w = (_430.w+_448.w);
-          _433.x = (bool(_434.x)?_430.x:_447.x);
-          _433.y = (bool(_434.y)?_430.y:_447.y);
-          _433.z = (bool(_434.z)?_430.z:_447.z);
-          _433.w = (bool(_434.w)?_430.w:_447.w);
-          _385.x = (_386.x+_433.x);
-          _385.y = (_386.y+_433.y);
-          _385.z = (_386.z+_433.z);
-          _385.w = (_386.w+_433.w);
-        *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1536)) = make_float4(kernel[_385.x],kernel[_385.y],kernel[_385.z],kernel[_385.w]);
-        int4 _449;
-          int4 _450;
-            int4 _451 = make_int4(((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 1792) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 1792) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 1792) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 1792) / 72) * 4608)) + (rc_outer_outer * 72)));
-            int4 _452;
-              int4 _453;
-                int4 _454;
-                  int4 _455 = make_int4((((((int)threadIdx.x) * 4) + 1792))+(1*0), (((((int)threadIdx.x) * 4) + 1792))+(1*1), (((((int)threadIdx.x) * 4) + 1792))+(1*2), (((((int)threadIdx.x) * 4) + 1792))+(1*3));
-                  int4 _456 = make_int4(3, 3, 3, 3);
-                  _454.x = (_455.x%_456.x);
-                  _454.y = (_455.y%_456.y);
-                  _454.z = (_455.z%_456.z);
-                  _454.w = (_455.w%_456.w);
-                int4 _457;
-                  int4 _458 = make_int4((((((int)threadIdx.x) * 4) + 1792))+(1*0), (((((int)threadIdx.x) * 4) + 1792))+(1*1), (((((int)threadIdx.x) * 4) + 1792))+(1*2), (((((int)threadIdx.x) * 4) + 1792))+(1*3));
-                  int4 _459 = make_int4(3, 3, 3, 3);
-                  _457.x = (_458.x/_459.x);
-                  _457.y = (_458.y/_459.y);
-                  _457.z = (_458.z/_459.z);
-                  _457.w = (_458.w/_459.w);
-                int4 _460;
-                ushort4 _461;
-                  ushort4 _462;
-                    ushort4 _463;
-                      int4 _464 = make_int4(3, 3, 3, 3);
-                      int4 _465 = make_int4(0, 0, 0, 0);
-                      _463.x = (_464.x>=_465.x);
-                      _463.y = (_464.y>=_465.y);
-                      _463.z = (_464.z>=_465.z);
-                      _463.w = (_464.w>=_465.w);
-                    ushort4 _466;
-                      int4 _467 = make_int4(0, 0, 0, 0);
-                      _466.x = (_454.x>=_467.x);
-                      _466.y = (_454.y>=_467.y);
-                      _466.z = (_454.z>=_467.z);
-                      _466.w = (_454.w>=_467.w);
-                    _462.x = (_463.x&&_466.x);
-                    _462.y = (_463.y&&_466.y);
-                    _462.z = (_463.z&&_466.z);
-                    _462.w = (_463.w&&_466.w);
-                  ushort4 _468;
-                    ushort4 _469;
-                      int4 _470 = make_int4(3, 3, 3, 3);
-                      int4 _471 = make_int4(0, 0, 0, 0);
-                      _469.x = (_470.x<_471.x);
-                      _469.y = (_470.y<_471.y);
-                      _469.z = (_470.z<_471.z);
-                      _469.w = (_470.w<_471.w);
-                    ushort4 _472;
-                      int4 _473 = make_int4(0, 0, 0, 0);
-                      _472.x = (_454.x<=_473.x);
-                      _472.y = (_454.y<=_473.y);
-                      _472.z = (_454.z<=_473.z);
-                      _472.w = (_454.w<=_473.w);
-                    _468.x = (_469.x&&_472.x);
-                    _468.y = (_469.y&&_472.y);
-                    _468.z = (_469.z&&_472.z);
-                    _468.w = (_469.w&&_472.w);
-                  _461.x = (_462.x||_468.x);
-                  _461.y = (_462.y||_468.y);
-                  _461.z = (_462.z||_468.z);
-                  _461.w = (_462.w||_468.w);
-                int4 _474;
-                  int4 _475 = make_int4(1, 1, 1, 1);
-                  _474.x = (_457.x-_475.x);
-                  _474.y = (_457.y-_475.y);
-                  _474.z = (_457.z-_475.z);
-                  _474.w = (_457.w-_475.w);
-                _460.x = (bool(_461.x)?_457.x:_474.x);
-                _460.y = (bool(_461.y)?_457.y:_474.y);
-                _460.z = (bool(_461.z)?_457.z:_474.z);
-                _460.w = (bool(_461.w)?_457.w:_474.w);
-                int4 _476 = make_int4(24, 24, 24, 24);
-                _453.x = (_460.x%_476.x);
-                _453.y = (_460.y%_476.y);
-                _453.z = (_460.z%_476.z);
-                _453.w = (_460.w%_476.w);
-              int4 _477;
-              ushort4 _478;
-                ushort4 _479;
-                  ushort4 _480;
-                    int4 _481 = make_int4(24, 24, 24, 24);
-                    int4 _482 = make_int4(0, 0, 0, 0);
-                    _480.x = (_481.x>=_482.x);
-                    _480.y = (_481.y>=_482.y);
-                    _480.z = (_481.z>=_482.z);
-                    _480.w = (_481.w>=_482.w);
-                  ushort4 _483;
-                    int4 _484 = make_int4(0, 0, 0, 0);
-                    _483.x = (_453.x>=_484.x);
-                    _483.y = (_453.y>=_484.y);
-                    _483.z = (_453.z>=_484.z);
-                    _483.w = (_453.w>=_484.w);
-                  _479.x = (_480.x&&_483.x);
-                  _479.y = (_480.y&&_483.y);
-                  _479.z = (_480.z&&_483.z);
-                  _479.w = (_480.w&&_483.w);
-                ushort4 _485;
-                  ushort4 _486;
-                    int4 _487 = make_int4(24, 24, 24, 24);
-                    int4 _488 = make_int4(0, 0, 0, 0);
-                    _486.x = (_487.x<_488.x);
-                    _486.y = (_487.y<_488.y);
-                    _486.z = (_487.z<_488.z);
-                    _486.w = (_487.w<_488.w);
-                  ushort4 _489;
-                    int4 _490 = make_int4(0, 0, 0, 0);
-                    _489.x = (_453.x<=_490.x);
-                    _489.y = (_453.y<=_490.y);
-                    _489.z = (_453.z<=_490.z);
-                    _489.w = (_453.w<=_490.w);
-                  _485.x = (_486.x&&_489.x);
-                  _485.y = (_486.y&&_489.y);
-                  _485.z = (_486.z&&_489.z);
-                  _485.w = (_486.w&&_489.w);
-                _478.x = (_479.x||_485.x);
-                _478.y = (_479.y||_485.y);
-                _478.z = (_479.z||_485.z);
-                _478.w = (_479.w||_485.w);
-              int4 _491;
-                int4 _492 = make_int4(24, 24, 24, 24);
-                _491.x = (_453.x+_492.x);
-                _491.y = (_453.y+_492.y);
-                _491.z = (_453.z+_492.z);
-                _491.w = (_453.w+_492.w);
-              _477.x = (bool(_478.x)?_453.x:_491.x);
-              _477.y = (bool(_478.y)?_453.y:_491.y);
-              _477.z = (bool(_478.z)?_453.z:_491.z);
-              _477.w = (bool(_478.w)?_453.w:_491.w);
-              int4 _493 = make_int4(3, 3, 3, 3);
-              _452.x = (_477.x*_493.x);
-              _452.y = (_477.y*_493.y);
-              _452.z = (_477.z*_493.z);
-              _452.w = (_477.w*_493.w);
-            _450.x = (_451.x+_452.x);
-            _450.y = (_451.y+_452.y);
-            _450.z = (_451.z+_452.z);
-            _450.w = (_451.w+_452.w);
-          int4 _494;
-            int4 _495 = make_int4(((((int)threadIdx.x) + 448))+(1*0), ((((int)threadIdx.x) + 448))+(1*1), ((((int)threadIdx.x) + 448))+(1*2), ((((int)threadIdx.x) + 448))+(1*3));
-            int4 _496 = make_int4(3, 3, 3, 3);
-            _494.x = (_495.x%_496.x);
-            _494.y = (_495.y%_496.y);
-            _494.z = (_495.z%_496.z);
-            _494.w = (_495.w%_496.w);
-          int4 _497;
-          ushort4 _498;
-            ushort4 _499;
-              ushort4 _500;
-                int4 _501 = make_int4(3, 3, 3, 3);
-                int4 _502 = make_int4(0, 0, 0, 0);
-                _500.x = (_501.x>=_502.x);
-                _500.y = (_501.y>=_502.y);
-                _500.z = (_501.z>=_502.z);
-                _500.w = (_501.w>=_502.w);
-              ushort4 _503;
-                int4 _504 = make_int4(0, 0, 0, 0);
-                _503.x = (_494.x>=_504.x);
-                _503.y = (_494.y>=_504.y);
-                _503.z = (_494.z>=_504.z);
-                _503.w = (_494.w>=_504.w);
-              _499.x = (_500.x&&_503.x);
-              _499.y = (_500.y&&_503.y);
-              _499.z = (_500.z&&_503.z);
-              _499.w = (_500.w&&_503.w);
-            ushort4 _505;
-              ushort4 _506;
-                int4 _507 = make_int4(3, 3, 3, 3);
-                int4 _508 = make_int4(0, 0, 0, 0);
-                _506.x = (_507.x<_508.x);
-                _506.y = (_507.y<_508.y);
-                _506.z = (_507.z<_508.z);
-                _506.w = (_507.w<_508.w);
-              ushort4 _509;
-                int4 _510 = make_int4(0, 0, 0, 0);
-                _509.x = (_494.x<=_510.x);
-                _509.y = (_494.y<=_510.y);
-                _509.z = (_494.z<=_510.z);
-                _509.w = (_494.w<=_510.w);
-              _505.x = (_506.x&&_509.x);
-              _505.y = (_506.y&&_509.y);
-              _505.z = (_506.z&&_509.z);
-              _505.w = (_506.w&&_509.w);
-            _498.x = (_499.x||_505.x);
-            _498.y = (_499.y||_505.y);
-            _498.z = (_499.z||_505.z);
-            _498.w = (_499.w||_505.w);
-          int4 _511;
-            int4 _512 = make_int4(3, 3, 3, 3);
-            _511.x = (_494.x+_512.x);
-            _511.y = (_494.y+_512.y);
-            _511.z = (_494.z+_512.z);
-            _511.w = (_494.w+_512.w);
-          _497.x = (bool(_498.x)?_494.x:_511.x);
-          _497.y = (bool(_498.y)?_494.y:_511.y);
-          _497.z = (bool(_498.z)?_494.z:_511.z);
-          _497.w = (bool(_498.w)?_494.w:_511.w);
-          _449.x = (_450.x+_497.x);
-          _449.y = (_450.y+_497.y);
-          _449.z = (_450.z+_497.z);
-          _449.w = (_450.w+_497.w);
-        *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1792)) = make_float4(kernel[_449.x],kernel[_449.y],kernel[_449.z],kernel[_449.w]);
-        int4 _513;
-          int4 _514;
-            int4 _515 = make_int4(((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 2048) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 2048) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 2048) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 2048) / 72) * 4608)) + (rc_outer_outer * 72)));
-            int4 _516;
-              int4 _517;
-                int4 _518;
-                  int4 _519 = make_int4((((((int)threadIdx.x) * 4) + 2048))+(1*0), (((((int)threadIdx.x) * 4) + 2048))+(1*1), (((((int)threadIdx.x) * 4) + 2048))+(1*2), (((((int)threadIdx.x) * 4) + 2048))+(1*3));
-                  int4 _520 = make_int4(3, 3, 3, 3);
-                  _518.x = (_519.x%_520.x);
-                  _518.y = (_519.y%_520.y);
-                  _518.z = (_519.z%_520.z);
-                  _518.w = (_519.w%_520.w);
-                int4 _521;
-                  int4 _522 = make_int4((((((int)threadIdx.x) * 4) + 2048))+(1*0), (((((int)threadIdx.x) * 4) + 2048))+(1*1), (((((int)threadIdx.x) * 4) + 2048))+(1*2), (((((int)threadIdx.x) * 4) + 2048))+(1*3));
-                  int4 _523 = make_int4(3, 3, 3, 3);
-                  _521.x = (_522.x/_523.x);
-                  _521.y = (_522.y/_523.y);
-                  _521.z = (_522.z/_523.z);
-                  _521.w = (_522.w/_523.w);
-                int4 _524;
-                ushort4 _525;
-                  ushort4 _526;
-                    ushort4 _527;
-                      int4 _528 = make_int4(3, 3, 3, 3);
-                      int4 _529 = make_int4(0, 0, 0, 0);
-                      _527.x = (_528.x>=_529.x);
-                      _527.y = (_528.y>=_529.y);
-                      _527.z = (_528.z>=_529.z);
-                      _527.w = (_528.w>=_529.w);
-                    ushort4 _530;
-                      int4 _531 = make_int4(0, 0, 0, 0);
-                      _530.x = (_518.x>=_531.x);
-                      _530.y = (_518.y>=_531.y);
-                      _530.z = (_518.z>=_531.z);
-                      _530.w = (_518.w>=_531.w);
-                    _526.x = (_527.x&&_530.x);
-                    _526.y = (_527.y&&_530.y);
-                    _526.z = (_527.z&&_530.z);
-                    _526.w = (_527.w&&_530.w);
-                  ushort4 _532;
-                    ushort4 _533;
-                      int4 _534 = make_int4(3, 3, 3, 3);
-                      int4 _535 = make_int4(0, 0, 0, 0);
-                      _533.x = (_534.x<_535.x);
-                      _533.y = (_534.y<_535.y);
-                      _533.z = (_534.z<_535.z);
-                      _533.w = (_534.w<_535.w);
-                    ushort4 _536;
-                      int4 _537 = make_int4(0, 0, 0, 0);
-                      _536.x = (_518.x<=_537.x);
-                      _536.y = (_518.y<=_537.y);
-                      _536.z = (_518.z<=_537.z);
-                      _536.w = (_518.w<=_537.w);
-                    _532.x = (_533.x&&_536.x);
-                    _532.y = (_533.y&&_536.y);
-                    _532.z = (_533.z&&_536.z);
-                    _532.w = (_533.w&&_536.w);
-                  _525.x = (_526.x||_532.x);
-                  _525.y = (_526.y||_532.y);
-                  _525.z = (_526.z||_532.z);
-                  _525.w = (_526.w||_532.w);
-                int4 _538;
-                  int4 _539 = make_int4(1, 1, 1, 1);
-                  _538.x = (_521.x-_539.x);
-                  _538.y = (_521.y-_539.y);
-                  _538.z = (_521.z-_539.z);
-                  _538.w = (_521.w-_539.w);
-                _524.x = (bool(_525.x)?_521.x:_538.x);
-                _524.y = (bool(_525.y)?_521.y:_538.y);
-                _524.z = (bool(_525.z)?_521.z:_538.z);
-                _524.w = (bool(_525.w)?_521.w:_538.w);
-                int4 _540 = make_int4(24, 24, 24, 24);
-                _517.x = (_524.x%_540.x);
-                _517.y = (_524.y%_540.y);
-                _517.z = (_524.z%_540.z);
-                _517.w = (_524.w%_540.w);
-              int4 _541;
-              ushort4 _542;
-                ushort4 _543;
-                  ushort4 _544;
-                    int4 _545 = make_int4(24, 24, 24, 24);
-                    int4 _546 = make_int4(0, 0, 0, 0);
-                    _544.x = (_545.x>=_546.x);
-                    _544.y = (_545.y>=_546.y);
-                    _544.z = (_545.z>=_546.z);
-                    _544.w = (_545.w>=_546.w);
-                  ushort4 _547;
-                    int4 _548 = make_int4(0, 0, 0, 0);
-                    _547.x = (_517.x>=_548.x);
-                    _547.y = (_517.y>=_548.y);
-                    _547.z = (_517.z>=_548.z);
-                    _547.w = (_517.w>=_548.w);
-                  _543.x = (_544.x&&_547.x);
-                  _543.y = (_544.y&&_547.y);
-                  _543.z = (_544.z&&_547.z);
-                  _543.w = (_544.w&&_547.w);
-                ushort4 _549;
-                  ushort4 _550;
-                    int4 _551 = make_int4(24, 24, 24, 24);
-                    int4 _552 = make_int4(0, 0, 0, 0);
-                    _550.x = (_551.x<_552.x);
-                    _550.y = (_551.y<_552.y);
-                    _550.z = (_551.z<_552.z);
-                    _550.w = (_551.w<_552.w);
-                  ushort4 _553;
-                    int4 _554 = make_int4(0, 0, 0, 0);
-                    _553.x = (_517.x<=_554.x);
-                    _553.y = (_517.y<=_554.y);
-                    _553.z = (_517.z<=_554.z);
-                    _553.w = (_517.w<=_554.w);
-                  _549.x = (_550.x&&_553.x);
-                  _549.y = (_550.y&&_553.y);
-                  _549.z = (_550.z&&_553.z);
-                  _549.w = (_550.w&&_553.w);
-                _542.x = (_543.x||_549.x);
-                _542.y = (_543.y||_549.y);
-                _542.z = (_543.z||_549.z);
-                _542.w = (_543.w||_549.w);
-              int4 _555;
-                int4 _556 = make_int4(24, 24, 24, 24);
-                _555.x = (_517.x+_556.x);
-                _555.y = (_517.y+_556.y);
-                _555.z = (_517.z+_556.z);
-                _555.w = (_517.w+_556.w);
-              _541.x = (bool(_542.x)?_517.x:_555.x);
-              _541.y = (bool(_542.y)?_517.y:_555.y);
-              _541.z = (bool(_542.z)?_517.z:_555.z);
-              _541.w = (bool(_542.w)?_517.w:_555.w);
-              int4 _557 = make_int4(3, 3, 3, 3);
-              _516.x = (_541.x*_557.x);
-              _516.y = (_541.y*_557.y);
-              _516.z = (_541.z*_557.z);
-              _516.w = (_541.w*_557.w);
-            _514.x = (_515.x+_516.x);
-            _514.y = (_515.y+_516.y);
-            _514.z = (_515.z+_516.z);
-            _514.w = (_515.w+_516.w);
-          int4 _558;
-            int4 _559 = make_int4(((((int)threadIdx.x) + 512))+(1*0), ((((int)threadIdx.x) + 512))+(1*1), ((((int)threadIdx.x) + 512))+(1*2), ((((int)threadIdx.x) + 512))+(1*3));
-            int4 _560 = make_int4(3, 3, 3, 3);
-            _558.x = (_559.x%_560.x);
-            _558.y = (_559.y%_560.y);
-            _558.z = (_559.z%_560.z);
-            _558.w = (_559.w%_560.w);
-          int4 _561;
-          ushort4 _562;
-            ushort4 _563;
-              ushort4 _564;
-                int4 _565 = make_int4(3, 3, 3, 3);
-                int4 _566 = make_int4(0, 0, 0, 0);
-                _564.x = (_565.x>=_566.x);
-                _564.y = (_565.y>=_566.y);
-                _564.z = (_565.z>=_566.z);
-                _564.w = (_565.w>=_566.w);
-              ushort4 _567;
-                int4 _568 = make_int4(0, 0, 0, 0);
-                _567.x = (_558.x>=_568.x);
-                _567.y = (_558.y>=_568.y);
-                _567.z = (_558.z>=_568.z);
-                _567.w = (_558.w>=_568.w);
-              _563.x = (_564.x&&_567.x);
-              _563.y = (_564.y&&_567.y);
-              _563.z = (_564.z&&_567.z);
-              _563.w = (_564.w&&_567.w);
-            ushort4 _569;
-              ushort4 _570;
-                int4 _571 = make_int4(3, 3, 3, 3);
-                int4 _572 = make_int4(0, 0, 0, 0);
-                _570.x = (_571.x<_572.x);
-                _570.y = (_571.y<_572.y);
-                _570.z = (_571.z<_572.z);
-                _570.w = (_571.w<_572.w);
-              ushort4 _573;
-                int4 _574 = make_int4(0, 0, 0, 0);
-                _573.x = (_558.x<=_574.x);
-                _573.y = (_558.y<=_574.y);
-                _573.z = (_558.z<=_574.z);
-                _573.w = (_558.w<=_574.w);
-              _569.x = (_570.x&&_573.x);
-              _569.y = (_570.y&&_573.y);
-              _569.z = (_570.z&&_573.z);
-              _569.w = (_570.w&&_573.w);
-            _562.x = (_563.x||_569.x);
-            _562.y = (_563.y||_569.y);
-            _562.z = (_563.z||_569.z);
-            _562.w = (_563.w||_569.w);
-          int4 _575;
-            int4 _576 = make_int4(3, 3, 3, 3);
-            _575.x = (_558.x+_576.x);
-            _575.y = (_558.y+_576.y);
-            _575.z = (_558.z+_576.z);
-            _575.w = (_558.w+_576.w);
-          _561.x = (bool(_562.x)?_558.x:_575.x);
-          _561.y = (bool(_562.y)?_558.y:_575.y);
-          _561.z = (bool(_562.z)?_558.z:_575.z);
-          _561.w = (bool(_562.w)?_558.w:_575.w);
-          _513.x = (_514.x+_561.x);
-          _513.y = (_514.y+_561.y);
-          _513.z = (_514.z+_561.z);
-          _513.w = (_514.w+_561.w);
-        *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 2048)) = make_float4(kernel[_513.x],kernel[_513.y],kernel[_513.z],kernel[_513.w]);
-        int4 _577;
-          int4 _578;
-            int4 _579 = make_int4((((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 18) * 4608)) + (rc_outer_outer * 72)) + 147456), (((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 18) * 4608)) + (rc_outer_outer * 72)) + 147456), (((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 18) * 4608)) + (rc_outer_outer * 72)) + 147456), (((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 18) * 4608)) + (rc_outer_outer * 72)) + 147456));
-            int4 _580;
-              int4 _581;
-                int4 _582;
-                  int4 _583 = make_int4((((((int)threadIdx.x) * 4) + 2304))+(1*0), (((((int)threadIdx.x) * 4) + 2304))+(1*1), (((((int)threadIdx.x) * 4) + 2304))+(1*2), (((((int)threadIdx.x) * 4) + 2304))+(1*3));
-                  int4 _584 = make_int4(3, 3, 3, 3);
-                  _582.x = (_583.x%_584.x);
-                  _582.y = (_583.y%_584.y);
-                  _582.z = (_583.z%_584.z);
-                  _582.w = (_583.w%_584.w);
-                int4 _585;
-                  int4 _586 = make_int4((((((int)threadIdx.x) * 4) + 2304))+(1*0), (((((int)threadIdx.x) * 4) + 2304))+(1*1), (((((int)threadIdx.x) * 4) + 2304))+(1*2), (((((int)threadIdx.x) * 4) + 2304))+(1*3));
-                  int4 _587 = make_int4(3, 3, 3, 3);
-                  _585.x = (_586.x/_587.x);
-                  _585.y = (_586.y/_587.y);
-                  _585.z = (_586.z/_587.z);
-                  _585.w = (_586.w/_587.w);
-                int4 _588;
-                ushort4 _589;
-                  ushort4 _590;
-                    ushort4 _591;
-                      int4 _592 = make_int4(3, 3, 3, 3);
-                      int4 _593 = make_int4(0, 0, 0, 0);
-                      _591.x = (_592.x>=_593.x);
-                      _591.y = (_592.y>=_593.y);
-                      _591.z = (_592.z>=_593.z);
-                      _591.w = (_592.w>=_593.w);
-                    ushort4 _594;
-                      int4 _595 = make_int4(0, 0, 0, 0);
-                      _594.x = (_582.x>=_595.x);
-                      _594.y = (_582.y>=_595.y);
-                      _594.z = (_582.z>=_595.z);
-                      _594.w = (_582.w>=_595.w);
-                    _590.x = (_591.x&&_594.x);
-                    _590.y = (_591.y&&_594.y);
-                    _590.z = (_591.z&&_594.z);
-                    _590.w = (_591.w&&_594.w);
-                  ushort4 _596;
-                    ushort4 _597;
-                      int4 _598 = make_int4(3, 3, 3, 3);
-                      int4 _599 = make_int4(0, 0, 0, 0);
-                      _597.x = (_598.x<_599.x);
-                      _597.y = (_598.y<_599.y);
-                      _597.z = (_598.z<_599.z);
-                      _597.w = (_598.w<_599.w);
-                    ushort4 _600;
-                      int4 _601 = make_int4(0, 0, 0, 0);
-                      _600.x = (_582.x<=_601.x);
-                      _600.y = (_582.y<=_601.y);
-                      _600.z = (_582.z<=_601.z);
-                      _600.w = (_582.w<=_601.w);
-                    _596.x = (_597.x&&_600.x);
-                    _596.y = (_597.y&&_600.y);
-                    _596.z = (_597.z&&_600.z);
-                    _596.w = (_597.w&&_600.w);
-                  _589.x = (_590.x||_596.x);
-                  _589.y = (_590.y||_596.y);
-                  _589.z = (_590.z||_596.z);
-                  _589.w = (_590.w||_596.w);
-                int4 _602;
-                  int4 _603 = make_int4(1, 1, 1, 1);
-                  _602.x = (_585.x-_603.x);
-                  _602.y = (_585.y-_603.y);
-                  _602.z = (_585.z-_603.z);
-                  _602.w = (_585.w-_603.w);
-                _588.x = (bool(_589.x)?_585.x:_602.x);
-                _588.y = (bool(_589.y)?_585.y:_602.y);
-                _588.z = (bool(_589.z)?_585.z:_602.z);
-                _588.w = (bool(_589.w)?_585.w:_602.w);
-                int4 _604 = make_int4(24, 24, 24, 24);
-                _581.x = (_588.x%_604.x);
-                _581.y = (_588.y%_604.y);
-                _581.z = (_588.z%_604.z);
-                _581.w = (_588.w%_604.w);
-              int4 _605;
-              ushort4 _606;
-                ushort4 _607;
-                  ushort4 _608;
-                    int4 _609 = make_int4(24, 24, 24, 24);
-                    int4 _610 = make_int4(0, 0, 0, 0);
-                    _608.x = (_609.x>=_610.x);
-                    _608.y = (_609.y>=_610.y);
-                    _608.z = (_609.z>=_610.z);
-                    _608.w = (_609.w>=_610.w);
-                  ushort4 _611;
-                    int4 _612 = make_int4(0, 0, 0, 0);
-                    _611.x = (_581.x>=_612.x);
-                    _611.y = (_581.y>=_612.y);
-                    _611.z = (_581.z>=_612.z);
-                    _611.w = (_581.w>=_612.w);
-                  _607.x = (_608.x&&_611.x);
-                  _607.y = (_608.y&&_611.y);
-                  _607.z = (_608.z&&_611.z);
-                  _607.w = (_608.w&&_611.w);
-                ushort4 _613;
-                  ushort4 _614;
-                    int4 _615 = make_int4(24, 24, 24, 24);
-                    int4 _616 = make_int4(0, 0, 0, 0);
-                    _614.x = (_615.x<_616.x);
-                    _614.y = (_615.y<_616.y);
-                    _614.z = (_615.z<_616.z);
-                    _614.w = (_615.w<_616.w);
-                  ushort4 _617;
-                    int4 _618 = make_int4(0, 0, 0, 0);
-                    _617.x = (_581.x<=_618.x);
-                    _617.y = (_581.y<=_618.y);
-                    _617.z = (_581.z<=_618.z);
-                    _617.w = (_581.w<=_618.w);
-                  _613.x = (_614.x&&_617.x);
-                  _613.y = (_614.y&&_617.y);
-                  _613.z = (_614.z&&_617.z);
-                  _613.w = (_614.w&&_617.w);
-                _606.x = (_607.x||_613.x);
-                _606.y = (_607.y||_613.y);
-                _606.z = (_607.z||_613.z);
-                _606.w = (_607.w||_613.w);
-              int4 _619;
-                int4 _620 = make_int4(24, 24, 24, 24);
-                _619.x = (_581.x+_620.x);
-                _619.y = (_581.y+_620.y);
-                _619.z = (_581.z+_620.z);
-                _619.w = (_581.w+_620.w);
-              _605.x = (bool(_606.x)?_581.x:_619.x);
-              _605.y = (bool(_606.y)?_581.y:_619.y);
-              _605.z = (bool(_606.z)?_581.z:_619.z);
-              _605.w = (bool(_606.w)?_581.w:_619.w);
-              int4 _621 = make_int4(3, 3, 3, 3);
-              _580.x = (_605.x*_621.x);
-              _580.y = (_605.y*_621.y);
-              _580.z = (_605.z*_621.z);
-              _580.w = (_605.w*_621.w);
-            _578.x = (_579.x+_580.x);
-            _578.y = (_579.y+_580.y);
-            _578.z = (_579.z+_580.z);
-            _578.w = (_579.w+_580.w);
-          int4 _622;
-            int4 _623 = make_int4(((((int)threadIdx.x) + 576))+(1*0), ((((int)threadIdx.x) + 576))+(1*1), ((((int)threadIdx.x) + 576))+(1*2), ((((int)threadIdx.x) + 576))+(1*3));
-            int4 _624 = make_int4(3, 3, 3, 3);
-            _622.x = (_623.x%_624.x);
-            _622.y = (_623.y%_624.y);
-            _622.z = (_623.z%_624.z);
-            _622.w = (_623.w%_624.w);
-          int4 _625;
-          ushort4 _626;
-            ushort4 _627;
-              ushort4 _628;
-                int4 _629 = make_int4(3, 3, 3, 3);
-                int4 _630 = make_int4(0, 0, 0, 0);
-                _628.x = (_629.x>=_630.x);
-                _628.y = (_629.y>=_630.y);
-                _628.z = (_629.z>=_630.z);
-                _628.w = (_629.w>=_630.w);
-              ushort4 _631;
-                int4 _632 = make_int4(0, 0, 0, 0);
-                _631.x = (_622.x>=_632.x);
-                _631.y = (_622.y>=_632.y);
-                _631.z = (_622.z>=_632.z);
-                _631.w = (_622.w>=_632.w);
-              _627.x = (_628.x&&_631.x);
-              _627.y = (_628.y&&_631.y);
-              _627.z = (_628.z&&_631.z);
-              _627.w = (_628.w&&_631.w);
-            ushort4 _633;
-              ushort4 _634;
-                int4 _635 = make_int4(3, 3, 3, 3);
-                int4 _636 = make_int4(0, 0, 0, 0);
-                _634.x = (_635.x<_636.x);
-                _634.y = (_635.y<_636.y);
-                _634.z = (_635.z<_636.z);
-                _634.w = (_635.w<_636.w);
-              ushort4 _637;
-                int4 _638 = make_int4(0, 0, 0, 0);
-                _637.x = (_622.x<=_638.x);
-                _637.y = (_622.y<=_638.y);
-                _637.z = (_622.z<=_638.z);
-                _637.w = (_622.w<=_638.w);
-              _633.x = (_634.x&&_637.x);
-              _633.y = (_634.y&&_637.y);
-              _633.z = (_634.z&&_637.z);
-              _633.w = (_634.w&&_637.w);
-            _626.x = (_627.x||_633.x);
-            _626.y = (_627.y||_633.y);
-            _626.z = (_627.z||_633.z);
-            _626.w = (_627.w||_633.w);
-          int4 _639;
-            int4 _640 = make_int4(3, 3, 3, 3);
-            _639.x = (_622.x+_640.x);
-            _639.y = (_622.y+_640.y);
-            _639.z = (_622.z+_640.z);
-            _639.w = (_622.w+_640.w);
-          _625.x = (bool(_626.x)?_622.x:_639.x);
-          _625.y = (bool(_626.y)?_622.y:_639.y);
-          _625.z = (bool(_626.z)?_622.z:_639.z);
-          _625.w = (bool(_626.w)?_622.w:_639.w);
-          _577.x = (_578.x+_625.x);
-          _577.y = (_578.y+_625.y);
-          _577.z = (_578.z+_625.z);
-          _577.w = (_578.w+_625.w);
-        *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 2304)) = make_float4(kernel[_577.x],kernel[_577.y],kernel[_577.z],kernel[_577.w]);
-        int4 _641;
-          int4 _642;
-            int4 _643 = make_int4(((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 2560) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 2560) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 2560) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 2560) / 72) * 4608)) + (rc_outer_outer * 72)));
-            int4 _644;
-              int4 _645;
-                int4 _646;
-                  int4 _647 = make_int4((((((int)threadIdx.x) * 4) + 2560))+(1*0), (((((int)threadIdx.x) * 4) + 2560))+(1*1), (((((int)threadIdx.x) * 4) + 2560))+(1*2), (((((int)threadIdx.x) * 4) + 2560))+(1*3));
-                  int4 _648 = make_int4(3, 3, 3, 3);
-                  _646.x = (_647.x%_648.x);
-                  _646.y = (_647.y%_648.y);
-                  _646.z = (_647.z%_648.z);
-                  _646.w = (_647.w%_648.w);
-                int4 _649;
-                  int4 _650 = make_int4((((((int)threadIdx.x) * 4) + 2560))+(1*0), (((((int)threadIdx.x) * 4) + 2560))+(1*1), (((((int)threadIdx.x) * 4) + 2560))+(1*2), (((((int)threadIdx.x) * 4) + 2560))+(1*3));
-                  int4 _651 = make_int4(3, 3, 3, 3);
-                  _649.x = (_650.x/_651.x);
-                  _649.y = (_650.y/_651.y);
-                  _649.z = (_650.z/_651.z);
-                  _649.w = (_650.w/_651.w);
-                int4 _652;
-                ushort4 _653;
-                  ushort4 _654;
-                    ushort4 _655;
-                      int4 _656 = make_int4(3, 3, 3, 3);
-                      int4 _657 = make_int4(0, 0, 0, 0);
-                      _655.x = (_656.x>=_657.x);
-                      _655.y = (_656.y>=_657.y);
-                      _655.z = (_656.z>=_657.z);
-                      _655.w = (_656.w>=_657.w);
-                    ushort4 _658;
-                      int4 _659 = make_int4(0, 0, 0, 0);
-                      _658.x = (_646.x>=_659.x);
-                      _658.y = (_646.y>=_659.y);
-                      _658.z = (_646.z>=_659.z);
-                      _658.w = (_646.w>=_659.w);
-                    _654.x = (_655.x&&_658.x);
-                    _654.y = (_655.y&&_658.y);
-                    _654.z = (_655.z&&_658.z);
-                    _654.w = (_655.w&&_658.w);
-                  ushort4 _660;
-                    ushort4 _661;
-                      int4 _662 = make_int4(3, 3, 3, 3);
-                      int4 _663 = make_int4(0, 0, 0, 0);
-                      _661.x = (_662.x<_663.x);
-                      _661.y = (_662.y<_663.y);
-                      _661.z = (_662.z<_663.z);
-                      _661.w = (_662.w<_663.w);
-                    ushort4 _664;
-                      int4 _665 = make_int4(0, 0, 0, 0);
-                      _664.x = (_646.x<=_665.x);
-                      _664.y = (_646.y<=_665.y);
-                      _664.z = (_646.z<=_665.z);
-                      _664.w = (_646.w<=_665.w);
-                    _660.x = (_661.x&&_664.x);
-                    _660.y = (_661.y&&_664.y);
-                    _660.z = (_661.z&&_664.z);
-                    _660.w = (_661.w&&_664.w);
-                  _653.x = (_654.x||_660.x);
-                  _653.y = (_654.y||_660.y);
-                  _653.z = (_654.z||_660.z);
-                  _653.w = (_654.w||_660.w);
-                int4 _666;
-                  int4 _667 = make_int4(1, 1, 1, 1);
-                  _666.x = (_649.x-_667.x);
-                  _666.y = (_649.y-_667.y);
-                  _666.z = (_649.z-_667.z);
-                  _666.w = (_649.w-_667.w);
-                _652.x = (bool(_653.x)?_649.x:_666.x);
-                _652.y = (bool(_653.y)?_649.y:_666.y);
-                _652.z = (bool(_653.z)?_649.z:_666.z);
-                _652.w = (bool(_653.w)?_649.w:_666.w);
-                int4 _668 = make_int4(24, 24, 24, 24);
-                _645.x = (_652.x%_668.x);
-                _645.y = (_652.y%_668.y);
-                _645.z = (_652.z%_668.z);
-                _645.w = (_652.w%_668.w);
-              int4 _669;
-              ushort4 _670;
-                ushort4 _671;
-                  ushort4 _672;
-                    int4 _673 = make_int4(24, 24, 24, 24);
-                    int4 _674 = make_int4(0, 0, 0, 0);
-                    _672.x = (_673.x>=_674.x);
-                    _672.y = (_673.y>=_674.y);
-                    _672.z = (_673.z>=_674.z);
-                    _672.w = (_673.w>=_674.w);
-                  ushort4 _675;
-                    int4 _676 = make_int4(0, 0, 0, 0);
-                    _675.x = (_645.x>=_676.x);
-                    _675.y = (_645.y>=_676.y);
-                    _675.z = (_645.z>=_676.z);
-                    _675.w = (_645.w>=_676.w);
-                  _671.x = (_672.x&&_675.x);
-                  _671.y = (_672.y&&_675.y);
-                  _671.z = (_672.z&&_675.z);
-                  _671.w = (_672.w&&_675.w);
-                ushort4 _677;
-                  ushort4 _678;
-                    int4 _679 = make_int4(24, 24, 24, 24);
-                    int4 _680 = make_int4(0, 0, 0, 0);
-                    _678.x = (_679.x<_680.x);
-                    _678.y = (_679.y<_680.y);
-                    _678.z = (_679.z<_680.z);
-                    _678.w = (_679.w<_680.w);
-                  ushort4 _681;
-                    int4 _682 = make_int4(0, 0, 0, 0);
-                    _681.x = (_645.x<=_682.x);
-                    _681.y = (_645.y<=_682.y);
-                    _681.z = (_645.z<=_682.z);
-                    _681.w = (_645.w<=_682.w);
-                  _677.x = (_678.x&&_681.x);
-                  _677.y = (_678.y&&_681.y);
-                  _677.z = (_678.z&&_681.z);
-                  _677.w = (_678.w&&_681.w);
-                _670.x = (_671.x||_677.x);
-                _670.y = (_671.y||_677.y);
-                _670.z = (_671.z||_677.z);
-                _670.w = (_671.w||_677.w);
-              int4 _683;
-                int4 _684 = make_int4(24, 24, 24, 24);
-                _683.x = (_645.x+_684.x);
-                _683.y = (_645.y+_684.y);
-                _683.z = (_645.z+_684.z);
-                _683.w = (_645.w+_684.w);
-              _669.x = (bool(_670.x)?_645.x:_683.x);
-              _669.y = (bool(_670.y)?_645.y:_683.y);
-              _669.z = (bool(_670.z)?_645.z:_683.z);
-              _669.w = (bool(_670.w)?_645.w:_683.w);
-              int4 _685 = make_int4(3, 3, 3, 3);
-              _644.x = (_669.x*_685.x);
-              _644.y = (_669.y*_685.y);
-              _644.z = (_669.z*_685.z);
-              _644.w = (_669.w*_685.w);
-            _642.x = (_643.x+_644.x);
-            _642.y = (_643.y+_644.y);
-            _642.z = (_643.z+_644.z);
-            _642.w = (_643.w+_644.w);
-          int4 _686;
-            int4 _687 = make_int4(((((int)threadIdx.x) + 640))+(1*0), ((((int)threadIdx.x) + 640))+(1*1), ((((int)threadIdx.x) + 640))+(1*2), ((((int)threadIdx.x) + 640))+(1*3));
-            int4 _688 = make_int4(3, 3, 3, 3);
-            _686.x = (_687.x%_688.x);
-            _686.y = (_687.y%_688.y);
-            _686.z = (_687.z%_688.z);
-            _686.w = (_687.w%_688.w);
-          int4 _689;
-          ushort4 _690;
-            ushort4 _691;
-              ushort4 _692;
-                int4 _693 = make_int4(3, 3, 3, 3);
-                int4 _694 = make_int4(0, 0, 0, 0);
-                _692.x = (_693.x>=_694.x);
-                _692.y = (_693.y>=_694.y);
-                _692.z = (_693.z>=_694.z);
-                _692.w = (_693.w>=_694.w);
-              ushort4 _695;
-                int4 _696 = make_int4(0, 0, 0, 0);
-                _695.x = (_686.x>=_696.x);
-                _695.y = (_686.y>=_696.y);
-                _695.z = (_686.z>=_696.z);
-                _695.w = (_686.w>=_696.w);
-              _691.x = (_692.x&&_695.x);
-              _691.y = (_692.y&&_695.y);
-              _691.z = (_692.z&&_695.z);
-              _691.w = (_692.w&&_695.w);
-            ushort4 _697;
-              ushort4 _698;
-                int4 _699 = make_int4(3, 3, 3, 3);
-                int4 _700 = make_int4(0, 0, 0, 0);
-                _698.x = (_699.x<_700.x);
-                _698.y = (_699.y<_700.y);
-                _698.z = (_699.z<_700.z);
-                _698.w = (_699.w<_700.w);
-              ushort4 _701;
-                int4 _702 = make_int4(0, 0, 0, 0);
-                _701.x = (_686.x<=_702.x);
-                _701.y = (_686.y<=_702.y);
-                _701.z = (_686.z<=_702.z);
-                _701.w = (_686.w<=_702.w);
-              _697.x = (_698.x&&_701.x);
-              _697.y = (_698.y&&_701.y);
-              _697.z = (_698.z&&_701.z);
-              _697.w = (_698.w&&_701.w);
-            _690.x = (_691.x||_697.x);
-            _690.y = (_691.y||_697.y);
-            _690.z = (_691.z||_697.z);
-            _690.w = (_691.w||_697.w);
-          int4 _703;
-            int4 _704 = make_int4(3, 3, 3, 3);
-            _703.x = (_686.x+_704.x);
-            _703.y = (_686.y+_704.y);
-            _703.z = (_686.z+_704.z);
-            _703.w = (_686.w+_704.w);
-          _689.x = (bool(_690.x)?_686.x:_703.x);
-          _689.y = (bool(_690.y)?_686.y:_703.y);
-          _689.z = (bool(_690.z)?_686.z:_703.z);
-          _689.w = (bool(_690.w)?_686.w:_703.w);
-          _641.x = (_642.x+_689.x);
-          _641.y = (_642.y+_689.y);
-          _641.z = (_642.z+_689.z);
-          _641.w = (_642.w+_689.w);
-        *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 2560)) = make_float4(kernel[_641.x],kernel[_641.y],kernel[_641.z],kernel[_641.w]);
-        int4 _705;
-          int4 _706;
-            int4 _707 = make_int4(((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 2816) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 2816) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 2816) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 2816) / 72) * 4608)) + (rc_outer_outer * 72)));
-            int4 _708;
-              int4 _709;
-                int4 _710;
-                  int4 _711 = make_int4((((((int)threadIdx.x) * 4) + 2816))+(1*0), (((((int)threadIdx.x) * 4) + 2816))+(1*1), (((((int)threadIdx.x) * 4) + 2816))+(1*2), (((((int)threadIdx.x) * 4) + 2816))+(1*3));
-                  int4 _712 = make_int4(3, 3, 3, 3);
-                  _710.x = (_711.x%_712.x);
-                  _710.y = (_711.y%_712.y);
-                  _710.z = (_711.z%_712.z);
-                  _710.w = (_711.w%_712.w);
-                int4 _713;
-                  int4 _714 = make_int4((((((int)threadIdx.x) * 4) + 2816))+(1*0), (((((int)threadIdx.x) * 4) + 2816))+(1*1), (((((int)threadIdx.x) * 4) + 2816))+(1*2), (((((int)threadIdx.x) * 4) + 2816))+(1*3));
-                  int4 _715 = make_int4(3, 3, 3, 3);
-                  _713.x = (_714.x/_715.x);
-                  _713.y = (_714.y/_715.y);
-                  _713.z = (_714.z/_715.z);
-                  _713.w = (_714.w/_715.w);
-                int4 _716;
-                ushort4 _717;
-                  ushort4 _718;
-                    ushort4 _719;
-                      int4 _720 = make_int4(3, 3, 3, 3);
-                      int4 _721 = make_int4(0, 0, 0, 0);
-                      _719.x = (_720.x>=_721.x);
-                      _719.y = (_720.y>=_721.y);
-                      _719.z = (_720.z>=_721.z);
-                      _719.w = (_720.w>=_721.w);
-                    ushort4 _722;
-                      int4 _723 = make_int4(0, 0, 0, 0);
-                      _722.x = (_710.x>=_723.x);
-                      _722.y = (_710.y>=_723.y);
-                      _722.z = (_710.z>=_723.z);
-                      _722.w = (_710.w>=_723.w);
-                    _718.x = (_719.x&&_722.x);
-                    _718.y = (_719.y&&_722.y);
-                    _718.z = (_719.z&&_722.z);
-                    _718.w = (_719.w&&_722.w);
-                  ushort4 _724;
-                    ushort4 _725;
-                      int4 _726 = make_int4(3, 3, 3, 3);
-                      int4 _727 = make_int4(0, 0, 0, 0);
-                      _725.x = (_726.x<_727.x);
-                      _725.y = (_726.y<_727.y);
-                      _725.z = (_726.z<_727.z);
-                      _725.w = (_726.w<_727.w);
-                    ushort4 _728;
-                      int4 _729 = make_int4(0, 0, 0, 0);
-                      _728.x = (_710.x<=_729.x);
-                      _728.y = (_710.y<=_729.y);
-                      _728.z = (_710.z<=_729.z);
-                      _728.w = (_710.w<=_729.w);
-                    _724.x = (_725.x&&_728.x);
-                    _724.y = (_725.y&&_728.y);
-                    _724.z = (_725.z&&_728.z);
-                    _724.w = (_725.w&&_728.w);
-                  _717.x = (_718.x||_724.x);
-                  _717.y = (_718.y||_724.y);
-                  _717.z = (_718.z||_724.z);
-                  _717.w = (_718.w||_724.w);
-                int4 _730;
-                  int4 _731 = make_int4(1, 1, 1, 1);
-                  _730.x = (_713.x-_731.x);
-                  _730.y = (_713.y-_731.y);
-                  _730.z = (_713.z-_731.z);
-                  _730.w = (_713.w-_731.w);
-                _716.x = (bool(_717.x)?_713.x:_730.x);
-                _716.y = (bool(_717.y)?_713.y:_730.y);
-                _716.z = (bool(_717.z)?_713.z:_730.z);
-                _716.w = (bool(_717.w)?_713.w:_730.w);
-                int4 _732 = make_int4(24, 24, 24, 24);
-                _709.x = (_716.x%_732.x);
-                _709.y = (_716.y%_732.y);
-                _709.z = (_716.z%_732.z);
-                _709.w = (_716.w%_732.w);
-              int4 _733;
-              ushort4 _734;
-                ushort4 _735;
-                  ushort4 _736;
-                    int4 _737 = make_int4(24, 24, 24, 24);
-                    int4 _738 = make_int4(0, 0, 0, 0);
-                    _736.x = (_737.x>=_738.x);
-                    _736.y = (_737.y>=_738.y);
-                    _736.z = (_737.z>=_738.z);
-                    _736.w = (_737.w>=_738.w);
-                  ushort4 _739;
-                    int4 _740 = make_int4(0, 0, 0, 0);
-                    _739.x = (_709.x>=_740.x);
-                    _739.y = (_709.y>=_740.y);
-                    _739.z = (_709.z>=_740.z);
-                    _739.w = (_709.w>=_740.w);
-                  _735.x = (_736.x&&_739.x);
-                  _735.y = (_736.y&&_739.y);
-                  _735.z = (_736.z&&_739.z);
-                  _735.w = (_736.w&&_739.w);
-                ushort4 _741;
-                  ushort4 _742;
-                    int4 _743 = make_int4(24, 24, 24, 24);
-                    int4 _744 = make_int4(0, 0, 0, 0);
-                    _742.x = (_743.x<_744.x);
-                    _742.y = (_743.y<_744.y);
-                    _742.z = (_743.z<_744.z);
-                    _742.w = (_743.w<_744.w);
-                  ushort4 _745;
-                    int4 _746 = make_int4(0, 0, 0, 0);
-                    _745.x = (_709.x<=_746.x);
-                    _745.y = (_709.y<=_746.y);
-                    _745.z = (_709.z<=_746.z);
-                    _745.w = (_709.w<=_746.w);
-                  _741.x = (_742.x&&_745.x);
-                  _741.y = (_742.y&&_745.y);
-                  _741.z = (_742.z&&_745.z);
-                  _741.w = (_742.w&&_745.w);
-                _734.x = (_735.x||_741.x);
-                _734.y = (_735.y||_741.y);
-                _734.z = (_735.z||_741.z);
-                _734.w = (_735.w||_741.w);
-              int4 _747;
-                int4 _748 = make_int4(24, 24, 24, 24);
-                _747.x = (_709.x+_748.x);
-                _747.y = (_709.y+_748.y);
-                _747.z = (_709.z+_748.z);
-                _747.w = (_709.w+_748.w);
-              _733.x = (bool(_734.x)?_709.x:_747.x);
-              _733.y = (bool(_734.y)?_709.y:_747.y);
-              _733.z = (bool(_734.z)?_709.z:_747.z);
-              _733.w = (bool(_734.w)?_709.w:_747.w);
-              int4 _749 = make_int4(3, 3, 3, 3);
-              _708.x = (_733.x*_749.x);
-              _708.y = (_733.y*_749.y);
-              _708.z = (_733.z*_749.z);
-              _708.w = (_733.w*_749.w);
-            _706.x = (_707.x+_708.x);
-            _706.y = (_707.y+_708.y);
-            _706.z = (_707.z+_708.z);
-            _706.w = (_707.w+_708.w);
-          int4 _750;
-            int4 _751 = make_int4(((((int)threadIdx.x) + 704))+(1*0), ((((int)threadIdx.x) + 704))+(1*1), ((((int)threadIdx.x) + 704))+(1*2), ((((int)threadIdx.x) + 704))+(1*3));
-            int4 _752 = make_int4(3, 3, 3, 3);
-            _750.x = (_751.x%_752.x);
-            _750.y = (_751.y%_752.y);
-            _750.z = (_751.z%_752.z);
-            _750.w = (_751.w%_752.w);
-          int4 _753;
-          ushort4 _754;
-            ushort4 _755;
-              ushort4 _756;
-                int4 _757 = make_int4(3, 3, 3, 3);
-                int4 _758 = make_int4(0, 0, 0, 0);
-                _756.x = (_757.x>=_758.x);
-                _756.y = (_757.y>=_758.y);
-                _756.z = (_757.z>=_758.z);
-                _756.w = (_757.w>=_758.w);
-              ushort4 _759;
-                int4 _760 = make_int4(0, 0, 0, 0);
-                _759.x = (_750.x>=_760.x);
-                _759.y = (_750.y>=_760.y);
-                _759.z = (_750.z>=_760.z);
-                _759.w = (_750.w>=_760.w);
-              _755.x = (_756.x&&_759.x);
-              _755.y = (_756.y&&_759.y);
-              _755.z = (_756.z&&_759.z);
-              _755.w = (_756.w&&_759.w);
-            ushort4 _761;
-              ushort4 _762;
-                int4 _763 = make_int4(3, 3, 3, 3);
-                int4 _764 = make_int4(0, 0, 0, 0);
-                _762.x = (_763.x<_764.x);
-                _762.y = (_763.y<_764.y);
-                _762.z = (_763.z<_764.z);
-                _762.w = (_763.w<_764.w);
-              ushort4 _765;
-                int4 _766 = make_int4(0, 0, 0, 0);
-                _765.x = (_750.x<=_766.x);
-                _765.y = (_750.y<=_766.y);
-                _765.z = (_750.z<=_766.z);
-                _765.w = (_750.w<=_766.w);
-              _761.x = (_762.x&&_765.x);
-              _761.y = (_762.y&&_765.y);
-              _761.z = (_762.z&&_765.z);
-              _761.w = (_762.w&&_765.w);
-            _754.x = (_755.x||_761.x);
-            _754.y = (_755.y||_761.y);
-            _754.z = (_755.z||_761.z);
-            _754.w = (_755.w||_761.w);
-          int4 _767;
-            int4 _768 = make_int4(3, 3, 3, 3);
-            _767.x = (_750.x+_768.x);
-            _767.y = (_750.y+_768.y);
-            _767.z = (_750.z+_768.z);
-            _767.w = (_750.w+_768.w);
-          _753.x = (bool(_754.x)?_750.x:_767.x);
-          _753.y = (bool(_754.y)?_750.y:_767.y);
-          _753.z = (bool(_754.z)?_750.z:_767.z);
-          _753.w = (bool(_754.w)?_750.w:_767.w);
-          _705.x = (_706.x+_753.x);
-          _705.y = (_706.y+_753.y);
-          _705.z = (_706.z+_753.z);
-          _705.w = (_706.w+_753.w);
-        *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 2816)) = make_float4(kernel[_705.x],kernel[_705.y],kernel[_705.z],kernel[_705.w]);
-        int4 _769;
-          int4 _770;
-            int4 _771 = make_int4(((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 3072) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 3072) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 3072) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 3072) / 72) * 4608)) + (rc_outer_outer * 72)));
-            int4 _772;
-              int4 _773;
-                int4 _774;
-                  int4 _775 = make_int4((((((int)threadIdx.x) * 4) + 3072))+(1*0), (((((int)threadIdx.x) * 4) + 3072))+(1*1), (((((int)threadIdx.x) * 4) + 3072))+(1*2), (((((int)threadIdx.x) * 4) + 3072))+(1*3));
-                  int4 _776 = make_int4(3, 3, 3, 3);
-                  _774.x = (_775.x%_776.x);
-                  _774.y = (_775.y%_776.y);
-                  _774.z = (_775.z%_776.z);
-                  _774.w = (_775.w%_776.w);
-                int4 _777;
-                  int4 _778 = make_int4((((((int)threadIdx.x) * 4) + 3072))+(1*0), (((((int)threadIdx.x) * 4) + 3072))+(1*1), (((((int)threadIdx.x) * 4) + 3072))+(1*2), (((((int)threadIdx.x) * 4) + 3072))+(1*3));
-                  int4 _779 = make_int4(3, 3, 3, 3);
-                  _777.x = (_778.x/_779.x);
-                  _777.y = (_778.y/_779.y);
-                  _777.z = (_778.z/_779.z);
-                  _777.w = (_778.w/_779.w);
-                int4 _780;
-                ushort4 _781;
-                  ushort4 _782;
-                    ushort4 _783;
-                      int4 _784 = make_int4(3, 3, 3, 3);
-                      int4 _785 = make_int4(0, 0, 0, 0);
-                      _783.x = (_784.x>=_785.x);
-                      _783.y = (_784.y>=_785.y);
-                      _783.z = (_784.z>=_785.z);
-                      _783.w = (_784.w>=_785.w);
-                    ushort4 _786;
-                      int4 _787 = make_int4(0, 0, 0, 0);
-                      _786.x = (_774.x>=_787.x);
-                      _786.y = (_774.y>=_787.y);
-                      _786.z = (_774.z>=_787.z);
-                      _786.w = (_774.w>=_787.w);
-                    _782.x = (_783.x&&_786.x);
-                    _782.y = (_783.y&&_786.y);
-                    _782.z = (_783.z&&_786.z);
-                    _782.w = (_783.w&&_786.w);
-                  ushort4 _788;
-                    ushort4 _789;
-                      int4 _790 = make_int4(3, 3, 3, 3);
-                      int4 _791 = make_int4(0, 0, 0, 0);
-                      _789.x = (_790.x<_791.x);
-                      _789.y = (_790.y<_791.y);
-                      _789.z = (_790.z<_791.z);
-                      _789.w = (_790.w<_791.w);
-                    ushort4 _792;
-                      int4 _793 = make_int4(0, 0, 0, 0);
-                      _792.x = (_774.x<=_793.x);
-                      _792.y = (_774.y<=_793.y);
-                      _792.z = (_774.z<=_793.z);
-                      _792.w = (_774.w<=_793.w);
-                    _788.x = (_789.x&&_792.x);
-                    _788.y = (_789.y&&_792.y);
-                    _788.z = (_789.z&&_792.z);
-                    _788.w = (_789.w&&_792.w);
-                  _781.x = (_782.x||_788.x);
-                  _781.y = (_782.y||_788.y);
-                  _781.z = (_782.z||_788.z);
-                  _781.w = (_782.w||_788.w);
-                int4 _794;
-                  int4 _795 = make_int4(1, 1, 1, 1);
-                  _794.x = (_777.x-_795.x);
-                  _794.y = (_777.y-_795.y);
-                  _794.z = (_777.z-_795.z);
-                  _794.w = (_777.w-_795.w);
-                _780.x = (bool(_781.x)?_777.x:_794.x);
-                _780.y = (bool(_781.y)?_777.y:_794.y);
-                _780.z = (bool(_781.z)?_777.z:_794.z);
-                _780.w = (bool(_781.w)?_777.w:_794.w);
-                int4 _796 = make_int4(24, 24, 24, 24);
-                _773.x = (_780.x%_796.x);
-                _773.y = (_780.y%_796.y);
-                _773.z = (_780.z%_796.z);
-                _773.w = (_780.w%_796.w);
-              int4 _797;
-              ushort4 _798;
-                ushort4 _799;
-                  ushort4 _800;
-                    int4 _801 = make_int4(24, 24, 24, 24);
-                    int4 _802 = make_int4(0, 0, 0, 0);
-                    _800.x = (_801.x>=_802.x);
-                    _800.y = (_801.y>=_802.y);
-                    _800.z = (_801.z>=_802.z);
-                    _800.w = (_801.w>=_802.w);
-                  ushort4 _803;
-                    int4 _804 = make_int4(0, 0, 0, 0);
-                    _803.x = (_773.x>=_804.x);
-                    _803.y = (_773.y>=_804.y);
-                    _803.z = (_773.z>=_804.z);
-                    _803.w = (_773.w>=_804.w);
-                  _799.x = (_800.x&&_803.x);
-                  _799.y = (_800.y&&_803.y);
-                  _799.z = (_800.z&&_803.z);
-                  _799.w = (_800.w&&_803.w);
-                ushort4 _805;
-                  ushort4 _806;
-                    int4 _807 = make_int4(24, 24, 24, 24);
-                    int4 _808 = make_int4(0, 0, 0, 0);
-                    _806.x = (_807.x<_808.x);
-                    _806.y = (_807.y<_808.y);
-                    _806.z = (_807.z<_808.z);
-                    _806.w = (_807.w<_808.w);
-                  ushort4 _809;
-                    int4 _810 = make_int4(0, 0, 0, 0);
-                    _809.x = (_773.x<=_810.x);
-                    _809.y = (_773.y<=_810.y);
-                    _809.z = (_773.z<=_810.z);
-                    _809.w = (_773.w<=_810.w);
-                  _805.x = (_806.x&&_809.x);
-                  _805.y = (_806.y&&_809.y);
-                  _805.z = (_806.z&&_809.z);
-                  _805.w = (_806.w&&_809.w);
-                _798.x = (_799.x||_805.x);
-                _798.y = (_799.y||_805.y);
-                _798.z = (_799.z||_805.z);
-                _798.w = (_799.w||_805.w);
-              int4 _811;
-                int4 _812 = make_int4(24, 24, 24, 24);
-                _811.x = (_773.x+_812.x);
-                _811.y = (_773.y+_812.y);
-                _811.z = (_773.z+_812.z);
-                _811.w = (_773.w+_812.w);
-              _797.x = (bool(_798.x)?_773.x:_811.x);
-              _797.y = (bool(_798.y)?_773.y:_811.y);
-              _797.z = (bool(_798.z)?_773.z:_811.z);
-              _797.w = (bool(_798.w)?_773.w:_811.w);
-              int4 _813 = make_int4(3, 3, 3, 3);
-              _772.x = (_797.x*_813.x);
-              _772.y = (_797.y*_813.y);
-              _772.z = (_797.z*_813.z);
-              _772.w = (_797.w*_813.w);
-            _770.x = (_771.x+_772.x);
-            _770.y = (_771.y+_772.y);
-            _770.z = (_771.z+_772.z);
-            _770.w = (_771.w+_772.w);
-          int4 _814;
-            int4 _815 = make_int4(((((int)threadIdx.x) + 768))+(1*0), ((((int)threadIdx.x) + 768))+(1*1), ((((int)threadIdx.x) + 768))+(1*2), ((((int)threadIdx.x) + 768))+(1*3));
-            int4 _816 = make_int4(3, 3, 3, 3);
-            _814.x = (_815.x%_816.x);
-            _814.y = (_815.y%_816.y);
-            _814.z = (_815.z%_816.z);
-            _814.w = (_815.w%_816.w);
-          int4 _817;
-          ushort4 _818;
-            ushort4 _819;
-              ushort4 _820;
-                int4 _821 = make_int4(3, 3, 3, 3);
-                int4 _822 = make_int4(0, 0, 0, 0);
-                _820.x = (_821.x>=_822.x);
-                _820.y = (_821.y>=_822.y);
-                _820.z = (_821.z>=_822.z);
-                _820.w = (_821.w>=_822.w);
-              ushort4 _823;
-                int4 _824 = make_int4(0, 0, 0, 0);
-                _823.x = (_814.x>=_824.x);
-                _823.y = (_814.y>=_824.y);
-                _823.z = (_814.z>=_824.z);
-                _823.w = (_814.w>=_824.w);
-              _819.x = (_820.x&&_823.x);
-              _819.y = (_820.y&&_823.y);
-              _819.z = (_820.z&&_823.z);
-              _819.w = (_820.w&&_823.w);
-            ushort4 _825;
-              ushort4 _826;
-                int4 _827 = make_int4(3, 3, 3, 3);
-                int4 _828 = make_int4(0, 0, 0, 0);
-                _826.x = (_827.x<_828.x);
-                _826.y = (_827.y<_828.y);
-                _826.z = (_827.z<_828.z);
-                _826.w = (_827.w<_828.w);
-              ushort4 _829;
-                int4 _830 = make_int4(0, 0, 0, 0);
-                _829.x = (_814.x<=_830.x);
-                _829.y = (_814.y<=_830.y);
-                _829.z = (_814.z<=_830.z);
-                _829.w = (_814.w<=_830.w);
-              _825.x = (_826.x&&_829.x);
-              _825.y = (_826.y&&_829.y);
-              _825.z = (_826.z&&_829.z);
-              _825.w = (_826.w&&_829.w);
-            _818.x = (_819.x||_825.x);
-            _818.y = (_819.y||_825.y);
-            _818.z = (_819.z||_825.z);
-            _818.w = (_819.w||_825.w);
-          int4 _831;
-            int4 _832 = make_int4(3, 3, 3, 3);
-            _831.x = (_814.x+_832.x);
-            _831.y = (_814.y+_832.y);
-            _831.z = (_814.z+_832.z);
-            _831.w = (_814.w+_832.w);
-          _817.x = (bool(_818.x)?_814.x:_831.x);
-          _817.y = (bool(_818.y)?_814.y:_831.y);
-          _817.z = (bool(_818.z)?_814.z:_831.z);
-          _817.w = (bool(_818.w)?_814.w:_831.w);
-          _769.x = (_770.x+_817.x);
-          _769.y = (_770.y+_817.y);
-          _769.z = (_770.z+_817.z);
-          _769.w = (_770.w+_817.w);
-        *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 3072)) = make_float4(kernel[_769.x],kernel[_769.y],kernel[_769.z],kernel[_769.w]);
-        int4 _833;
-          int4 _834;
-            int4 _835 = make_int4(((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 3328) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 3328) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 3328) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 3328) / 72) * 4608)) + (rc_outer_outer * 72)));
-            int4 _836;
-              int4 _837;
-                int4 _838;
-                  int4 _839 = make_int4((((((int)threadIdx.x) * 4) + 3328))+(1*0), (((((int)threadIdx.x) * 4) + 3328))+(1*1), (((((int)threadIdx.x) * 4) + 3328))+(1*2), (((((int)threadIdx.x) * 4) + 3328))+(1*3));
-                  int4 _840 = make_int4(3, 3, 3, 3);
-                  _838.x = (_839.x%_840.x);
-                  _838.y = (_839.y%_840.y);
-                  _838.z = (_839.z%_840.z);
-                  _838.w = (_839.w%_840.w);
-                int4 _841;
-                  int4 _842 = make_int4((((((int)threadIdx.x) * 4) + 3328))+(1*0), (((((int)threadIdx.x) * 4) + 3328))+(1*1), (((((int)threadIdx.x) * 4) + 3328))+(1*2), (((((int)threadIdx.x) * 4) + 3328))+(1*3));
-                  int4 _843 = make_int4(3, 3, 3, 3);
-                  _841.x = (_842.x/_843.x);
-                  _841.y = (_842.y/_843.y);
-                  _841.z = (_842.z/_843.z);
-                  _841.w = (_842.w/_843.w);
-                int4 _844;
-                ushort4 _845;
-                  ushort4 _846;
-                    ushort4 _847;
-                      int4 _848 = make_int4(3, 3, 3, 3);
-                      int4 _849 = make_int4(0, 0, 0, 0);
-                      _847.x = (_848.x>=_849.x);
-                      _847.y = (_848.y>=_849.y);
-                      _847.z = (_848.z>=_849.z);
-                      _847.w = (_848.w>=_849.w);
-                    ushort4 _850;
-                      int4 _851 = make_int4(0, 0, 0, 0);
-                      _850.x = (_838.x>=_851.x);
-                      _850.y = (_838.y>=_851.y);
-                      _850.z = (_838.z>=_851.z);
-                      _850.w = (_838.w>=_851.w);
-                    _846.x = (_847.x&&_850.x);
-                    _846.y = (_847.y&&_850.y);
-                    _846.z = (_847.z&&_850.z);
-                    _846.w = (_847.w&&_850.w);
-                  ushort4 _852;
-                    ushort4 _853;
-                      int4 _854 = make_int4(3, 3, 3, 3);
-                      int4 _855 = make_int4(0, 0, 0, 0);
-                      _853.x = (_854.x<_855.x);
-                      _853.y = (_854.y<_855.y);
-                      _853.z = (_854.z<_855.z);
-                      _853.w = (_854.w<_855.w);
-                    ushort4 _856;
-                      int4 _857 = make_int4(0, 0, 0, 0);
-                      _856.x = (_838.x<=_857.x);
-                      _856.y = (_838.y<=_857.y);
-                      _856.z = (_838.z<=_857.z);
-                      _856.w = (_838.w<=_857.w);
-                    _852.x = (_853.x&&_856.x);
-                    _852.y = (_853.y&&_856.y);
-                    _852.z = (_853.z&&_856.z);
-                    _852.w = (_853.w&&_856.w);
-                  _845.x = (_846.x||_852.x);
-                  _845.y = (_846.y||_852.y);
-                  _845.z = (_846.z||_852.z);
-                  _845.w = (_846.w||_852.w);
-                int4 _858;
-                  int4 _859 = make_int4(1, 1, 1, 1);
-                  _858.x = (_841.x-_859.x);
-                  _858.y = (_841.y-_859.y);
-                  _858.z = (_841.z-_859.z);
-                  _858.w = (_841.w-_859.w);
-                _844.x = (bool(_845.x)?_841.x:_858.x);
-                _844.y = (bool(_845.y)?_841.y:_858.y);
-                _844.z = (bool(_845.z)?_841.z:_858.z);
-                _844.w = (bool(_845.w)?_841.w:_858.w);
-                int4 _860 = make_int4(24, 24, 24, 24);
-                _837.x = (_844.x%_860.x);
-                _837.y = (_844.y%_860.y);
-                _837.z = (_844.z%_860.z);
-                _837.w = (_844.w%_860.w);
-              int4 _861;
-              ushort4 _862;
-                ushort4 _863;
-                  ushort4 _864;
-                    int4 _865 = make_int4(24, 24, 24, 24);
-                    int4 _866 = make_int4(0, 0, 0, 0);
-                    _864.x = (_865.x>=_866.x);
-                    _864.y = (_865.y>=_866.y);
-                    _864.z = (_865.z>=_866.z);
-                    _864.w = (_865.w>=_866.w);
-                  ushort4 _867;
-                    int4 _868 = make_int4(0, 0, 0, 0);
-                    _867.x = (_837.x>=_868.x);
-                    _867.y = (_837.y>=_868.y);
-                    _867.z = (_837.z>=_868.z);
-                    _867.w = (_837.w>=_868.w);
-                  _863.x = (_864.x&&_867.x);
-                  _863.y = (_864.y&&_867.y);
-                  _863.z = (_864.z&&_867.z);
-                  _863.w = (_864.w&&_867.w);
-                ushort4 _869;
-                  ushort4 _870;
-                    int4 _871 = make_int4(24, 24, 24, 24);
-                    int4 _872 = make_int4(0, 0, 0, 0);
-                    _870.x = (_871.x<_872.x);
-                    _870.y = (_871.y<_872.y);
-                    _870.z = (_871.z<_872.z);
-                    _870.w = (_871.w<_872.w);
-                  ushort4 _873;
-                    int4 _874 = make_int4(0, 0, 0, 0);
-                    _873.x = (_837.x<=_874.x);
-                    _873.y = (_837.y<=_874.y);
-                    _873.z = (_837.z<=_874.z);
-                    _873.w = (_837.w<=_874.w);
-                  _869.x = (_870.x&&_873.x);
-                  _869.y = (_870.y&&_873.y);
-                  _869.z = (_870.z&&_873.z);
-                  _869.w = (_870.w&&_873.w);
-                _862.x = (_863.x||_869.x);
-                _862.y = (_863.y||_869.y);
-                _862.z = (_863.z||_869.z);
-                _862.w = (_863.w||_869.w);
-              int4 _875;
-                int4 _876 = make_int4(24, 24, 24, 24);
-                _875.x = (_837.x+_876.x);
-                _875.y = (_837.y+_876.y);
-                _875.z = (_837.z+_876.z);
-                _875.w = (_837.w+_876.w);
-              _861.x = (bool(_862.x)?_837.x:_875.x);
-              _861.y = (bool(_862.y)?_837.y:_875.y);
-              _861.z = (bool(_862.z)?_837.z:_875.z);
-              _861.w = (bool(_862.w)?_837.w:_875.w);
-              int4 _877 = make_int4(3, 3, 3, 3);
-              _836.x = (_861.x*_877.x);
-              _836.y = (_861.y*_877.y);
-              _836.z = (_861.z*_877.z);
-              _836.w = (_861.w*_877.w);
-            _834.x = (_835.x+_836.x);
-            _834.y = (_835.y+_836.y);
-            _834.z = (_835.z+_836.z);
-            _834.w = (_835.w+_836.w);
-          int4 _878;
-            int4 _879 = make_int4(((((int)threadIdx.x) + 832))+(1*0), ((((int)threadIdx.x) + 832))+(1*1), ((((int)threadIdx.x) + 832))+(1*2), ((((int)threadIdx.x) + 832))+(1*3));
-            int4 _880 = make_int4(3, 3, 3, 3);
-            _878.x = (_879.x%_880.x);
-            _878.y = (_879.y%_880.y);
-            _878.z = (_879.z%_880.z);
-            _878.w = (_879.w%_880.w);
-          int4 _881;
-          ushort4 _882;
-            ushort4 _883;
-              ushort4 _884;
-                int4 _885 = make_int4(3, 3, 3, 3);
-                int4 _886 = make_int4(0, 0, 0, 0);
-                _884.x = (_885.x>=_886.x);
-                _884.y = (_885.y>=_886.y);
-                _884.z = (_885.z>=_886.z);
-                _884.w = (_885.w>=_886.w);
-              ushort4 _887;
-                int4 _888 = make_int4(0, 0, 0, 0);
-                _887.x = (_878.x>=_888.x);
-                _887.y = (_878.y>=_888.y);
-                _887.z = (_878.z>=_888.z);
-                _887.w = (_878.w>=_888.w);
-              _883.x = (_884.x&&_887.x);
-              _883.y = (_884.y&&_887.y);
-              _883.z = (_884.z&&_887.z);
-              _883.w = (_884.w&&_887.w);
-            ushort4 _889;
-              ushort4 _890;
-                int4 _891 = make_int4(3, 3, 3, 3);
-                int4 _892 = make_int4(0, 0, 0, 0);
-                _890.x = (_891.x<_892.x);
-                _890.y = (_891.y<_892.y);
-                _890.z = (_891.z<_892.z);
-                _890.w = (_891.w<_892.w);
-              ushort4 _893;
-                int4 _894 = make_int4(0, 0, 0, 0);
-                _893.x = (_878.x<=_894.x);
-                _893.y = (_878.y<=_894.y);
-                _893.z = (_878.z<=_894.z);
-                _893.w = (_878.w<=_894.w);
-              _889.x = (_890.x&&_893.x);
-              _889.y = (_890.y&&_893.y);
-              _889.z = (_890.z&&_893.z);
-              _889.w = (_890.w&&_893.w);
-            _882.x = (_883.x||_889.x);
-            _882.y = (_883.y||_889.y);
-            _882.z = (_883.z||_889.z);
-            _882.w = (_883.w||_889.w);
-          int4 _895;
-            int4 _896 = make_int4(3, 3, 3, 3);
-            _895.x = (_878.x+_896.x);
-            _895.y = (_878.y+_896.y);
-            _895.z = (_878.z+_896.z);
-            _895.w = (_878.w+_896.w);
-          _881.x = (bool(_882.x)?_878.x:_895.x);
-          _881.y = (bool(_882.y)?_878.y:_895.y);
-          _881.z = (bool(_882.z)?_878.z:_895.z);
-          _881.w = (bool(_882.w)?_878.w:_895.w);
-          _833.x = (_834.x+_881.x);
-          _833.y = (_834.y+_881.y);
-          _833.z = (_834.z+_881.z);
-          _833.w = (_834.w+_881.w);
-        *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 3328)) = make_float4(kernel[_833.x],kernel[_833.y],kernel[_833.z],kernel[_833.w]);
-        int4 _897;
-          int4 _898;
-            int4 _899 = make_int4(((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 3584) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 3584) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 3584) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 3584) / 72) * 4608)) + (rc_outer_outer * 72)));
-            int4 _900;
-              int4 _901;
-                int4 _902;
-                  int4 _903 = make_int4((((((int)threadIdx.x) * 4) + 3584))+(1*0), (((((int)threadIdx.x) * 4) + 3584))+(1*1), (((((int)threadIdx.x) * 4) + 3584))+(1*2), (((((int)threadIdx.x) * 4) + 3584))+(1*3));
-                  int4 _904 = make_int4(3, 3, 3, 3);
-                  _902.x = (_903.x%_904.x);
-                  _902.y = (_903.y%_904.y);
-                  _902.z = (_903.z%_904.z);
-                  _902.w = (_903.w%_904.w);
-                int4 _905;
-                  int4 _906 = make_int4((((((int)threadIdx.x) * 4) + 3584))+(1*0), (((((int)threadIdx.x) * 4) + 3584))+(1*1), (((((int)threadIdx.x) * 4) + 3584))+(1*2), (((((int)threadIdx.x) * 4) + 3584))+(1*3));
-                  int4 _907 = make_int4(3, 3, 3, 3);
-                  _905.x = (_906.x/_907.x);
-                  _905.y = (_906.y/_907.y);
-                  _905.z = (_906.z/_907.z);
-                  _905.w = (_906.w/_907.w);
-                int4 _908;
-                ushort4 _909;
-                  ushort4 _910;
-                    ushort4 _911;
-                      int4 _912 = make_int4(3, 3, 3, 3);
-                      int4 _913 = make_int4(0, 0, 0, 0);
-                      _911.x = (_912.x>=_913.x);
-                      _911.y = (_912.y>=_913.y);
-                      _911.z = (_912.z>=_913.z);
-                      _911.w = (_912.w>=_913.w);
-                    ushort4 _914;
-                      int4 _915 = make_int4(0, 0, 0, 0);
-                      _914.x = (_902.x>=_915.x);
-                      _914.y = (_902.y>=_915.y);
-                      _914.z = (_902.z>=_915.z);
-                      _914.w = (_902.w>=_915.w);
-                    _910.x = (_911.x&&_914.x);
-                    _910.y = (_911.y&&_914.y);
-                    _910.z = (_911.z&&_914.z);
-                    _910.w = (_911.w&&_914.w);
-                  ushort4 _916;
-                    ushort4 _917;
-                      int4 _918 = make_int4(3, 3, 3, 3);
-                      int4 _919 = make_int4(0, 0, 0, 0);
-                      _917.x = (_918.x<_919.x);
-                      _917.y = (_918.y<_919.y);
-                      _917.z = (_918.z<_919.z);
-                      _917.w = (_918.w<_919.w);
-                    ushort4 _920;
-                      int4 _921 = make_int4(0, 0, 0, 0);
-                      _920.x = (_902.x<=_921.x);
-                      _920.y = (_902.y<=_921.y);
-                      _920.z = (_902.z<=_921.z);
-                      _920.w = (_902.w<=_921.w);
-                    _916.x = (_917.x&&_920.x);
-                    _916.y = (_917.y&&_920.y);
-                    _916.z = (_917.z&&_920.z);
-                    _916.w = (_917.w&&_920.w);
-                  _909.x = (_910.x||_916.x);
-                  _909.y = (_910.y||_916.y);
-                  _909.z = (_910.z||_916.z);
-                  _909.w = (_910.w||_916.w);
-                int4 _922;
-                  int4 _923 = make_int4(1, 1, 1, 1);
-                  _922.x = (_905.x-_923.x);
-                  _922.y = (_905.y-_923.y);
-                  _922.z = (_905.z-_923.z);
-                  _922.w = (_905.w-_923.w);
-                _908.x = (bool(_909.x)?_905.x:_922.x);
-                _908.y = (bool(_909.y)?_905.y:_922.y);
-                _908.z = (bool(_909.z)?_905.z:_922.z);
-                _908.w = (bool(_909.w)?_905.w:_922.w);
-                int4 _924 = make_int4(24, 24, 24, 24);
-                _901.x = (_908.x%_924.x);
-                _901.y = (_908.y%_924.y);
-                _901.z = (_908.z%_924.z);
-                _901.w = (_908.w%_924.w);
-              int4 _925;
-              ushort4 _926;
-                ushort4 _927;
-                  ushort4 _928;
-                    int4 _929 = make_int4(24, 24, 24, 24);
-                    int4 _930 = make_int4(0, 0, 0, 0);
-                    _928.x = (_929.x>=_930.x);
-                    _928.y = (_929.y>=_930.y);
-                    _928.z = (_929.z>=_930.z);
-                    _928.w = (_929.w>=_930.w);
-                  ushort4 _931;
-                    int4 _932 = make_int4(0, 0, 0, 0);
-                    _931.x = (_901.x>=_932.x);
-                    _931.y = (_901.y>=_932.y);
-                    _931.z = (_901.z>=_932.z);
-                    _931.w = (_901.w>=_932.w);
-                  _927.x = (_928.x&&_931.x);
-                  _927.y = (_928.y&&_931.y);
-                  _927.z = (_928.z&&_931.z);
-                  _927.w = (_928.w&&_931.w);
-                ushort4 _933;
-                  ushort4 _934;
-                    int4 _935 = make_int4(24, 24, 24, 24);
-                    int4 _936 = make_int4(0, 0, 0, 0);
-                    _934.x = (_935.x<_936.x);
-                    _934.y = (_935.y<_936.y);
-                    _934.z = (_935.z<_936.z);
-                    _934.w = (_935.w<_936.w);
-                  ushort4 _937;
-                    int4 _938 = make_int4(0, 0, 0, 0);
-                    _937.x = (_901.x<=_938.x);
-                    _937.y = (_901.y<=_938.y);
-                    _937.z = (_901.z<=_938.z);
-                    _937.w = (_901.w<=_938.w);
-                  _933.x = (_934.x&&_937.x);
-                  _933.y = (_934.y&&_937.y);
-                  _933.z = (_934.z&&_937.z);
-                  _933.w = (_934.w&&_937.w);
-                _926.x = (_927.x||_933.x);
-                _926.y = (_927.y||_933.y);
-                _926.z = (_927.z||_933.z);
-                _926.w = (_927.w||_933.w);
-              int4 _939;
-                int4 _940 = make_int4(24, 24, 24, 24);
-                _939.x = (_901.x+_940.x);
-                _939.y = (_901.y+_940.y);
-                _939.z = (_901.z+_940.z);
-                _939.w = (_901.w+_940.w);
-              _925.x = (bool(_926.x)?_901.x:_939.x);
-              _925.y = (bool(_926.y)?_901.y:_939.y);
-              _925.z = (bool(_926.z)?_901.z:_939.z);
-              _925.w = (bool(_926.w)?_901.w:_939.w);
-              int4 _941 = make_int4(3, 3, 3, 3);
-              _900.x = (_925.x*_941.x);
-              _900.y = (_925.y*_941.y);
-              _900.z = (_925.z*_941.z);
-              _900.w = (_925.w*_941.w);
-            _898.x = (_899.x+_900.x);
-            _898.y = (_899.y+_900.y);
-            _898.z = (_899.z+_900.z);
-            _898.w = (_899.w+_900.w);
-          int4 _942;
-            int4 _943 = make_int4(((((int)threadIdx.x) + 896))+(1*0), ((((int)threadIdx.x) + 896))+(1*1), ((((int)threadIdx.x) + 896))+(1*2), ((((int)threadIdx.x) + 896))+(1*3));
-            int4 _944 = make_int4(3, 3, 3, 3);
-            _942.x = (_943.x%_944.x);
-            _942.y = (_943.y%_944.y);
-            _942.z = (_943.z%_944.z);
-            _942.w = (_943.w%_944.w);
-          int4 _945;
-          ushort4 _946;
-            ushort4 _947;
-              ushort4 _948;
-                int4 _949 = make_int4(3, 3, 3, 3);
-                int4 _950 = make_int4(0, 0, 0, 0);
-                _948.x = (_949.x>=_950.x);
-                _948.y = (_949.y>=_950.y);
-                _948.z = (_949.z>=_950.z);
-                _948.w = (_949.w>=_950.w);
-              ushort4 _951;
-                int4 _952 = make_int4(0, 0, 0, 0);
-                _951.x = (_942.x>=_952.x);
-                _951.y = (_942.y>=_952.y);
-                _951.z = (_942.z>=_952.z);
-                _951.w = (_942.w>=_952.w);
-              _947.x = (_948.x&&_951.x);
-              _947.y = (_948.y&&_951.y);
-              _947.z = (_948.z&&_951.z);
-              _947.w = (_948.w&&_951.w);
-            ushort4 _953;
-              ushort4 _954;
-                int4 _955 = make_int4(3, 3, 3, 3);
-                int4 _956 = make_int4(0, 0, 0, 0);
-                _954.x = (_955.x<_956.x);
-                _954.y = (_955.y<_956.y);
-                _954.z = (_955.z<_956.z);
-                _954.w = (_955.w<_956.w);
-              ushort4 _957;
-                int4 _958 = make_int4(0, 0, 0, 0);
-                _957.x = (_942.x<=_958.x);
-                _957.y = (_942.y<=_958.y);
-                _957.z = (_942.z<=_958.z);
-                _957.w = (_942.w<=_958.w);
-              _953.x = (_954.x&&_957.x);
-              _953.y = (_954.y&&_957.y);
-              _953.z = (_954.z&&_957.z);
-              _953.w = (_954.w&&_957.w);
-            _946.x = (_947.x||_953.x);
-            _946.y = (_947.y||_953.y);
-            _946.z = (_947.z||_953.z);
-            _946.w = (_947.w||_953.w);
-          int4 _959;
-            int4 _960 = make_int4(3, 3, 3, 3);
-            _959.x = (_942.x+_960.x);
-            _959.y = (_942.y+_960.y);
-            _959.z = (_942.z+_960.z);
-            _959.w = (_942.w+_960.w);
-          _945.x = (bool(_946.x)?_942.x:_959.x);
-          _945.y = (bool(_946.y)?_942.y:_959.y);
-          _945.z = (bool(_946.z)?_942.z:_959.z);
-          _945.w = (bool(_946.w)?_942.w:_959.w);
-          _897.x = (_898.x+_945.x);
-          _897.y = (_898.y+_945.y);
-          _897.z = (_898.z+_945.z);
-          _897.w = (_898.w+_945.w);
-        *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 3584)) = make_float4(kernel[_897.x],kernel[_897.y],kernel[_897.z],kernel[_897.w]);
-        int4 _961;
-          int4 _962;
-            int4 _963 = make_int4(((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 3840) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 3840) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 3840) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 3840) / 72) * 4608)) + (rc_outer_outer * 72)));
-            int4 _964;
-              int4 _965;
-                int4 _966;
-                  int4 _967 = make_int4((((((int)threadIdx.x) * 4) + 3840))+(1*0), (((((int)threadIdx.x) * 4) + 3840))+(1*1), (((((int)threadIdx.x) * 4) + 3840))+(1*2), (((((int)threadIdx.x) * 4) + 3840))+(1*3));
-                  int4 _968 = make_int4(3, 3, 3, 3);
-                  _966.x = (_967.x%_968.x);
-                  _966.y = (_967.y%_968.y);
-                  _966.z = (_967.z%_968.z);
-                  _966.w = (_967.w%_968.w);
-                int4 _969;
-                  int4 _970 = make_int4((((((int)threadIdx.x) * 4) + 3840))+(1*0), (((((int)threadIdx.x) * 4) + 3840))+(1*1), (((((int)threadIdx.x) * 4) + 3840))+(1*2), (((((int)threadIdx.x) * 4) + 3840))+(1*3));
-                  int4 _971 = make_int4(3, 3, 3, 3);
-                  _969.x = (_970.x/_971.x);
-                  _969.y = (_970.y/_971.y);
-                  _969.z = (_970.z/_971.z);
-                  _969.w = (_970.w/_971.w);
-                int4 _972;
-                ushort4 _973;
-                  ushort4 _974;
-                    ushort4 _975;
-                      int4 _976 = make_int4(3, 3, 3, 3);
-                      int4 _977 = make_int4(0, 0, 0, 0);
-                      _975.x = (_976.x>=_977.x);
-                      _975.y = (_976.y>=_977.y);
-                      _975.z = (_976.z>=_977.z);
-                      _975.w = (_976.w>=_977.w);
-                    ushort4 _978;
-                      int4 _979 = make_int4(0, 0, 0, 0);
-                      _978.x = (_966.x>=_979.x);
-                      _978.y = (_966.y>=_979.y);
-                      _978.z = (_966.z>=_979.z);
-                      _978.w = (_966.w>=_979.w);
-                    _974.x = (_975.x&&_978.x);
-                    _974.y = (_975.y&&_978.y);
-                    _974.z = (_975.z&&_978.z);
-                    _974.w = (_975.w&&_978.w);
-                  ushort4 _980;
-                    ushort4 _981;
-                      int4 _982 = make_int4(3, 3, 3, 3);
-                      int4 _983 = make_int4(0, 0, 0, 0);
-                      _981.x = (_982.x<_983.x);
-                      _981.y = (_982.y<_983.y);
-                      _981.z = (_982.z<_983.z);
-                      _981.w = (_982.w<_983.w);
-                    ushort4 _984;
-                      int4 _985 = make_int4(0, 0, 0, 0);
-                      _984.x = (_966.x<=_985.x);
-                      _984.y = (_966.y<=_985.y);
-                      _984.z = (_966.z<=_985.z);
-                      _984.w = (_966.w<=_985.w);
-                    _980.x = (_981.x&&_984.x);
-                    _980.y = (_981.y&&_984.y);
-                    _980.z = (_981.z&&_984.z);
-                    _980.w = (_981.w&&_984.w);
-                  _973.x = (_974.x||_980.x);
-                  _973.y = (_974.y||_980.y);
-                  _973.z = (_974.z||_980.z);
-                  _973.w = (_974.w||_980.w);
-                int4 _986;
-                  int4 _987 = make_int4(1, 1, 1, 1);
-                  _986.x = (_969.x-_987.x);
-                  _986.y = (_969.y-_987.y);
-                  _986.z = (_969.z-_987.z);
-                  _986.w = (_969.w-_987.w);
-                _972.x = (bool(_973.x)?_969.x:_986.x);
-                _972.y = (bool(_973.y)?_969.y:_986.y);
-                _972.z = (bool(_973.z)?_969.z:_986.z);
-                _972.w = (bool(_973.w)?_969.w:_986.w);
-                int4 _988 = make_int4(24, 24, 24, 24);
-                _965.x = (_972.x%_988.x);
-                _965.y = (_972.y%_988.y);
-                _965.z = (_972.z%_988.z);
-                _965.w = (_972.w%_988.w);
-              int4 _989;
-              ushort4 _990;
-                ushort4 _991;
-                  ushort4 _992;
-                    int4 _993 = make_int4(24, 24, 24, 24);
-                    int4 _994 = make_int4(0, 0, 0, 0);
-                    _992.x = (_993.x>=_994.x);
-                    _992.y = (_993.y>=_994.y);
-                    _992.z = (_993.z>=_994.z);
-                    _992.w = (_993.w>=_994.w);
-                  ushort4 _995;
-                    int4 _996 = make_int4(0, 0, 0, 0);
-                    _995.x = (_965.x>=_996.x);
-                    _995.y = (_965.y>=_996.y);
-                    _995.z = (_965.z>=_996.z);
-                    _995.w = (_965.w>=_996.w);
-                  _991.x = (_992.x&&_995.x);
-                  _991.y = (_992.y&&_995.y);
-                  _991.z = (_992.z&&_995.z);
-                  _991.w = (_992.w&&_995.w);
-                ushort4 _997;
-                  ushort4 _998;
-                    int4 _999 = make_int4(24, 24, 24, 24);
-                    int4 _1000 = make_int4(0, 0, 0, 0);
-                    _998.x = (_999.x<_1000.x);
-                    _998.y = (_999.y<_1000.y);
-                    _998.z = (_999.z<_1000.z);
-                    _998.w = (_999.w<_1000.w);
-                  ushort4 _1001;
-                    int4 _1002 = make_int4(0, 0, 0, 0);
-                    _1001.x = (_965.x<=_1002.x);
-                    _1001.y = (_965.y<=_1002.y);
-                    _1001.z = (_965.z<=_1002.z);
-                    _1001.w = (_965.w<=_1002.w);
-                  _997.x = (_998.x&&_1001.x);
-                  _997.y = (_998.y&&_1001.y);
-                  _997.z = (_998.z&&_1001.z);
-                  _997.w = (_998.w&&_1001.w);
-                _990.x = (_991.x||_997.x);
-                _990.y = (_991.y||_997.y);
-                _990.z = (_991.z||_997.z);
-                _990.w = (_991.w||_997.w);
-              int4 _1003;
-                int4 _1004 = make_int4(24, 24, 24, 24);
-                _1003.x = (_965.x+_1004.x);
-                _1003.y = (_965.y+_1004.y);
-                _1003.z = (_965.z+_1004.z);
-                _1003.w = (_965.w+_1004.w);
-              _989.x = (bool(_990.x)?_965.x:_1003.x);
-              _989.y = (bool(_990.y)?_965.y:_1003.y);
-              _989.z = (bool(_990.z)?_965.z:_1003.z);
-              _989.w = (bool(_990.w)?_965.w:_1003.w);
-              int4 _1005 = make_int4(3, 3, 3, 3);
-              _964.x = (_989.x*_1005.x);
-              _964.y = (_989.y*_1005.y);
-              _964.z = (_989.z*_1005.z);
-              _964.w = (_989.w*_1005.w);
-            _962.x = (_963.x+_964.x);
-            _962.y = (_963.y+_964.y);
-            _962.z = (_963.z+_964.z);
-            _962.w = (_963.w+_964.w);
-          int4 _1006;
-            int4 _1007 = make_int4(((((int)threadIdx.x) + 960))+(1*0), ((((int)threadIdx.x) + 960))+(1*1), ((((int)threadIdx.x) + 960))+(1*2), ((((int)threadIdx.x) + 960))+(1*3));
-            int4 _1008 = make_int4(3, 3, 3, 3);
-            _1006.x = (_1007.x%_1008.x);
-            _1006.y = (_1007.y%_1008.y);
-            _1006.z = (_1007.z%_1008.z);
-            _1006.w = (_1007.w%_1008.w);
-          int4 _1009;
-          ushort4 _1010;
-            ushort4 _1011;
-              ushort4 _1012;
-                int4 _1013 = make_int4(3, 3, 3, 3);
-                int4 _1014 = make_int4(0, 0, 0, 0);
-                _1012.x = (_1013.x>=_1014.x);
-                _1012.y = (_1013.y>=_1014.y);
-                _1012.z = (_1013.z>=_1014.z);
-                _1012.w = (_1013.w>=_1014.w);
-              ushort4 _1015;
-                int4 _1016 = make_int4(0, 0, 0, 0);
-                _1015.x = (_1006.x>=_1016.x);
-                _1015.y = (_1006.y>=_1016.y);
-                _1015.z = (_1006.z>=_1016.z);
-                _1015.w = (_1006.w>=_1016.w);
-              _1011.x = (_1012.x&&_1015.x);
-              _1011.y = (_1012.y&&_1015.y);
-              _1011.z = (_1012.z&&_1015.z);
-              _1011.w = (_1012.w&&_1015.w);
-            ushort4 _1017;
-              ushort4 _1018;
-                int4 _1019 = make_int4(3, 3, 3, 3);
-                int4 _1020 = make_int4(0, 0, 0, 0);
-                _1018.x = (_1019.x<_1020.x);
-                _1018.y = (_1019.y<_1020.y);
-                _1018.z = (_1019.z<_1020.z);
-                _1018.w = (_1019.w<_1020.w);
-              ushort4 _1021;
-                int4 _1022 = make_int4(0, 0, 0, 0);
-                _1021.x = (_1006.x<=_1022.x);
-                _1021.y = (_1006.y<=_1022.y);
-                _1021.z = (_1006.z<=_1022.z);
-                _1021.w = (_1006.w<=_1022.w);
-              _1017.x = (_1018.x&&_1021.x);
-              _1017.y = (_1018.y&&_1021.y);
-              _1017.z = (_1018.z&&_1021.z);
-              _1017.w = (_1018.w&&_1021.w);
-            _1010.x = (_1011.x||_1017.x);
-            _1010.y = (_1011.y||_1017.y);
-            _1010.z = (_1011.z||_1017.z);
-            _1010.w = (_1011.w||_1017.w);
-          int4 _1023;
-            int4 _1024 = make_int4(3, 3, 3, 3);
-            _1023.x = (_1006.x+_1024.x);
-            _1023.y = (_1006.y+_1024.y);
-            _1023.z = (_1006.z+_1024.z);
-            _1023.w = (_1006.w+_1024.w);
-          _1009.x = (bool(_1010.x)?_1006.x:_1023.x);
-          _1009.y = (bool(_1010.y)?_1006.y:_1023.y);
-          _1009.z = (bool(_1010.z)?_1006.z:_1023.z);
-          _1009.w = (bool(_1010.w)?_1006.w:_1023.w);
-          _961.x = (_962.x+_1009.x);
-          _961.y = (_962.y+_1009.y);
-          _961.z = (_962.z+_1009.z);
-          _961.w = (_962.w+_1009.w);
-        *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 3840)) = make_float4(kernel[_961.x],kernel[_961.y],kernel[_961.z],kernel[_961.w]);
-        int4 _1025;
-          int4 _1026;
-            int4 _1027 = make_int4(((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 4096) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 4096) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 4096) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 4096) / 72) * 4608)) + (rc_outer_outer * 72)));
-            int4 _1028;
-              int4 _1029;
-                int4 _1030;
-                  int4 _1031 = make_int4((((((int)threadIdx.x) * 4) + 4096))+(1*0), (((((int)threadIdx.x) * 4) + 4096))+(1*1), (((((int)threadIdx.x) * 4) + 4096))+(1*2), (((((int)threadIdx.x) * 4) + 4096))+(1*3));
-                  int4 _1032 = make_int4(3, 3, 3, 3);
-                  _1030.x = (_1031.x%_1032.x);
-                  _1030.y = (_1031.y%_1032.y);
-                  _1030.z = (_1031.z%_1032.z);
-                  _1030.w = (_1031.w%_1032.w);
-                int4 _1033;
-                  int4 _1034 = make_int4((((((int)threadIdx.x) * 4) + 4096))+(1*0), (((((int)threadIdx.x) * 4) + 4096))+(1*1), (((((int)threadIdx.x) * 4) + 4096))+(1*2), (((((int)threadIdx.x) * 4) + 4096))+(1*3));
-                  int4 _1035 = make_int4(3, 3, 3, 3);
-                  _1033.x = (_1034.x/_1035.x);
-                  _1033.y = (_1034.y/_1035.y);
-                  _1033.z = (_1034.z/_1035.z);
-                  _1033.w = (_1034.w/_1035.w);
-                int4 _1036;
-                ushort4 _1037;
-                  ushort4 _1038;
-                    ushort4 _1039;
-                      int4 _1040 = make_int4(3, 3, 3, 3);
-                      int4 _1041 = make_int4(0, 0, 0, 0);
-                      _1039.x = (_1040.x>=_1041.x);
-                      _1039.y = (_1040.y>=_1041.y);
-                      _1039.z = (_1040.z>=_1041.z);
-                      _1039.w = (_1040.w>=_1041.w);
-                    ushort4 _1042;
-                      int4 _1043 = make_int4(0, 0, 0, 0);
-                      _1042.x = (_1030.x>=_1043.x);
-                      _1042.y = (_1030.y>=_1043.y);
-                      _1042.z = (_1030.z>=_1043.z);
-                      _1042.w = (_1030.w>=_1043.w);
-                    _1038.x = (_1039.x&&_1042.x);
-                    _1038.y = (_1039.y&&_1042.y);
-                    _1038.z = (_1039.z&&_1042.z);
-                    _1038.w = (_1039.w&&_1042.w);
-                  ushort4 _1044;
-                    ushort4 _1045;
-                      int4 _1046 = make_int4(3, 3, 3, 3);
-                      int4 _1047 = make_int4(0, 0, 0, 0);
-                      _1045.x = (_1046.x<_1047.x);
-                      _1045.y = (_1046.y<_1047.y);
-                      _1045.z = (_1046.z<_1047.z);
-                      _1045.w = (_1046.w<_1047.w);
-                    ushort4 _1048;
-                      int4 _1049 = make_int4(0, 0, 0, 0);
-                      _1048.x = (_1030.x<=_1049.x);
-                      _1048.y = (_1030.y<=_1049.y);
-                      _1048.z = (_1030.z<=_1049.z);
-                      _1048.w = (_1030.w<=_1049.w);
-                    _1044.x = (_1045.x&&_1048.x);
-                    _1044.y = (_1045.y&&_1048.y);
-                    _1044.z = (_1045.z&&_1048.z);
-                    _1044.w = (_1045.w&&_1048.w);
-                  _1037.x = (_1038.x||_1044.x);
-                  _1037.y = (_1038.y||_1044.y);
-                  _1037.z = (_1038.z||_1044.z);
-                  _1037.w = (_1038.w||_1044.w);
-                int4 _1050;
-                  int4 _1051 = make_int4(1, 1, 1, 1);
-                  _1050.x = (_1033.x-_1051.x);
-                  _1050.y = (_1033.y-_1051.y);
-                  _1050.z = (_1033.z-_1051.z);
-                  _1050.w = (_1033.w-_1051.w);
-                _1036.x = (bool(_1037.x)?_1033.x:_1050.x);
-                _1036.y = (bool(_1037.y)?_1033.y:_1050.y);
-                _1036.z = (bool(_1037.z)?_1033.z:_1050.z);
-                _1036.w = (bool(_1037.w)?_1033.w:_1050.w);
-                int4 _1052 = make_int4(24, 24, 24, 24);
-                _1029.x = (_1036.x%_1052.x);
-                _1029.y = (_1036.y%_1052.y);
-                _1029.z = (_1036.z%_1052.z);
-                _1029.w = (_1036.w%_1052.w);
-              int4 _1053;
-              ushort4 _1054;
-                ushort4 _1055;
-                  ushort4 _1056;
-                    int4 _1057 = make_int4(24, 24, 24, 24);
-                    int4 _1058 = make_int4(0, 0, 0, 0);
-                    _1056.x = (_1057.x>=_1058.x);
-                    _1056.y = (_1057.y>=_1058.y);
-                    _1056.z = (_1057.z>=_1058.z);
-                    _1056.w = (_1057.w>=_1058.w);
-                  ushort4 _1059;
-                    int4 _1060 = make_int4(0, 0, 0, 0);
-                    _1059.x = (_1029.x>=_1060.x);
-                    _1059.y = (_1029.y>=_1060.y);
-                    _1059.z = (_1029.z>=_1060.z);
-                    _1059.w = (_1029.w>=_1060.w);
-                  _1055.x = (_1056.x&&_1059.x);
-                  _1055.y = (_1056.y&&_1059.y);
-                  _1055.z = (_1056.z&&_1059.z);
-                  _1055.w = (_1056.w&&_1059.w);
-                ushort4 _1061;
-                  ushort4 _1062;
-                    int4 _1063 = make_int4(24, 24, 24, 24);
-                    int4 _1064 = make_int4(0, 0, 0, 0);
-                    _1062.x = (_1063.x<_1064.x);
-                    _1062.y = (_1063.y<_1064.y);
-                    _1062.z = (_1063.z<_1064.z);
-                    _1062.w = (_1063.w<_1064.w);
-                  ushort4 _1065;
-                    int4 _1066 = make_int4(0, 0, 0, 0);
-                    _1065.x = (_1029.x<=_1066.x);
-                    _1065.y = (_1029.y<=_1066.y);
-                    _1065.z = (_1029.z<=_1066.z);
-                    _1065.w = (_1029.w<=_1066.w);
-                  _1061.x = (_1062.x&&_1065.x);
-                  _1061.y = (_1062.y&&_1065.y);
-                  _1061.z = (_1062.z&&_1065.z);
-                  _1061.w = (_1062.w&&_1065.w);
-                _1054.x = (_1055.x||_1061.x);
-                _1054.y = (_1055.y||_1061.y);
-                _1054.z = (_1055.z||_1061.z);
-                _1054.w = (_1055.w||_1061.w);
-              int4 _1067;
-                int4 _1068 = make_int4(24, 24, 24, 24);
-                _1067.x = (_1029.x+_1068.x);
-                _1067.y = (_1029.y+_1068.y);
-                _1067.z = (_1029.z+_1068.z);
-                _1067.w = (_1029.w+_1068.w);
-              _1053.x = (bool(_1054.x)?_1029.x:_1067.x);
-              _1053.y = (bool(_1054.y)?_1029.y:_1067.y);
-              _1053.z = (bool(_1054.z)?_1029.z:_1067.z);
-              _1053.w = (bool(_1054.w)?_1029.w:_1067.w);
-              int4 _1069 = make_int4(3, 3, 3, 3);
-              _1028.x = (_1053.x*_1069.x);
-              _1028.y = (_1053.y*_1069.y);
-              _1028.z = (_1053.z*_1069.z);
-              _1028.w = (_1053.w*_1069.w);
-            _1026.x = (_1027.x+_1028.x);
-            _1026.y = (_1027.y+_1028.y);
-            _1026.z = (_1027.z+_1028.z);
-            _1026.w = (_1027.w+_1028.w);
-          int4 _1070;
-            int4 _1071 = make_int4(((((int)threadIdx.x) + 1024))+(1*0), ((((int)threadIdx.x) + 1024))+(1*1), ((((int)threadIdx.x) + 1024))+(1*2), ((((int)threadIdx.x) + 1024))+(1*3));
-            int4 _1072 = make_int4(3, 3, 3, 3);
-            _1070.x = (_1071.x%_1072.x);
-            _1070.y = (_1071.y%_1072.y);
-            _1070.z = (_1071.z%_1072.z);
-            _1070.w = (_1071.w%_1072.w);
-          int4 _1073;
-          ushort4 _1074;
-            ushort4 _1075;
-              ushort4 _1076;
-                int4 _1077 = make_int4(3, 3, 3, 3);
-                int4 _1078 = make_int4(0, 0, 0, 0);
-                _1076.x = (_1077.x>=_1078.x);
-                _1076.y = (_1077.y>=_1078.y);
-                _1076.z = (_1077.z>=_1078.z);
-                _1076.w = (_1077.w>=_1078.w);
-              ushort4 _1079;
-                int4 _1080 = make_int4(0, 0, 0, 0);
-                _1079.x = (_1070.x>=_1080.x);
-                _1079.y = (_1070.y>=_1080.y);
-                _1079.z = (_1070.z>=_1080.z);
-                _1079.w = (_1070.w>=_1080.w);
-              _1075.x = (_1076.x&&_1079.x);
-              _1075.y = (_1076.y&&_1079.y);
-              _1075.z = (_1076.z&&_1079.z);
-              _1075.w = (_1076.w&&_1079.w);
-            ushort4 _1081;
-              ushort4 _1082;
-                int4 _1083 = make_int4(3, 3, 3, 3);
-                int4 _1084 = make_int4(0, 0, 0, 0);
-                _1082.x = (_1083.x<_1084.x);
-                _1082.y = (_1083.y<_1084.y);
-                _1082.z = (_1083.z<_1084.z);
-                _1082.w = (_1083.w<_1084.w);
-              ushort4 _1085;
-                int4 _1086 = make_int4(0, 0, 0, 0);
-                _1085.x = (_1070.x<=_1086.x);
-                _1085.y = (_1070.y<=_1086.y);
-                _1085.z = (_1070.z<=_1086.z);
-                _1085.w = (_1070.w<=_1086.w);
-              _1081.x = (_1082.x&&_1085.x);
-              _1081.y = (_1082.y&&_1085.y);
-              _1081.z = (_1082.z&&_1085.z);
-              _1081.w = (_1082.w&&_1085.w);
-            _1074.x = (_1075.x||_1081.x);
-            _1074.y = (_1075.y||_1081.y);
-            _1074.z = (_1075.z||_1081.z);
-            _1074.w = (_1075.w||_1081.w);
-          int4 _1087;
-            int4 _1088 = make_int4(3, 3, 3, 3);
-            _1087.x = (_1070.x+_1088.x);
-            _1087.y = (_1070.y+_1088.y);
-            _1087.z = (_1070.z+_1088.z);
-            _1087.w = (_1070.w+_1088.w);
-          _1073.x = (bool(_1074.x)?_1070.x:_1087.x);
-          _1073.y = (bool(_1074.y)?_1070.y:_1087.y);
-          _1073.z = (bool(_1074.z)?_1070.z:_1087.z);
-          _1073.w = (bool(_1074.w)?_1070.w:_1087.w);
-          _1025.x = (_1026.x+_1073.x);
-          _1025.y = (_1026.y+_1073.y);
-          _1025.z = (_1026.z+_1073.z);
-          _1025.w = (_1026.w+_1073.w);
-        *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 4096)) = make_float4(kernel[_1025.x],kernel[_1025.y],kernel[_1025.z],kernel[_1025.w]);
-        int4 _1089;
-          int4 _1090;
-            int4 _1091 = make_int4(((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 4352) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 4352) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 4352) / 72) * 4608)) + (rc_outer_outer * 72)), ((((((int)blockIdx.x) / 7) * 294912) + ((((((int)threadIdx.x) * 4) + 4352) / 72) * 4608)) + (rc_outer_outer * 72)));
-            int4 _1092;
-              int4 _1093;
-                int4 _1094;
-                  int4 _1095 = make_int4((((((int)threadIdx.x) * 4) + 4352))+(1*0), (((((int)threadIdx.x) * 4) + 4352))+(1*1), (((((int)threadIdx.x) * 4) + 4352))+(1*2), (((((int)threadIdx.x) * 4) + 4352))+(1*3));
-                  int4 _1096 = make_int4(3, 3, 3, 3);
-                  _1094.x = (_1095.x%_1096.x);
-                  _1094.y = (_1095.y%_1096.y);
-                  _1094.z = (_1095.z%_1096.z);
-                  _1094.w = (_1095.w%_1096.w);
-                int4 _1097;
-                  int4 _1098 = make_int4((((((int)threadIdx.x) * 4) + 4352))+(1*0), (((((int)threadIdx.x) * 4) + 4352))+(1*1), (((((int)threadIdx.x) * 4) + 4352))+(1*2), (((((int)threadIdx.x) * 4) + 4352))+(1*3));
-                  int4 _1099 = make_int4(3, 3, 3, 3);
-                  _1097.x = (_1098.x/_1099.x);
-                  _1097.y = (_1098.y/_1099.y);
-                  _1097.z = (_1098.z/_1099.z);
-                  _1097.w = (_1098.w/_1099.w);
-                int4 _1100;
-                ushort4 _1101;
-                  ushort4 _1102;
-                    ushort4 _1103;
-                      int4 _1104 = make_int4(3, 3, 3, 3);
-                      int4 _1105 = make_int4(0, 0, 0, 0);
-                      _1103.x = (_1104.x>=_1105.x);
-                      _1103.y = (_1104.y>=_1105.y);
-                      _1103.z = (_1104.z>=_1105.z);
-                      _1103.w = (_1104.w>=_1105.w);
-                    ushort4 _1106;
-                      int4 _1107 = make_int4(0, 0, 0, 0);
-                      _1106.x = (_1094.x>=_1107.x);
-                      _1106.y = (_1094.y>=_1107.y);
-                      _1106.z = (_1094.z>=_1107.z);
-                      _1106.w = (_1094.w>=_1107.w);
-                    _1102.x = (_1103.x&&_1106.x);
-                    _1102.y = (_1103.y&&_1106.y);
-                    _1102.z = (_1103.z&&_1106.z);
-                    _1102.w = (_1103.w&&_1106.w);
-                  ushort4 _1108;
-                    ushort4 _1109;
-                      int4 _1110 = make_int4(3, 3, 3, 3);
-                      int4 _1111 = make_int4(0, 0, 0, 0);
-                      _1109.x = (_1110.x<_1111.x);
-                      _1109.y = (_1110.y<_1111.y);
-                      _1109.z = (_1110.z<_1111.z);
-                      _1109.w = (_1110.w<_1111.w);
-                    ushort4 _1112;
-                      int4 _1113 = make_int4(0, 0, 0, 0);
-                      _1112.x = (_1094.x<=_1113.x);
-                      _1112.y = (_1094.y<=_1113.y);
-                      _1112.z = (_1094.z<=_1113.z);
-                      _1112.w = (_1094.w<=_1113.w);
-                    _1108.x = (_1109.x&&_1112.x);
-                    _1108.y = (_1109.y&&_1112.y);
-                    _1108.z = (_1109.z&&_1112.z);
-                    _1108.w = (_1109.w&&_1112.w);
-                  _1101.x = (_1102.x||_1108.x);
-                  _1101.y = (_1102.y||_1108.y);
-                  _1101.z = (_1102.z||_1108.z);
-                  _1101.w = (_1102.w||_1108.w);
-                int4 _1114;
-                  int4 _1115 = make_int4(1, 1, 1, 1);
-                  _1114.x = (_1097.x-_1115.x);
-                  _1114.y = (_1097.y-_1115.y);
-                  _1114.z = (_1097.z-_1115.z);
-                  _1114.w = (_1097.w-_1115.w);
-                _1100.x = (bool(_1101.x)?_1097.x:_1114.x);
-                _1100.y = (bool(_1101.y)?_1097.y:_1114.y);
-                _1100.z = (bool(_1101.z)?_1097.z:_1114.z);
-                _1100.w = (bool(_1101.w)?_1097.w:_1114.w);
-                int4 _1116 = make_int4(24, 24, 24, 24);
-                _1093.x = (_1100.x%_1116.x);
-                _1093.y = (_1100.y%_1116.y);
-                _1093.z = (_1100.z%_1116.z);
-                _1093.w = (_1100.w%_1116.w);
-              int4 _1117;
-              ushort4 _1118;
-                ushort4 _1119;
-                  ushort4 _1120;
-                    int4 _1121 = make_int4(24, 24, 24, 24);
-                    int4 _1122 = make_int4(0, 0, 0, 0);
-                    _1120.x = (_1121.x>=_1122.x);
-                    _1120.y = (_1121.y>=_1122.y);
-                    _1120.z = (_1121.z>=_1122.z);
-                    _1120.w = (_1121.w>=_1122.w);
-                  ushort4 _1123;
-                    int4 _1124 = make_int4(0, 0, 0, 0);
-                    _1123.x = (_1093.x>=_1124.x);
-                    _1123.y = (_1093.y>=_1124.y);
-                    _1123.z = (_1093.z>=_1124.z);
-                    _1123.w = (_1093.w>=_1124.w);
-                  _1119.x = (_1120.x&&_1123.x);
-                  _1119.y = (_1120.y&&_1123.y);
-                  _1119.z = (_1120.z&&_1123.z);
-                  _1119.w = (_1120.w&&_1123.w);
-                ushort4 _1125;
-                  ushort4 _1126;
-                    int4 _1127 = make_int4(24, 24, 24, 24);
-                    int4 _1128 = make_int4(0, 0, 0, 0);
-                    _1126.x = (_1127.x<_1128.x);
-                    _1126.y = (_1127.y<_1128.y);
-                    _1126.z = (_1127.z<_1128.z);
-                    _1126.w = (_1127.w<_1128.w);
-                  ushort4 _1129;
-                    int4 _1130 = make_int4(0, 0, 0, 0);
-                    _1129.x = (_1093.x<=_1130.x);
-                    _1129.y = (_1093.y<=_1130.y);
-                    _1129.z = (_1093.z<=_1130.z);
-                    _1129.w = (_1093.w<=_1130.w);
-                  _1125.x = (_1126.x&&_1129.x);
-                  _1125.y = (_1126.y&&_1129.y);
-                  _1125.z = (_1126.z&&_1129.z);
-                  _1125.w = (_1126.w&&_1129.w);
-                _1118.x = (_1119.x||_1125.x);
-                _1118.y = (_1119.y||_1125.y);
-                _1118.z = (_1119.z||_1125.z);
-                _1118.w = (_1119.w||_1125.w);
-              int4 _1131;
-                int4 _1132 = make_int4(24, 24, 24, 24);
-                _1131.x = (_1093.x+_1132.x);
-                _1131.y = (_1093.y+_1132.y);
-                _1131.z = (_1093.z+_1132.z);
-                _1131.w = (_1093.w+_1132.w);
-              _1117.x = (bool(_1118.x)?_1093.x:_1131.x);
-              _1117.y = (bool(_1118.y)?_1093.y:_1131.y);
-              _1117.z = (bool(_1118.z)?_1093.z:_1131.z);
-              _1117.w = (bool(_1118.w)?_1093.w:_1131.w);
-              int4 _1133 = make_int4(3, 3, 3, 3);
-              _1092.x = (_1117.x*_1133.x);
-              _1092.y = (_1117.y*_1133.y);
-              _1092.z = (_1117.z*_1133.z);
-              _1092.w = (_1117.w*_1133.w);
-            _1090.x = (_1091.x+_1092.x);
-            _1090.y = (_1091.y+_1092.y);
-            _1090.z = (_1091.z+_1092.z);
-            _1090.w = (_1091.w+_1092.w);
-          int4 _1134;
-            int4 _1135 = make_int4(((((int)threadIdx.x) + 1088))+(1*0), ((((int)threadIdx.x) + 1088))+(1*1), ((((int)threadIdx.x) + 1088))+(1*2), ((((int)threadIdx.x) + 1088))+(1*3));
-            int4 _1136 = make_int4(3, 3, 3, 3);
-            _1134.x = (_1135.x%_1136.x);
-            _1134.y = (_1135.y%_1136.y);
-            _1134.z = (_1135.z%_1136.z);
-            _1134.w = (_1135.w%_1136.w);
-          int4 _1137;
-          ushort4 _1138;
-            ushort4 _1139;
-              ushort4 _1140;
-                int4 _1141 = make_int4(3, 3, 3, 3);
-                int4 _1142 = make_int4(0, 0, 0, 0);
-                _1140.x = (_1141.x>=_1142.x);
-                _1140.y = (_1141.y>=_1142.y);
-                _1140.z = (_1141.z>=_1142.z);
-                _1140.w = (_1141.w>=_1142.w);
-              ushort4 _1143;
-                int4 _1144 = make_int4(0, 0, 0, 0);
-                _1143.x = (_1134.x>=_1144.x);
-                _1143.y = (_1134.y>=_1144.y);
-                _1143.z = (_1134.z>=_1144.z);
-                _1143.w = (_1134.w>=_1144.w);
-              _1139.x = (_1140.x&&_1143.x);
-              _1139.y = (_1140.y&&_1143.y);
-              _1139.z = (_1140.z&&_1143.z);
-              _1139.w = (_1140.w&&_1143.w);
-            ushort4 _1145;
-              ushort4 _1146;
-                int4 _1147 = make_int4(3, 3, 3, 3);
-                int4 _1148 = make_int4(0, 0, 0, 0);
-                _1146.x = (_1147.x<_1148.x);
-                _1146.y = (_1147.y<_1148.y);
-                _1146.z = (_1147.z<_1148.z);
-                _1146.w = (_1147.w<_1148.w);
-              ushort4 _1149;
-                int4 _1150 = make_int4(0, 0, 0, 0);
-                _1149.x = (_1134.x<=_1150.x);
-                _1149.y = (_1134.y<=_1150.y);
-                _1149.z = (_1134.z<=_1150.z);
-                _1149.w = (_1134.w<=_1150.w);
-              _1145.x = (_1146.x&&_1149.x);
-              _1145.y = (_1146.y&&_1149.y);
-              _1145.z = (_1146.z&&_1149.z);
-              _1145.w = (_1146.w&&_1149.w);
-            _1138.x = (_1139.x||_1145.x);
-            _1138.y = (_1139.y||_1145.y);
-            _1138.z = (_1139.z||_1145.z);
-            _1138.w = (_1139.w||_1145.w);
-          int4 _1151;
-            int4 _1152 = make_int4(3, 3, 3, 3);
-            _1151.x = (_1134.x+_1152.x);
-            _1151.y = (_1134.y+_1152.y);
-            _1151.z = (_1134.z+_1152.z);
-            _1151.w = (_1134.w+_1152.w);
-          _1137.x = (bool(_1138.x)?_1134.x:_1151.x);
-          _1137.y = (bool(_1138.y)?_1134.y:_1151.y);
-          _1137.z = (bool(_1138.z)?_1134.z:_1151.z);
-          _1137.w = (bool(_1138.w)?_1134.w:_1151.w);
-          _1089.x = (_1090.x+_1137.x);
-          _1089.y = (_1090.y+_1137.y);
-          _1089.z = (_1090.z+_1137.z);
-          _1089.w = (_1090.w+_1137.w);
-        *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 4352)) = make_float4(kernel[_1089.x],kernel[_1089.y],kernel[_1089.z],kernel[_1089.w]);
-        __syncthreads();
-        for (int rc_inner = 0; rc_inner < 8; ++rc_inner) {
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(rc_inner * 27)] * kernel_shared[((((int)threadIdx.x) * 72) + (rc_inner * 9))]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_inner * 27) + 3)] * kernel_shared[((((int)threadIdx.x) * 72) + (rc_inner * 9))]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_inner * 27) + 6)] * kernel_shared[((((int)threadIdx.x) * 72) + (rc_inner * 9))]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_inner * 27) + 9)] * kernel_shared[((((int)threadIdx.x) * 72) + (rc_inner * 9))]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_inner * 27) + 12)] * kernel_shared[((((int)threadIdx.x) * 72) + (rc_inner * 9))]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_inner * 27) + 15)] * kernel_shared[((((int)threadIdx.x) * 72) + (rc_inner * 9))]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_inner * 27) + 18)] * kernel_shared[((((int)threadIdx.x) * 72) + (rc_inner * 9))]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_inner * 27) + 1)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 1)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_inner * 27) + 4)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 1)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_inner * 27) + 7)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 1)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_inner * 27) + 10)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 1)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_inner * 27) + 13)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 1)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_inner * 27) + 16)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 1)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_inner * 27) + 19)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 1)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_inner * 27) + 2)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 2)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_inner * 27) + 5)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 2)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_inner * 27) + 8)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 2)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_inner * 27) + 11)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 2)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_inner * 27) + 14)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 2)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_inner * 27) + 17)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 2)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_inner * 27) + 20)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 2)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_inner * 27) + 3)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 3)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_inner * 27) + 6)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 3)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_inner * 27) + 9)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 3)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_inner * 27) + 12)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 3)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_inner * 27) + 15)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 3)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_inner * 27) + 18)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 3)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_inner * 27) + 21)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 3)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_inner * 27) + 4)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 4)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_inner * 27) + 7)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 4)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_inner * 27) + 10)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 4)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_inner * 27) + 13)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 4)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_inner * 27) + 16)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 4)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_inner * 27) + 19)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 4)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_inner * 27) + 22)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 4)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_inner * 27) + 5)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 5)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_inner * 27) + 8)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 5)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_inner * 27) + 11)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 5)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_inner * 27) + 14)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 5)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_inner * 27) + 17)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 5)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_inner * 27) + 20)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 5)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_inner * 27) + 23)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 5)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_inner * 27) + 6)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 6)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_inner * 27) + 9)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 6)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_inner * 27) + 12)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 6)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_inner * 27) + 15)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 6)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_inner * 27) + 18)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 6)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_inner * 27) + 21)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 6)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_inner * 27) + 24)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 6)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_inner * 27) + 7)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 7)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_inner * 27) + 10)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 7)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_inner * 27) + 13)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 7)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_inner * 27) + 16)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 7)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_inner * 27) + 19)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 7)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_inner * 27) + 22)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 7)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_inner * 27) + 25)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 7)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_inner * 27) + 8)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 8)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_inner * 27) + 11)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 8)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_inner * 27) + 14)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 8)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_inner * 27) + 17)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 8)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_inner * 27) + 20)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 8)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_inner * 27) + 23)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 8)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_inner * 27) + 26)] * kernel_shared[(((((int)threadIdx.x) * 72) + (rc_inner * 9)) + 8)]));
+      conv2d_nchw[1] = 0.000000e+00f;
+      conv2d_nchw[3] = 0.000000e+00f;
+      conv2d_nchw[5] = 0.000000e+00f;
+      conv2d_nchw[7] = 0.000000e+00f;
+      for (int rc_outer_outer = 0; rc_outer_outer < 16; ++rc_outer_outer) {
+        for (int ry_outer_outer = 0; ry_outer_outer < 3; ++ry_outer_outer) {
+          __syncthreads();
+          pad_temp_shared[((int)threadIdx.x)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 49)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 41)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 98)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 90)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 147)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 139)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 196)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 188)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 245)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 237)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 294)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 286)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 343)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 335)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 392)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 384)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 441)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 433)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 490)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 482)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 539)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 531)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 588)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 580)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 637)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 629)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 686)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 678)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 735)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 727)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 784)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 776)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 833)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 825)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 882)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 874)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 931)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 923)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 980)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 972)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1029)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1021)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1078)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1070)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1127)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1119)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1176)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1168)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1225)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1217)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1274)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1266)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1323)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1315)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1372)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1364)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1421)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1413)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1470)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1462)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1519)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 7))) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1511)] : 0.000000e+00f);
+          kernel_shared[((int)threadIdx.x)] = kernel[(((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) >> 5) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) & 31) * 9)) + (ry_outer_outer * 3))];
+          kernel_shared[(((int)threadIdx.x) + 49)] = kernel[(((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 49) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 17) & 31) * 9)) + (ry_outer_outer * 3))];
+          kernel_shared[(((int)threadIdx.x) + 98)] = kernel[(((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 98) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 2) & 31) * 9)) + (ry_outer_outer * 3))];
+          kernel_shared[(((int)threadIdx.x) + 147)] = kernel[(((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 147) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 19) & 31) * 9)) + (ry_outer_outer * 3))];
+          kernel_shared[(((int)threadIdx.x) + 196)] = kernel[(((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 4) & 31) * 9)) + (ry_outer_outer * 3))];
+          if (((int)threadIdx.x) < 11) {
+            kernel_shared[(((int)threadIdx.x) + 245)] = kernel[(((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 245) >> 5) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) + 21) * 9)) + (ry_outer_outer * 3))];
+          }
+          __syncthreads();
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[0]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[64]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[128]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[192]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[32]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[96]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[160]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[224]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[1]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[65]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[129]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[193]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[33]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[97]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[161]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[225]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[2]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[66]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[130]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[194]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[34]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[98]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[162]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[226]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[3]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[67]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[131]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[195]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[35]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[99]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[163]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[227]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[4]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[68]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[132]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[196]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[36]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[100]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[164]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[228]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[5]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[69]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[133]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[197]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[37]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[101]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[165]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[229]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[6]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[70]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[134]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[198]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[38]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[102]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[166]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[230]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[7]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[71]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[135]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[199]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[39]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[103]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[167]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[231]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[8]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[72]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[136]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[200]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[40]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[104]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[168]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[232]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[9]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[73]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[137]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[201]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[41]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[105]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[169]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[233]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[10]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[74]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[138]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[202]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[42]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[106]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[170]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[234]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[11]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[75]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[139]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[203]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[43]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[107]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[171]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[235]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[12]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[76]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[140]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[204]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[44]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[108]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[172]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[236]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[13]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[77]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[141]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[205]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[45]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[109]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[173]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[237]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[14]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[78]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[142]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[206]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[46]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[110]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[174]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[238]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[15]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[79]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[143]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[207]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[47]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[111]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[175]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[239]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[16]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[80]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[144]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[208]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[48]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[112]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[176]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[240]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[17]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[81]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[145]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[209]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[49]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[113]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[177]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[241]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[18]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[82]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[146]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[210]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[50]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[114]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[178]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[242]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[19]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[83]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[147]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[211]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[51]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[115]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[179]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[243]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[20]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[84]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[148]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[212]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[52]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[116]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[180]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[244]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[21]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[85]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[149]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[213]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[53]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[117]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[181]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[245]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[22]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[86]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[150]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[214]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[54]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[118]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[182]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[246]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[23]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[87]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[151]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[215]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[55]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[119]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[183]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[247]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[24]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[88]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[152]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[216]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[56]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[120]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[184]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[248]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[25]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[89]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[153]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[217]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[57]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[121]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[185]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[249]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[26]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[90]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[154]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[218]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[58]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[122]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[186]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[250]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[27]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[91]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[155]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[219]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[59]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[123]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[187]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[251]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[28]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[92]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[156]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[220]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[60]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[124]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[188]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[252]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[29]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[93]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[157]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[221]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[61]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[125]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[189]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[253]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[30]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[94]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[158]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[222]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[62]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[126]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[190]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[254]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[31]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[95]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[159]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[223]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[63]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[127]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[191]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[255]));
+          __syncthreads();
+          pad_temp_shared[((int)threadIdx.x)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) - 7)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 49)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 42)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 98)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 91)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 147)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 140)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 196)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 189)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 245)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 238)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 294)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 287)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 343)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 336)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 392)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 385)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 441)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 434)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 490)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 483)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 539)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 532)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 588)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 581)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 637)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 630)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 686)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 679)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 735)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 728)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 784)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 777)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 833)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 826)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 882)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 875)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 931)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 924)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 980)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 973)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1029)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1022)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1078)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1071)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1127)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1120)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1176)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1169)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1225)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1218)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1274)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1267)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1323)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1316)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1372)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1365)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1421)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1414)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1470)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1463)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1519)] = (((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1512)] : 0.000000e+00f);
+          kernel_shared[((int)threadIdx.x)] = kernel[((((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) >> 5) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) & 31) * 9)) + (ry_outer_outer * 3)) + 1)];
+          kernel_shared[(((int)threadIdx.x) + 49)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 49) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 17) & 31) * 9)) + (ry_outer_outer * 3)) + 1)];
+          kernel_shared[(((int)threadIdx.x) + 98)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 98) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 2) & 31) * 9)) + (ry_outer_outer * 3)) + 1)];
+          kernel_shared[(((int)threadIdx.x) + 147)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 147) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 19) & 31) * 9)) + (ry_outer_outer * 3)) + 1)];
+          kernel_shared[(((int)threadIdx.x) + 196)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 4) & 31) * 9)) + (ry_outer_outer * 3)) + 1)];
+          if (((int)threadIdx.x) < 11) {
+            kernel_shared[(((int)threadIdx.x) + 245)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 245) >> 5) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) + 21) * 9)) + (ry_outer_outer * 3)) + 1)];
+          }
+          __syncthreads();
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[0]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[64]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[128]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[192]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[32]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[96]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[160]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[224]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[1]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[65]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[129]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[193]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[33]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[97]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[161]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[225]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[2]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[66]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[130]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[194]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[34]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[98]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[162]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[226]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[3]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[67]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[131]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[195]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[35]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[99]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[163]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[227]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[4]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[68]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[132]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[196]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[36]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[100]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[164]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[228]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[5]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[69]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[133]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[197]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[37]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[101]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[165]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[229]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[6]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[70]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[134]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[198]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[38]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[102]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[166]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[230]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[7]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[71]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[135]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[199]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[39]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[103]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[167]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[231]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[8]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[72]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[136]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[200]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[40]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[104]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[168]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[232]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[9]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[73]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[137]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[201]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[41]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[105]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[169]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[233]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[10]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[74]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[138]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[202]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[42]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[106]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[170]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[234]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[11]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[75]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[139]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[203]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[43]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[107]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[171]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[235]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[12]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[76]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[140]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[204]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[44]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[108]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[172]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[236]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[13]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[77]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[141]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[205]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[45]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[109]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[173]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[237]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[14]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[78]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[142]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[206]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[46]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[110]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[174]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[238]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[15]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[79]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[143]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[207]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[47]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[111]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[175]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[239]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[16]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[80]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[144]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[208]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[48]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[112]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[176]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[240]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[17]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[81]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[145]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[209]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[49]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[113]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[177]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[241]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[18]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[82]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[146]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[210]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[50]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[114]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[178]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[242]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[19]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[83]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[147]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[211]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[51]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[115]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[179]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[243]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[20]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[84]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[148]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[212]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[52]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[116]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[180]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[244]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[21]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[85]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[149]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[213]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[53]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[117]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[181]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[245]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[22]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[86]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[150]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[214]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[54]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[118]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[182]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[246]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[23]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[87]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[151]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[215]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[55]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[119]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[183]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[247]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[24]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[88]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[152]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[216]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[56]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[120]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[184]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[248]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[25]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[89]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[153]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[217]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[57]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[121]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[185]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[249]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[26]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[90]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[154]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[218]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[58]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[122]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[186]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[250]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[27]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[91]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[155]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[219]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[59]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[123]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[187]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[251]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[28]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[92]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[156]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[220]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[60]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[124]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[188]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[252]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[29]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[93]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[157]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[221]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[61]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[125]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[189]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[253]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[30]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[94]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[158]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[222]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[62]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[126]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[190]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[254]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[31]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[95]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[159]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[223]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[63]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[127]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[191]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[255]));
+          __syncthreads();
+          pad_temp_shared[((int)threadIdx.x)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) - 6)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 49)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 43)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 98)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 92)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 147)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 141)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 196)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 190)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 245)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 239)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 294)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 288)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 343)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 337)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 392)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 386)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 441)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 435)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 490)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 484)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 539)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 533)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 588)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 582)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 637)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 631)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 686)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 680)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 735)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 729)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 784)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 778)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 833)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 827)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 882)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 876)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 931)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 925)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 980)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 974)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1029)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1023)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1078)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1072)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1127)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1121)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1176)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1170)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1225)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1219)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1274)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1268)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1323)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1317)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1372)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1366)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1421)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1415)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1470)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1464)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1519)] = ((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && ((((int)threadIdx.x) % 7) < 6)) ? data[((((rc_outer_outer * 1568) + (ry_outer_outer * 7)) + ((int)threadIdx.x)) + 1513)] : 0.000000e+00f);
+          kernel_shared[((int)threadIdx.x)] = kernel[((((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) >> 5) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) & 31) * 9)) + (ry_outer_outer * 3)) + 2)];
+          kernel_shared[(((int)threadIdx.x) + 49)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 49) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 17) & 31) * 9)) + (ry_outer_outer * 3)) + 2)];
+          kernel_shared[(((int)threadIdx.x) + 98)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 98) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 2) & 31) * 9)) + (ry_outer_outer * 3)) + 2)];
+          kernel_shared[(((int)threadIdx.x) + 147)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 147) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 19) & 31) * 9)) + (ry_outer_outer * 3)) + 2)];
+          kernel_shared[(((int)threadIdx.x) + 196)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 4) & 31) * 9)) + (ry_outer_outer * 3)) + 2)];
+          if (((int)threadIdx.x) < 11) {
+            kernel_shared[(((int)threadIdx.x) + 245)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 245) >> 5) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) + 21) * 9)) + (ry_outer_outer * 3)) + 2)];
+          }
+          __syncthreads();
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[0]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[64]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[128]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[192]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[32]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[96]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[160]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((int)threadIdx.x)] * kernel_shared[224]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[1]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[65]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[129]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[193]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[33]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[97]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[161]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 49)] * kernel_shared[225]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[2]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[66]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[130]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[194]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[34]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[98]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[162]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 98)] * kernel_shared[226]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[3]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[67]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[131]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[195]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[35]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[99]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[163]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 147)] * kernel_shared[227]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[4]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[68]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[132]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[196]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[36]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[100]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[164]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 196)] * kernel_shared[228]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[5]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[69]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[133]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[197]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[37]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[101]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[165]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 245)] * kernel_shared[229]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[6]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[70]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[134]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[198]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[38]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[102]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[166]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 294)] * kernel_shared[230]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[7]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[71]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[135]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[199]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[39]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[103]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[167]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 343)] * kernel_shared[231]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[8]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[72]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[136]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[200]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[40]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[104]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[168]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 392)] * kernel_shared[232]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[9]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[73]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[137]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[201]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[41]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[105]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[169]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 441)] * kernel_shared[233]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[10]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[74]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[138]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[202]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[42]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[106]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[170]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 490)] * kernel_shared[234]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[11]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[75]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[139]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[203]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[43]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[107]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[171]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 539)] * kernel_shared[235]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[12]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[76]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[140]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[204]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[44]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[108]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[172]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 588)] * kernel_shared[236]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[13]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[77]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[141]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[205]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[45]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[109]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[173]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 637)] * kernel_shared[237]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[14]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[78]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[142]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[206]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[46]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[110]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[174]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 686)] * kernel_shared[238]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[15]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[79]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[143]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[207]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[47]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[111]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[175]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 735)] * kernel_shared[239]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[16]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[80]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[144]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[208]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[48]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[112]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[176]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 784)] * kernel_shared[240]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[17]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[81]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[145]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[209]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[49]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[113]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[177]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 833)] * kernel_shared[241]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[18]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[82]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[146]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[210]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[50]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[114]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[178]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 882)] * kernel_shared[242]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[19]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[83]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[147]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[211]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[51]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[115]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[179]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 931)] * kernel_shared[243]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[20]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[84]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[148]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[212]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[52]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[116]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[180]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 980)] * kernel_shared[244]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[21]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[85]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[149]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[213]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[53]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[117]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[181]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1029)] * kernel_shared[245]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[22]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[86]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[150]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[214]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[54]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[118]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[182]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1078)] * kernel_shared[246]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[23]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[87]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[151]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[215]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[55]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[119]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[183]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1127)] * kernel_shared[247]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[24]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[88]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[152]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[216]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[56]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[120]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[184]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1176)] * kernel_shared[248]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[25]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[89]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[153]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[217]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[57]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[121]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[185]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1225)] * kernel_shared[249]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[26]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[90]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[154]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[218]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[58]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[122]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[186]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1274)] * kernel_shared[250]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[27]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[91]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[155]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[219]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[59]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[123]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[187]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1323)] * kernel_shared[251]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[28]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[92]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[156]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[220]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[60]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[124]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[188]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1372)] * kernel_shared[252]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[29]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[93]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[157]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[221]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[61]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[125]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[189]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1421)] * kernel_shared[253]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[30]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[94]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[158]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[222]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[62]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[126]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[190]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1470)] * kernel_shared[254]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[31]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[95]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[159]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[223]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[63]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[127]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[191]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) + 1519)] * kernel_shared[255]));
         }
       }
-      compute[((((((int)blockIdx.x) / 7) * 3136) + (((int)threadIdx.x) * 49)) + (((int)blockIdx.x) % 7))] = max((conv2d_nchw[0] + bias[(((((int)blockIdx.x) / 7) * 64) + ((int)threadIdx.x))]), 0.000000e+00f);
-      compute[(((((((int)blockIdx.x) / 7) * 3136) + (((int)threadIdx.x) * 49)) + (((int)blockIdx.x) % 7)) + 7)] = max((conv2d_nchw[1] + bias[(((((int)blockIdx.x) / 7) * 64) + ((int)threadIdx.x))]), 0.000000e+00f);
-      compute[(((((((int)blockIdx.x) / 7) * 3136) + (((int)threadIdx.x) * 49)) + (((int)blockIdx.x) % 7)) + 14)] = max((conv2d_nchw[2] + bias[(((((int)blockIdx.x) / 7) * 64) + ((int)threadIdx.x))]), 0.000000e+00f);
-      compute[(((((((int)blockIdx.x) / 7) * 3136) + (((int)threadIdx.x) * 49)) + (((int)blockIdx.x) % 7)) + 21)] = max((conv2d_nchw[3] + bias[(((((int)blockIdx.x) / 7) * 64) + ((int)threadIdx.x))]), 0.000000e+00f);
-      compute[(((((((int)blockIdx.x) / 7) * 3136) + (((int)threadIdx.x) * 49)) + (((int)blockIdx.x) % 7)) + 28)] = max((conv2d_nchw[4] + bias[(((((int)blockIdx.x) / 7) * 64) + ((int)threadIdx.x))]), 0.000000e+00f);
-      compute[(((((((int)blockIdx.x) / 7) * 3136) + (((int)threadIdx.x) * 49)) + (((int)blockIdx.x) % 7)) + 35)] = max((conv2d_nchw[5] + bias[(((((int)blockIdx.x) / 7) * 64) + ((int)threadIdx.x))]), 0.000000e+00f);
-      compute[(((((((int)blockIdx.x) / 7) * 3136) + (((int)threadIdx.x) * 49)) + (((int)blockIdx.x) % 7)) + 42)] = max((conv2d_nchw[6] + bias[(((((int)blockIdx.x) / 7) * 64) + ((int)threadIdx.x))]), 0.000000e+00f);
+      for (int i1_inner = 0; i1_inner < 2; ++i1_inner) {
+        compute[(((((int)blockIdx.x) * 392) + (i1_inner * 49)) + ((int)threadIdx.x))] = max((conv2d_nchw[i1_inner] + bias[((((int)blockIdx.x) * 8) + i1_inner)]), 0.000000e+00f);
+        compute[((((((int)blockIdx.x) * 392) + (i1_inner * 49)) + ((int)threadIdx.x)) + 98)] = max((conv2d_nchw[(i1_inner + 2)] + bias[(((((int)blockIdx.x) * 8) + i1_inner) + 2)]), 0.000000e+00f);
+        compute[((((((int)blockIdx.x) * 392) + (i1_inner * 49)) + ((int)threadIdx.x)) + 196)] = max((conv2d_nchw[(i1_inner + 4)] + bias[(((((int)blockIdx.x) * 8) + i1_inner) + 4)]), 0.000000e+00f);
+        compute[((((((int)blockIdx.x) * 392) + (i1_inner * 49)) + ((int)threadIdx.x)) + 294)] = max((conv2d_nchw[(i1_inner + 6)] + bias[(((((int)blockIdx.x) * 8) + i1_inner) + 6)]), 0.000000e+00f);
+      }
     }
 
 
@@ -4319,7 +2393,7 @@ In the example below we resume the status and do more 5 trials.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 2 minutes  26.752 seconds)
+   **Total running time of the script:** ( 2 minutes  27.893 seconds)
 
 
 .. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
index c045b0e81..824540861 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
@@ -616,7 +616,7 @@ so we can read the log file and load the best schedules.
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      10.1464      10.1449      10.1931      10.1011       0.0376   
+       9.7250       9.7145       9.7479       9.7125       0.0162   
                
 
 
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
index 37f9e150e..1351d795b 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
@@ -635,7 +635,7 @@ so we can read the log file and load the best schedules.
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      759.1797     759.4534     759.6565     758.4292      0.5371   
+      745.6837     744.9122     747.7676     744.3712      1.4900   
                
 
 
@@ -660,7 +660,7 @@ Other Tips
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  19.468 seconds)
+   **Total running time of the script:** ( 1 minutes  18.366 seconds)
 
 
 .. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_x86.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
index a60106e54..17087c050 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
@@ -362,72 +362,121 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
                  placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [65536], []),
                  compute: Buffer(compute_2: Pointer(float32), float32, [65536], [])}
       buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute}
-      preflattened_buffer_map = {placeholder_6: placeholder_15: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_8: placeholder_16: Buffer(placeholder_13, int32, [33], []), placeholder_9: placeholder_17: Buffer(placeholder_14, float32, [128, 512], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_5: placeholder_18: Buffer(placeholder_10, float32, [128, 256], []), placeholder_7: placeholder_19: Buffer(placeholder_12, int32, [4916], [])} {
-      for (i0.outer.i1.outer.fused: int32, 0, 512) "parallel" {
-        allocate(compute_4: Pointer(global float32), float32, [128]), storage_scope = global {
-          for (i.outer.inner: int32, 0, 8) {
-            let cse_var_2: int32 = floordiv(i0.outer.i1.outer.fused, 16)
-            let cse_var_1: int32 = (i.outer.inner*16)
-             {
-              compute_5: Buffer(compute_4, float32, [128], [])[cse_var_1] = 0f32
-              compute_5[(cse_var_1 + 1)] = 0f32
-              compute_5[(cse_var_1 + 2)] = 0f32
-              compute_5[(cse_var_1 + 3)] = 0f32
-              compute_5[(cse_var_1 + 4)] = 0f32
-              compute_5[(cse_var_1 + 5)] = 0f32
-              compute_5[(cse_var_1 + 6)] = 0f32
-              compute_5[(cse_var_1 + 7)] = 0f32
-              compute_5[(cse_var_1 + 8)] = 0f32
-              compute_5[(cse_var_1 + 9)] = 0f32
-              compute_5[(cse_var_1 + 10)] = 0f32
-              compute_5[(cse_var_1 + 11)] = 0f32
-              compute_5[(cse_var_1 + 12)] = 0f32
-              compute_5[(cse_var_1 + 13)] = 0f32
-              compute_5[(cse_var_1 + 14)] = 0f32
-              compute_5[(cse_var_1 + 15)] = 0f32
-              for (elem_idx: int32, 0, (placeholder_3[(cse_var_2 + 1)] - placeholder_3[cse_var_2])) {
-                let cse_var_20: int32 = (cse_var_1 + 10)
-                let cse_var_19: int32 = (cse_var_1 + 11)
-                let cse_var_18: int32 = (cse_var_1 + 12)
-                let cse_var_17: int32 = (cse_var_1 + 13)
-                let cse_var_16: int32 = (cse_var_1 + 14)
-                let cse_var_15: int32 = (cse_var_1 + 15)
-                let cse_var_14: int32 = (cse_var_1 + 2)
-                let cse_var_13: int32 = (cse_var_1 + 3)
-                let cse_var_12: int32 = (cse_var_1 + 1)
-                let cse_var_11: int32 = (cse_var_1 + 5)
-                let cse_var_10: int32 = (cse_var_1 + 6)
-                let cse_var_9: int32 = (cse_var_1 + 7)
-                let cse_var_8: int32 = (cse_var_1 + 8)
-                let cse_var_7: int32 = (cse_var_1 + 9)
-                let cse_var_6: int32 = (elem_idx*16)
-                let cse_var_5: int32 = (i.outer.inner*4096)
-                let cse_var_4: int32 = floormod(i0.outer.i1.outer.fused, 16)
-                let cse_var_3: int32 = (cse_var_1 + 4)
-                 {
-                  compute_5[cse_var_1] = (compute_5[cse_var_1] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_6) + cse_var_4)]*max(placeholder[(cse_var_5 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_6) + cse_var_4)]*max(placeholder[((cse_var_5 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 256)], 0f32)))
-                  compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_6) + cse_var_4)]*max(placeholder[((cse_var_5 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 512)], 0f32)))
-                  compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_6) + cse_var_4)]*max(placeholder[((cse_var_5 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 768)], 0f32)))
-                  compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_6) + cse_var_4)]*max(placeholder[((cse_var_5 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 1024)], 0f32)))
-                  compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_6) + cse_var_4)]*max(placeholder[((cse_var_5 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 1280)], 0f32)))
-                  compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_6) + cse_var_4)]*max(placeholder[((cse_var_5 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 1536)], 0f32)))
-                  compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_6) + cse_var_4)]*max(placeholder[((cse_var_5 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 1792)], 0f32)))
-                  compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_6) + cse_var_4)]*max(placeholder[((cse_var_5 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 2048)], 0f32)))
-                  compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_6) + cse_var_4)]*max(placeholder[((cse_var_5 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 2304)], 0f32)))
-                  compute_5[cse_var_20] = (compute_5[cse_var_20] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_6) + cse_var_4)]*max(placeholder[((cse_var_5 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 2560)], 0f32)))
-                  compute_5[cse_var_19] = (compute_5[cse_var_19] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_6) + cse_var_4)]*max(placeholder[((cse_var_5 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 2816)], 0f32)))
-                  compute_5[cse_var_18] = (compute_5[cse_var_18] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_6) + cse_var_4)]*max(placeholder[((cse_var_5 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 3072)], 0f32)))
-                  compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_6) + cse_var_4)]*max(placeholder[((cse_var_5 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 3328)], 0f32)))
-                  compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_6) + cse_var_4)]*max(placeholder[((cse_var_5 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 3584)], 0f32)))
-                  compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_6) + cse_var_4)]*max(placeholder[((cse_var_5 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 3840)], 0f32)))
+      preflattened_buffer_map = {placeholder_5: placeholder_15: Buffer(placeholder_10, float32, [128, 256], []), placeholder_7: placeholder_16: Buffer(placeholder_12, int32, [4916], []), placeholder_9: placeholder_17: Buffer(placeholder_14, float32, [128, 512], []), placeholder_6: placeholder_18: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_8: placeholder_19: Buffer(placeholder_13, int32, [33], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], [])} {
+      for (i0.outer.i1.outer.fused: int32, 0, 16) "parallel" {
+        allocate(compute_4: Pointer(global float32), float32, [4096]), storage_scope = global {
+          for (i.outer.inner: int32, 0, 64) {
+            for (nb_j.inner: int32, 0, 2) {
+              let cse_var_2: int32 = ((i0.outer.i1.outer.fused*2) + nb_j.inner)
+              let cse_var_1: int32 = ((i.outer.inner*64) + (nb_j.inner*16))
+               {
+                compute_5: Buffer(compute_4, float32, [4096], [])[cse_var_1] = 0f32
+                compute_5[(cse_var_1 + 1)] = 0f32
+                compute_5[(cse_var_1 + 2)] = 0f32
+                compute_5[(cse_var_1 + 3)] = 0f32
+                compute_5[(cse_var_1 + 4)] = 0f32
+                compute_5[(cse_var_1 + 5)] = 0f32
+                compute_5[(cse_var_1 + 6)] = 0f32
+                compute_5[(cse_var_1 + 7)] = 0f32
+                compute_5[(cse_var_1 + 8)] = 0f32
+                compute_5[(cse_var_1 + 9)] = 0f32
+                compute_5[(cse_var_1 + 10)] = 0f32
+                compute_5[(cse_var_1 + 11)] = 0f32
+                compute_5[(cse_var_1 + 12)] = 0f32
+                compute_5[(cse_var_1 + 13)] = 0f32
+                compute_5[(cse_var_1 + 14)] = 0f32
+                compute_5[(cse_var_1 + 15)] = 0f32
+                compute_5[(cse_var_1 + 32)] = 0f32
+                compute_5[(cse_var_1 + 33)] = 0f32
+                compute_5[(cse_var_1 + 34)] = 0f32
+                compute_5[(cse_var_1 + 35)] = 0f32
+                compute_5[(cse_var_1 + 36)] = 0f32
+                compute_5[(cse_var_1 + 37)] = 0f32
+                compute_5[(cse_var_1 + 38)] = 0f32
+                compute_5[(cse_var_1 + 39)] = 0f32
+                compute_5[(cse_var_1 + 40)] = 0f32
+                compute_5[(cse_var_1 + 41)] = 0f32
+                compute_5[(cse_var_1 + 42)] = 0f32
+                compute_5[(cse_var_1 + 43)] = 0f32
+                compute_5[(cse_var_1 + 44)] = 0f32
+                compute_5[(cse_var_1 + 45)] = 0f32
+                compute_5[(cse_var_1 + 46)] = 0f32
+                compute_5[(cse_var_1 + 47)] = 0f32
+                for (elem_idx: int32, 0, (placeholder_3[(cse_var_2 + 1)] - placeholder_3[cse_var_2])) {
+                  let cse_var_35: int32 = (cse_var_1 + 10)
+                  let cse_var_34: int32 = (cse_var_1 + 11)
+                  let cse_var_33: int32 = (cse_var_1 + 12)
+                  let cse_var_32: int32 = (cse_var_1 + 13)
+                  let cse_var_31: int32 = (cse_var_1 + 14)
+                  let cse_var_30: int32 = (cse_var_1 + 15)
+                  let cse_var_29: int32 = (cse_var_1 + 2)
+                  let cse_var_28: int32 = (cse_var_1 + 3)
+                  let cse_var_27: int32 = (cse_var_1 + 32)
+                  let cse_var_26: int32 = (cse_var_1 + 33)
+                  let cse_var_25: int32 = (cse_var_1 + 34)
+                  let cse_var_24: int32 = (cse_var_1 + 35)
+                  let cse_var_23: int32 = (cse_var_1 + 36)
+                  let cse_var_22: int32 = (cse_var_1 + 37)
+                  let cse_var_21: int32 = (cse_var_1 + 38)
+                  let cse_var_20: int32 = (cse_var_1 + 1)
+                  let cse_var_19: int32 = (i.outer.inner*512)
+                  let cse_var_18: int32 = (elem_idx*16)
+                  let cse_var_17: int32 = (cse_var_1 + 9)
+                  let cse_var_16: int32 = (cse_var_1 + 8)
+                  let cse_var_15: int32 = (cse_var_1 + 7)
+                  let cse_var_14: int32 = (cse_var_1 + 6)
+                  let cse_var_13: int32 = (cse_var_1 + 5)
+                  let cse_var_12: int32 = (cse_var_1 + 39)
+                  let cse_var_11: int32 = (cse_var_1 + 46)
+                  let cse_var_10: int32 = (cse_var_1 + 45)
+                  let cse_var_9: int32 = (cse_var_1 + 44)
+                  let cse_var_8: int32 = (cse_var_1 + 43)
+                  let cse_var_7: int32 = (cse_var_1 + 42)
+                  let cse_var_6: int32 = (cse_var_1 + 41)
+                  let cse_var_5: int32 = (cse_var_1 + 40)
+                  let cse_var_4: int32 = (cse_var_1 + 4)
+                  let cse_var_3: int32 = (cse_var_1 + 47)
+                   {
+                    compute_5[cse_var_1] = (compute_5[cse_var_1] + (placeholder_1[((placeholder_3[cse_var_2]*16) + cse_var_18)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)])], 0f32)))
+                    compute_5[cse_var_20] = (compute_5[cse_var_20] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 1)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)])], 0f32)))
+                    compute_5[cse_var_29] = (compute_5[cse_var_29] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 2)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)])], 0f32)))
+                    compute_5[cse_var_28] = (compute_5[cse_var_28] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 3)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)])], 0f32)))
+                    compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 4)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)])], 0f32)))
+                    compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 5)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)])], 0f32)))
+                    compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 6)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)])], 0f32)))
+                    compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 7)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)])], 0f32)))
+                    compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 8)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)])], 0f32)))
+                    compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 9)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)])], 0f32)))
+                    compute_5[cse_var_35] = (compute_5[cse_var_35] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 10)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)])], 0f32)))
+                    compute_5[cse_var_34] = (compute_5[cse_var_34] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 11)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)])], 0f32)))
+                    compute_5[cse_var_33] = (compute_5[cse_var_33] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 12)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)])], 0f32)))
+                    compute_5[cse_var_32] = (compute_5[cse_var_32] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 13)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)])], 0f32)))
+                    compute_5[cse_var_31] = (compute_5[cse_var_31] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 14)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)])], 0f32)))
+                    compute_5[cse_var_30] = (compute_5[cse_var_30] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 15)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)])], 0f32)))
+                    compute_5[cse_var_27] = (compute_5[cse_var_27] + (placeholder_1[((placeholder_3[cse_var_2]*16) + cse_var_18)]*max(placeholder[((cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 256)], 0f32)))
+                    compute_5[cse_var_26] = (compute_5[cse_var_26] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 1)]*max(placeholder[((cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 256)], 0f32)))
+                    compute_5[cse_var_25] = (compute_5[cse_var_25] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 2)]*max(placeholder[((cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 256)], 0f32)))
+                    compute_5[cse_var_24] = (compute_5[cse_var_24] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 3)]*max(placeholder[((cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 256)], 0f32)))
+                    compute_5[cse_var_23] = (compute_5[cse_var_23] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 4)]*max(placeholder[((cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 256)], 0f32)))
+                    compute_5[cse_var_22] = (compute_5[cse_var_22] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 5)]*max(placeholder[((cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 256)], 0f32)))
+                    compute_5[cse_var_21] = (compute_5[cse_var_21] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 6)]*max(placeholder[((cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 256)], 0f32)))
+                    compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 7)]*max(placeholder[((cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 256)], 0f32)))
+                    compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 8)]*max(placeholder[((cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 256)], 0f32)))
+                    compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 9)]*max(placeholder[((cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 256)], 0f32)))
+                    compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 10)]*max(placeholder[((cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 256)], 0f32)))
+                    compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 11)]*max(placeholder[((cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 256)], 0f32)))
+                    compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 12)]*max(placeholder[((cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 256)], 0f32)))
+                    compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 13)]*max(placeholder[((cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 256)], 0f32)))
+                    compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 14)]*max(placeholder[((cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 256)], 0f32)))
+                    compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[cse_var_2]*16) + cse_var_18) + 15)]*max(placeholder[((cse_var_19 + placeholder_2[(placeholder_3[cse_var_2] + elem_idx)]) + 256)], 0f32)))
+                  }
                 }
               }
             }
           }
           for (i0.inner: int32, 0, 128) {
-            let cse_var_21: int32 = ((i0.inner*512) + i0.outer.i1.outer.fused)
-            compute[cse_var_21] = max((compute_5[i0.inner] + placeholder_4[cse_var_21]), 0f32)
+            let cse_var_36: int32 = ((i0.inner*512) + (i0.outer.i1.outer.fused*32))
+            compute[ramp(cse_var_36, 1, 32)] = max((compute_5[ramp((i0.inner*32), 1, 32)] + placeholder_4[ramp(cse_var_36, 1, 32)]), broadcast(0f32, 32))
           }
         }
       }
@@ -481,7 +530,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 3.719 ms
+    Execution time of this operator: 3.329 ms
 
 
 
diff --git a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
index 410fb15db..05635a7c2 100644
--- a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:43.894** total execution time for **how_to_tune_with_autotvm** files:
+**00:43.948** total execution time for **how_to_tune_with_autotvm** files:
 
-- **00:43.001**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``)
-- **00:00.232**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``)
-- **00:00.227**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_mobile_gpu.py` (``tune_relay_mobile_gpu.py``)
-- **00:00.219**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``)
-- **00:00.215**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_arm.py` (``tune_relay_arm.py``)
+- **00:43.073**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``)
+- **00:00.228**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``)
+- **00:00.217**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_mobile_gpu.py` (``tune_relay_mobile_gpu.py``)
+- **00:00.217**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``)
+- **00:00.213**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_arm.py` (``tune_relay_arm.py``)
diff --git a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
index 886830b76..acafb1020 100644
--- a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
@@ -859,8 +859,8 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 4, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2885496
-    No: 6   GFLOPS: 95.49/95.49     result: MeasureResult(costs=(0.0024242672500000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.6259732246398926, timestamp=1654117245.5954592)      [('tile_f', [-1, 1, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3754080
-    No: 7   GFLOPS: 0.00/95.49      result: Traceback (most recent call last):
+    No: 6   GFLOPS: 103.46/103.46   result: MeasureResult(costs=(0.002237678166666667,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.6022238731384277, timestamp=1654117543.9835403)       [('tile_f', [-1, 1, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3754080
+    No: 7   GFLOPS: 0.00/103.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -983,7 +983,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 16, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 256, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6225319
-    No: 8   GFLOPS: 0.00/95.49      result: Traceback (most recent call last):
+    No: 8   GFLOPS: 0.00/103.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -1106,7 +1106,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 1, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 64]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,943546
-    No: 9   GFLOPS: 0.00/95.49      result: Traceback (most recent call last):
+    No: 9   GFLOPS: 0.00/103.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -1229,7 +1229,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 16, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2868708
-    No: 10  GFLOPS: 0.00/95.49      result: Traceback (most recent call last):
+    No: 10  GFLOPS: 0.00/103.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 142, in build
         res = future.result()
       File "/usr/lib/python3.7/concurrent/futures/_base.py", line 435, in result
@@ -1247,7 +1247,7 @@ for this template
     TimeoutError
 
             [('tile_f', [-1, 32, 2, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4691833
-    No: 11  GFLOPS: 0.00/95.49      result: Traceback (most recent call last):
+    No: 11  GFLOPS: 0.00/103.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -1370,7 +1370,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 2, 64]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1042124
-    No: 12  GFLOPS: 0.00/95.49      result: Traceback (most recent call last):
+    No: 12  GFLOPS: 0.00/103.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -1493,7 +1493,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 32, 1, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10013405
-    No: 13  GFLOPS: 0.00/95.49      result: Traceback (most recent call last):
+    No: 13  GFLOPS: 0.00/103.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -1616,7 +1616,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 8, 8, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6732082
-    No: 14  GFLOPS: 0.00/95.49      result: Traceback (most recent call last):
+    No: 14  GFLOPS: 0.00/103.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -1739,7 +1739,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 4, 32]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7536735
-    No: 15  GFLOPS: 0.00/95.49      result: Traceback (most recent call last):
+    No: 15  GFLOPS: 0.00/103.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -1862,7 +1862,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 1, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 128, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,482121
-    No: 16  GFLOPS: 0.00/95.49      result: Traceback (most recent call last):
+    No: 16  GFLOPS: 0.00/103.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -1985,7 +1985,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 1, 16]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 32, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2824525
-    No: 17  GFLOPS: 0.00/95.49      result: Traceback (most recent call last):
+    No: 17  GFLOPS: 0.00/103.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -2108,7 +2108,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 64, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4559286
-    No: 18  GFLOPS: 0.00/95.49      result: Traceback (most recent call last):
+    No: 18  GFLOPS: 0.00/103.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -2231,7 +2231,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 32, 16]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 512]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9677544
-    No: 19  GFLOPS: 0.00/95.49      result: Traceback (most recent call last):
+    No: 19  GFLOPS: 0.00/103.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 721, in __call__
         yield remote, remote.load_module(os.path.split(build_result.filename)[1])
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 685, in run_through_rpc
@@ -2319,7 +2319,7 @@ for this template
       15: _PyEval_EvalFrameDefault
       14: 0x0000000000537c30
       13: _PyObject_FastCallKeywords
-      12: 0x00007f3309e61fa2
+      12: 0x00007fd366fd0fa2
       11: _ctypes_callproc
       10: ffi_call
       9: ffi_call_unix64
@@ -2384,7 +2384,7 @@ for this template
       21: _PyFunction_FastCallKeywords
       20: _PyEval_EvalFrameDefault
       19: _PyFunction_FastCall      [('tile_f', [-1, 8, 2, 16]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6390073
-    No: 20  GFLOPS: 142.99/142.99   result: MeasureResult(costs=(0.001619031258064516,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.184067964553833, timestamp=1654117271.7792768)        [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
+    No: 20  GFLOPS: 143.90/143.90   result: MeasureResult(costs=(0.00160871697,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4165451526641846, timestamp=1654117570.4327035)      [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
 
 
 
@@ -2437,7 +2437,7 @@ and measure running time.
 
     Best config:
     [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
-    Time cost of this operator: 0.001998
+    Time cost of this operator: 0.001953
 
 
 
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
index dc49e5a67..273684d21 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
@@ -294,10 +294,10 @@ Timing the untuned program
     ########## Build without Autotuning ##########
     Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  
     ---------                                     ---                                           --------  -------  -----              ------  -------  
-    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  313.9     98.732   (1, 2, 10, 10, 3)  2       1        
-    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.128     0.984    (1, 6, 10, 10)     1       1        
-    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.902     0.284    (1, 1, 10, 10, 3)  1       1        
-    Total_time                                    -                                             317.93    -        -                  -       -        
+    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  310.3     98.738   (1, 2, 10, 10, 3)  2       1        
+    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.064     0.975    (1, 6, 10, 10)     1       1        
+    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.901     0.287    (1, 1, 10, 10, 3)  1       1        
+    Total_time                                    -                                             314.265   -        -                  -       -        
 
 
 
@@ -359,10 +359,10 @@ Timing the tuned program
     ########## Build with Autotuning ##########
     Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  
     ---------                                     ---                                           --------  -------  -----              ------  -------  
-    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  79.6      96.743   (1, 6, 10, 10, 1)  2       1        
-    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.779     2.162    (1, 6, 10, 10)     1       1        
-    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.901     1.095    (1, 1, 10, 10, 3)  1       1        
-    Total_time                                    -                                             82.28     -        -                  -       -        
+    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  80.0      96.706   (1, 6, 10, 10, 1)  2       1        
+    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.784     2.157    (1, 6, 10, 10)     1       1        
+    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.941     1.138    (1, 1, 10, 10, 3)  1       1        
+    Total_time                                    -                                             82.725    -        -                  -       -        
 
 
 
diff --git a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
index 513107861..28ef2469f 100644
--- a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:46.137** total execution time for **how_to_work_with_microtvm** files:
+**00:45.417** total execution time for **how_to_work_with_microtvm** files:
 
-- **00:41.928**: :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``)
-- **00:03.603**: :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``)
-- **00:00.207**: :ref:`sphx_glr_how_to_work_with_microtvm_micro_ethosu.py` (``micro_ethosu.py``)
-- **00:00.206**: :ref:`sphx_glr_how_to_work_with_microtvm_micro_tvmc.py` (``micro_tvmc.py``)
+- **00:41.214**: :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``)
+- **00:03.605**: :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``)
+- **00:00.205**: :ref:`sphx_glr_how_to_work_with_microtvm_micro_tvmc.py` (``micro_tvmc.py``)
+- **00:00.200**: :ref:`sphx_glr_how_to_work_with_microtvm_micro_ethosu.py` (``micro_ethosu.py``)
 - **00:00.192**: :ref:`sphx_glr_how_to_work_with_microtvm_micro_reference_vm.py` (``micro_reference_vm.py``)
diff --git a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
index c40b902bf..488aa3cd0 100644
--- a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
@@ -5,8 +5,8 @@
 
 Computation times
 =================
-**00:09.715** total execution time for **how_to_work_with_relay** files:
+**00:10.057** total execution time for **how_to_work_with_relay** files:
 
-- **00:07.961**: :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``)
-- **00:01.526**: :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``)
-- **00:00.228**: :ref:`sphx_glr_how_to_work_with_relay_using_relay_viz.py` (``using_relay_viz.py``)
+- **00:08.117**: :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``)
+- **00:01.737**: :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``)
+- **00:00.204**: :ref:`sphx_glr_how_to_work_with_relay_using_relay_viz.py` (``using_relay_viz.py``)
diff --git a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
index 925264674..193034654 100644
--- a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
@@ -5,13 +5,13 @@
 
 Computation times
 =================
-**00:05.654** total execution time for **how_to_work_with_schedules** files:
+**00:05.571** total execution time for **how_to_work_with_schedules** files:
 
-- **00:02.061**: :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``)
-- **00:01.115**: :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``)
-- **00:00.728**: :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``)
-- **00:00.704**: :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``)
-- **00:00.314**: :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``)
-- **00:00.251**: :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``)
-- **00:00.250**: :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``)
-- **00:00.233**: :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``)
+- **00:02.078**: :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``)
+- **00:01.141**: :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``)
+- **00:00.709**: :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``)
+- **00:00.702**: :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``)
+- **00:00.290**: :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``)
+- **00:00.224**: :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``)
+- **00:00.223**: :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``)
+- **00:00.205**: :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``)
diff --git a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
index 0ce65c4e5..68a38a32a 100644
--- a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
@@ -318,7 +318,7 @@ The importing needs to happen before the tensorized GEMV being executed.
                  C: Buffer(C_2: Pointer(float32), float32, [524288], [])}
       buffer_map = {A_1: A, B_1: B, C_1: C}
       preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [1024, 64], []), B_1: B_3: Buffer(B_2, float32, [512, 64], []), C_1: C_3: Buffer(C_2, float32, [1024, 512], [])} {
-      attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpwjtfa9qo/input0.cc'\nsource_filename = \"/tmp/tmpwjtfa9qo/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = alloca float*, align 8\n  %8 = alloca float*, align 8\n  %9 = alloca floa [...]
+      attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpjlztcl8o/input0.cc'\nsource_filename = \"/tmp/tmpjlztcl8o/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = alloca float*, align 8\n  %8 = alloca float*, align 8\n  %9 = alloca floa [...]
       for (i, 0, 1024) {
         for (j.outer: int32, 0, 32) {
           @tir.call_extern("gemv_update", @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_2, ((i*512) + (j.outer*16)), 16, 2, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_2, (i*64), 64, 1, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_2, (j.outer*1024), 1024, 1, dtype=handle), 16, 64, 64, dtype=int32)
diff --git a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
index 17c173ca8..b45963c9e 100644
--- a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
@@ -5,7 +5,7 @@
 
 Computation times
 =================
-**00:20.729** total execution time for **topic_vta_tutorials_autotvm** files:
+**00:20.712** total execution time for **topic_vta_tutorials_autotvm** files:
 
-- **00:20.523**: :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``)
-- **00:00.206**: :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``)
+- **00:20.510**: :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``)
+- **00:00.203**: :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``)
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
index f0d77907d..194eda83d 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
@@ -267,7 +267,7 @@ The compilation steps are:
       DeprecationWarning,
     /workspace/vta/tutorials/frontend/deploy_classification.py:213: DeprecationWarning: legacy graph executor behavior of producing json / lib / params will be removed in the next release. Please see documents of tvm.contrib.graph_executor.GraphModule for the  new recommended usage.
       relay_prog, target=tvm.target.Target(target, host=env.target_host), params=params
-    resnet18_v1 inference graph built in 21.88s!
+    resnet18_v1 inference graph built in 21.24s!
 
 
 
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
index 532287470..a7e944edd 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
@@ -303,7 +303,7 @@ The compilation steps are:
       "target_host parameter is going to be deprecated. "
     /workspace/python/tvm/relay/build_module.py:389: DeprecationWarning: Please use input parameter mod (tvm.IRModule) instead of deprecated parameter mod (tvm.relay.function.Function)
       DeprecationWarning,
-    yolov3-tiny inference graph built in 15.42s!
+    yolov3-tiny inference graph built in 14.83s!
 
 
 
diff --git a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
index e9f6ea5bd..a14d8845b 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
@@ -5,7 +5,7 @@
 
 Computation times
 =================
-**01:29.809** total execution time for **topic_vta_tutorials_frontend** files:
+**01:28.144** total execution time for **topic_vta_tutorials_frontend** files:
 
-- **00:47.699**: :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``)
-- **00:42.111**: :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``)
+- **00:46.873**: :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``)
+- **00:41.271**: :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``)
diff --git a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
index ded0e786b..9ade823f4 100644
--- a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
@@ -5,7 +5,7 @@
 
 Computation times
 =================
-**00:03.499** total execution time for **topic_vta_tutorials_optimize** files:
+**00:03.546** total execution time for **topic_vta_tutorials_optimize** files:
 
-- **00:02.954**: :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``)
-- **00:00.545**: :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``)
+- **00:02.991**: :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``)
+- **00:00.555**: :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``)
diff --git a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
index 4ecb8edee..99d820633 100644
--- a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
@@ -5,7 +5,7 @@
 
 Computation times
 =================
-**00:00.999** total execution time for **topic_vta_tutorials** files:
+**00:00.982** total execution time for **topic_vta_tutorials** files:
 
-- **00:00.507**: :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``)
-- **00:00.492**: :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``)
+- **00:00.500**: :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``)
+- **00:00.482**: :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``)
diff --git a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
index 32a0efcf3..c48b66322 100644
--- a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
@@ -306,7 +306,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 93.982 ms
+    Execution time of this operator: 93.968 ms
 
 
 
diff --git a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
index 247e5331a..0491fe8a0 100644
--- a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
@@ -280,7 +280,7 @@ standard deviation.
 
  .. code-block:: none
 
-    {'mean': 493.0805308300023, 'median': 492.9776465500254, 'std': 1.6479009030430605}
+    {'mean': 491.8569577499966, 'median': 491.976347300033, 'std': 0.7611859830537622}
 
 
 
@@ -494,31 +494,31 @@ the tuning data to.
 
  .. code-block:: none
 
-
    [Task  1/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  1/25]  Current/Best:   17.57/  17.57 GFLOPS | Progress: (4/20) | 5.88 s
    [Task  1/25]  Current/Best:    6.16/  17.57 GFLOPS | Progress: (8/20) | 8.83 s
    [Task  1/25]  Current/Best:   11.54/  22.79 GFLOPS | Progress: (12/20) | 11.31 s
    [Task  1/25]  Current/Best:   16.78/  22.85 GFLOPS | Progress: (16/20) | 12.98 s
    [Task  1/25]  Current/Best:   11.65/  23.94 GFLOPS | Progress: (20/20) | 14.70 s Done.
-
    [Task  2/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  2/25]  Current/Best:   12.28/  13.14 GFLOPS | Progress: (4/20) | 3.77 s
    [Task  2/25]  Current/Best:   14.31/  18.08 GFLOPS | Progress: (8/20) | 5.06 s
    [Task  2/25]  Current/Best:   21.24/  21.24 GFLOPS | Progress: (12/20) | 6.36 s
    [Task  2/25]  Current/Best:   12.75/  21.24 GFLOPS | Progress: (16/20) | 7.64 s
    [Task  2/25]  Current/Best:   19.72/  21.24 GFLOPS | Progress: (20/20) | 9.24 s Done.
-
    [Task  3/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  3/25]  Current/Best:    1.63/  10.55 GFLOPS | Progress: (4/20) | 5.76 s
    [Task  3/25]  Current/Best:   15.57/  16.80 GFLOPS | Progress: (8/20) | 7.67 s
    [Task  3/25]  Current/Best:   14.91/  16.80 GFLOPS | Progress: (12/20) | 9.39 s
    [Task  3/25]  Current/Best:    7.20/  20.02 GFLOPS | Progress: (16/20) | 11.34 s
    [Task  3/25]  Current/Best:   12.55/  20.02 GFLOPS | Progress: (20/20) | 15.88 s Done.
-
    [Task  4/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  4/25]  Current/Best:    9.55/  20.32 GFLOPS | Progress: (4/20) | 2.30 s
    [Task  4/25]  Current/Best:    6.87/  20.32 GFLOPS | Progress: (8/20) | 6.97 s
    [Task  4/25]  Current/Best:   22.55/  22.55 GFLOPS | Progress: (12/20) | 11.83 s
    [Task  4/25]  Current/Best:   17.20/  22.55 GFLOPS | Progress: (16/20) | 14.21 s
    [Task  4/25]  Current/Best:   13.51/  22.55 GFLOPS | Progress: (20/20) | 16.30 s Done.
-
    [Task  5/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  5/25]  Current/Best:    9.89/  10.52 GFLOPS | Progress: (4/20) | 2.48 s
    [Task  5/25]  Current/Best:   11.87/  12.72 GFLOPS | Progress: (8/20) | 4.53 s
    [Task  5/25]  Current/Best:   11.82/  18.07 GFLOPS | Progress: (12/20) | 7.71 s
    [Task  5/25]  Current/Best:   11.99/  22.76 GFLOPS | Progress: (16/20) | 9.10 s
    [Task  5/25]  Current/Best:   12.07/  22.76 GFLOPS | Progress: (20/20) | 10.98 s Done.
-
    [Task  6/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  6/25]  Current/Best:   12.29/  20.77 GFLOPS | Progress: (4/20) | 4.02 s
    [Task  6/25]  Current/Best:   18.94/  20.77 GFLOPS | Progress: (8/20) | 5.75 s
    [Task  6/25]  Current/Best:   13.31/  20.77 GFLOPS | Progress: (12/20) | 7.68 s
    [Task  6/25]  Current/Best:   20.01/  20.77 GFLOPS | Progress: (16/20) | 9.92 s
    [Task  6/25]  Current/Best:    3.74/  20.77 GFLOPS | Progress: (20/20) | 12.41 s Done.
-
    [Task  7/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  7/25]  Current/Best:   11.22/  12.90 GFLOPS | Progress: (4/20) | 3.53 s
    [Task  7/25]  Current/Best:   20.18/  21.22 GFLOPS | Progress: (8/20) | 5.03 s
    [Task  7/25]  Current/Best:   16.23/  21.22 GFLOPS | Progress: (12/20) | 6.92 s
    [Task  7/25]  Current/Best:   12.25/  21.22 GFLOPS | Progress: (16/20) | 8.95 s
    [Task  7/25]  Current/Best:    6.35/  21.84 GFLOPS | Progress: (20/20) | 11.40 s Done.
-
    [Task  8/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  8/25]  Current/Best:   10.02/  13.94 GFLOPS | Progress: (4/20) | 2.82 s
    [Task  8/25]  Current/Best:    9.94/  13.94 GFLOPS | Progress: (8/20) | 7.93 s
    [Task  8/25]  Current/Best:   12.74/  13.94 GFLOPS | Progress: (12/20) | 14.32 s
    [Task  8/25]  Current/Best:   18.88/  18.88 GFLOPS | Progress: (16/20) | 16.41 s
    [Task  8/25]  Current/Best:   20.16/  20.16 GFLOPS | Progress: (20/20) | 23.47 s Done.
-
    [Task  9/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  9/25]  Current/Best:   14.42/  15.89 GFLOPS | Progress: (4/20) | 11.85 s
    [Task  9/25]  Current/Best:   23.52/  23.52 GFLOPS | Progress: (8/20) | 13.64 s
    [Task  9/25]  Current/Best:    8.28/  23.52 GFLOPS | Progress: (12/20) | 16.17 s
    [Task  9/25]  Current/Best:   17.94/  23.52 GFLOPS | Progress: (16/20) | 19.04 s
    [Task  9/25]  Current/Best:    9.05/  23.52 GFLOPS | Progress: (20/20) | 27.50 s
    [Task 10/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 10/25]  Current/Best:   18.31/  18.31 GFLOPS | Progress: (4/20) | 2.46 s Done.
-
    [Task 10/25]  Current/Best:   15.49/  18.31 GFLOPS | Progress: (8/20) | 4.06 s
    [Task 10/25]  Current/Best:   13.02/  18.99 GFLOPS | Progress: (12/20) | 5.60 s
    [Task 10/25]  Current/Best:   19.15/  20.32 GFLOPS | Progress: (16/20) | 6.69 s
    [Task 10/25]  Current/Best:    8.82/  20.32 GFLOPS | Progress: (20/20) | 8.22 s Done.
-
    [Task 11/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 11/25]  Current/Best:   12.30/  18.19 GFLOPS | Progress: (4/20) | 3.29 s
    [Task 11/25]  Current/Best:   16.92/  18.19 GFLOPS | Progress: (8/20) | 6.12 s
    [Task 11/25]  Current/Best:   18.07/  18.19 GFLOPS | Progress: (12/20) | 8.15 s
    [Task 11/25]  Current/Best:   11.40/  21.24 GFLOPS | Progress: (16/20) | 11.04 s
    [Task 11/25]  Current/Best:   19.52/  21.61 GFLOPS | Progress: (20/20) | 13.12 s Done.
-
    [Task 12/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 12/25]  Current/Best:    7.80/  18.13 GFLOPS | Progress: (4/20) | 5.68 s
    [Task 12/25]  Current/Best:    5.42/  18.13 GFLOPS | Progress: (8/20) | 9.58 s
    [Task 12/25]  Current/Best:   18.97/  18.97 GFLOPS | Progress: (12/20) | 11.53 s
    [Task 12/25]  Current/Best:   15.40/  18.97 GFLOPS | Progress: (16/20) | 14.46 s
    [Task 12/25]  Current/Best:   15.15/  19.29 GFLOPS | Progress: (20/20) | 16.36 s Done.
-
    [Task 13/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 13/25]  Current/Best:    8.77/  17.22 GFLOPS | Progress: (4/20) | 3.67 s
    [Task 13/25]  Current/Best:   15.45/  21.10 GFLOPS | Progress: (8/20) | 6.29 s
    [Task 13/25]  Current/Best:   19.63/  21.10 GFLOPS | Progress: (12/20) | 9.42 s
    [Task 13/25]  Current/Best:   12.22/  21.10 GFLOPS | Progress: (16/20) | 12.83 s
    [Task 13/25]  Current/Best:   18.60/  21.10 GFLOPS | Progress: (20/20) | 15.09 s Done.
-
    [Task 14/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 14/25]  Current/Best:   12.03/  13.21 GFLOPS | Progress: (4/20) | 3.37 s
    [Task 14/25]  Current/Best:    6.06/  13.31 GFLOPS | Progress: (8/20) | 5.56 s
    [Task 14/25]  Current/Best:   20.16/  20.16 GFLOPS | Progress: (12/20) | 8.21 s
    [Task 14/25]  Current/Best:   17.36/  20.16 GFLOPS | Progress: (16/20) | 10.07 s
    [Task 14/25]  Current/Best:   17.12/  20.16 GFLOPS | Progress: (20/20) | 11.76 s
    [Task 15/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 15/25]  Current/Best:   16.12/  17.57 GFLOPS | Progress: (4/20) | 2.59 s
    [Task 15/25]  Current/Best:   14.28/  17.97 GFLOPS | Progress: (8/20) | 4.10 s
    [Task 15/25]  Current/Best:   10.36/  22.30 GFLOPS | Progress: (12/20) | 6.33 s
    [Task 15/25]  Current/Best:   20.25/  22.30 GFLOPS | Progress: (16/20) | 9.48 s Done.
-
    [Task 15/25]  Current/Best:    9.71/  22.30 GFLOPS | Progress: (20/20) | 10.65 s
    [Task 16/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 16/25]  Current/Best:   20.52/  20.52 GFLOPS | Progress: (4/20) | 2.84 s
    [Task 16/25]  Current/Best:    3.04/  20.52 GFLOPS | Progress: (8/20) | 4.46 s
    [Task 16/25]  Current/Best:   19.66/  20.52 GFLOPS | Progress: (12/20) | 5.67 s
    [Task 16/25]  Current/Best:   17.31/  20.52 GFLOPS | Progress: (16/20) | 7.05 s
    [Task 16/25]  Current/Best:    9.97/  22.37 GFLOPS | Progress: (20/20) | 9.21 s Done.
-
    [Task 17/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 17/25]  Current/Best:   13.73/  17.94 GFLOPS | Progress: (4/20) | 4.71 s
    [Task 17/25]  Current/Best:   14.46/  23.24 GFLOPS | Progress: (8/20) | 7.62 s
    [Task 17/25]  Current/Best:   17.50/  23.24 GFLOPS | Progress: (12/20) | 9.65 s
    [Task 17/25]  Current/Best:   16.49/  23.24 GFLOPS | Progress: (16/20) | 11.85 s
    [Task 17/25]  Current/Best:   10.03/  23.24 GFLOPS | Progress: (20/20) | 14.02 s Done.
-
    [Task 18/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 18/25]  Current/Best:   11.13/  18.01 GFLOPS | Progress: (4/20) | 3.71 s
    [Task 18/25]  Current/Best:   10.58/  19.17 GFLOPS | Progress: (8/20) | 7.36 s
    [Task 18/25]  Current/Best:   19.12/  19.17 GFLOPS | Progress: (12/20) | 9.28 s
    [Task 18/25]  Current/Best:   10.05/  19.17 GFLOPS | Progress: (16/20) | 13.15 s
    [Task 18/25]  Current/Best:   20.75/  20.75 GFLOPS | Progress: (20/20) | 14.66 s Done.
-
    [Task 19/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 19/25]  Current/Best:    6.99/  20.35 GFLOPS | Progress: (4/20) | 6.06 s
    [Task 19/25]  Current/Best:    2.60/  20.35 GFLOPS | Progress: (8/20) | 9.40 s
    [Task 19/25]  Current/Best:   20.32/  21.91 GFLOPS | Progress: (12/20) | 12.36 s
    [Task 19/25]  Current/Best:   15.43/  21.91 GFLOPS | Progress: (16/20) | 15.32 s
    [Task 19/25]  Current/Best:    2.69/  23.24 GFLOPS | Progress: (20/20) | 18.07 s Done.
-
    [Task 20/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 20/25]  Current/Best:    9.01/  15.11 GFLOPS | Progress: (4/20) | 3.30 s
    [Task 20/25]  Current/Best:    9.90/  15.11 GFLOPS | Progress: (8/20) | 6.86 s
    [Task 20/25]  Current/Best:    2.32/  16.66 GFLOPS | Progress: (12/20) | 10.73 s
    [Task 20/25]  Current/Best:   12.43/  16.66 GFLOPS | Progress: (16/20) | 14.64 s
    [Task 20/25]  Current/Best:   12.14/  21.66 GFLOPS | Progress: (20/20) | 16.74 s
    [Task 21/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 21/25]  Current/Best:    6.40/  17.70 GFLOPS | Progress: (4/20) | 3.21 s
    [Task 21/25]  Current/Best:   14.56/  17.70 GFLOPS | Progress: (8/20) | 4.82 s
    [Task 21/25]  Current/Best:    1.61/  17.70 GFLOPS | Progress: (12/20) | 6.91 s
    [Task 21/25]  Current/Best:   18.00/  18.00 GFLOPS | Progress: (16/20) | 10.43 s
    [Task 21/25]  Current/Best:    4.46/  18.00 GFLOPS | Progress: (20/20)
  | 17.71 s
    [Task 22/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s Done.
+
    [Task  1/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  1/25]  Current/Best:   17.48/  17.48 GFLOPS | Progress: (4/20) | 5.44 s
    [Task  1/25]  Current/Best:    6.16/  17.48 GFLOPS | Progress: (8/20) | 8.83 s
    [Task  1/25]  Current/Best:   11.55/  22.82 GFLOPS | Progress: (12/20) | 11.28 s
    [Task  1/25]  Current/Best:   16.93/  22.87 GFLOPS | Progress: (16/20) | 12.95 s
    [Task  1/25]  Current/Best:   11.57/  23.89 GFLOPS | Progress: (20/20) | 14.69 s Done.
+
    [Task  2/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  2/25]  Current/Best:   12.07/  13.06 GFLOPS | Progress: (4/20) | 3.65 s
    [Task  2/25]  Current/Best:   14.21/  18.60 GFLOPS | Progress: (8/20) | 4.97 s
    [Task  2/25]  Current/Best:   21.12/  21.12 GFLOPS | Progress: (12/20) | 6.31 s
    [Task  2/25]  Current/Best:   10.94/  21.12 GFLOPS | Progress: (16/20) | 7.57 s
    [Task  2/25]  Current/Best:   19.16/  21.12 GFLOPS | Progress: (20/20) | 9.17 s Done.
+
    [Task  3/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  3/25]  Current/Best:    1.63/  10.60 GFLOPS | Progress: (4/20) | 5.78 s
    [Task  3/25]  Current/Best:   15.56/  16.88 GFLOPS | Progress: (8/20) | 7.69 s
    [Task  3/25]  Current/Best:   14.90/  16.88 GFLOPS | Progress: (12/20) | 9.39 s
    [Task  3/25]  Current/Best:    7.16/  23.79 GFLOPS | Progress: (16/20) | 11.28 s
    [Task  3/25]  Current/Best:   12.60/  23.79 GFLOPS | Progress: (20/20) | 15.81 s Done.
+
    [Task  4/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  4/25]  Current/Best:    9.55/  20.44 GFLOPS | Progress: (4/20) | 2.32 s
    [Task  4/25]  Current/Best:    6.83/  20.44 GFLOPS | Progress: (8/20) | 7.01 s
    [Task  4/25]  Current/Best:   21.57/  21.57 GFLOPS | Progress: (12/20) | 11.96 s
    [Task  4/25]  Current/Best:   15.66/  21.57 GFLOPS | Progress: (16/20) | 14.34 s
    [Task  4/25]  Current/Best:   13.31/  21.57 GFLOPS | Progress: (20/20) | 16.38 s Done.
+
    [Task  5/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  5/25]  Current/Best:   10.02/  10.54 GFLOPS | Progress: (4/20) | 2.52 s
    [Task  5/25]  Current/Best:   11.95/  13.00 GFLOPS | Progress: (8/20) | 4.56 s
    [Task  5/25]  Current/Best:   11.47/  18.08 GFLOPS | Progress: (12/20) | 7.74 s
    [Task  5/25]  Current/Best:   11.90/  22.74 GFLOPS | Progress: (16/20) | 9.15 s
    [Task  5/25]  Current/Best:   12.09/  22.74 GFLOPS | Progress: (20/20) | 11.03 s Done.
+
    [Task  6/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  6/25]  Current/Best:   12.21/  20.75 GFLOPS | Progress: (4/20) | 3.98 s
    [Task  6/25]  Current/Best:   19.01/  20.75 GFLOPS | Progress: (8/20) | 5.74 s
    [Task  6/25]  Current/Best:   13.13/  20.75 GFLOPS | Progress: (12/20) | 7.66 s
    [Task  6/25]  Current/Best:   20.09/  20.75 GFLOPS | Progress: (16/20) | 9.89 s
    [Task  6/25]  Current/Best:    3.73/  20.75 GFLOPS | Progress: (20/20) | 12.39 s Done.
+
    [Task  7/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  7/25]  Current/Best:   11.27/  12.20 GFLOPS | Progress: (4/20) | 3.55 s
    [Task  7/25]  Current/Best:   20.22/  21.15 GFLOPS | Progress: (8/20) | 5.06 s
    [Task  7/25]  Current/Best:   16.09/  21.15 GFLOPS | Progress: (12/20) | 6.95 s
    [Task  7/25]  Current/Best:   12.25/  21.15 GFLOPS | Progress: (16/20) | 8.99 s
    [Task  7/25]  Current/Best:    6.37/  21.52 GFLOPS | Progress: (20/20) | 11.43 s Done.
+
    [Task  8/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  8/25]  Current/Best:   10.33/  14.14 GFLOPS | Progress: (4/20) | 2.87 s
    [Task  8/25]  Current/Best:    9.72/  14.14 GFLOPS | Progress: (8/20) | 8.02 s
    [Task  8/25]  Current/Best:   12.79/  14.14 GFLOPS | Progress: (12/20) | 14.48 s
    [Task  8/25]  Current/Best:   18.82/  18.82 GFLOPS | Progress: (16/20) | 16.57 s
    [Task  8/25]  Current/Best:   20.01/  20.01 GFLOPS | Progress: (20/20) | 23.64 s Done.
+
    [Task  9/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  9/25]  Current/Best:   12.92/  15.50 GFLOPS | Progress: (4/20) | 11.88 s
    [Task  9/25]  Current/Best:   23.51/  23.51 GFLOPS | Progress: (8/20) | 13.67 s
    [Task  9/25]  Current/Best:    8.32/  23.51 GFLOPS | Progress: (12/20) | 16.19 s
    [Task  9/25]  Current/Best:   17.94/  23.51 GFLOPS | Progress: (16/20) | 19.04 s
    [Task  9/25]  Current/Best:    9.06/  23.51 GFLOPS | Progress: (20/20) | 27.62 s
    [Task 10/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 10/25]  Current/Best:   18.19/  18.19 GFLOPS | Progress: (4/20) | 2.46 s
    [Task 10/25]  Current/Best:   15.39/  18.19 GFLOPS | Progress: (8/20) | 4.11 s
    [Task 10/25]  Current/Best:   12.98/  18.91 GFLOPS | Progress: (12/20) | 5.65 s
    [Task 10/25]  Current/Best:   19.18/  20.34 GFLOPS | Progress: (16/20) | 6.75 s
    [Task 10/25]  Current/Best:    8.95/  20.34 GFLOPS | Progress: (20/20
 ) | 8.27 s Done.
+
    [Task 11/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 11/25]  Current/Best:   12.36/  18.14 GFLOPS | Progress: (4/20) | 3.27 s
    [Task 11/25]  Current/Best:   16.93/  18.14 GFLOPS | Progress: (8/20) | 6.09 s
    [Task 11/25]  Current/Best:   18.11/  18.14 GFLOPS | Progress: (12/20) | 8.15 s
    [Task 11/25]  Current/Best:   11.94/  21.24 GFLOPS | Progress: (16/20) | 11.12 s
    [Task 11/25]  Current/Best:   19.49/  21.58 GFLOPS | Progress: (20/20) | 13.18 s Done.
+
    [Task 12/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 12/25]  Current/Best:    7.80/  18.15 GFLOPS | Progress: (4/20) | 5.56 s
    [Task 12/25]  Current/Best:    5.31/  18.15 GFLOPS | Progress: (8/20) | 9.45 s
    [Task 12/25]  Current/Best:   18.85/  18.97 GFLOPS | Progress: (12/20) | 11.42 s
    [Task 12/25]  Current/Best:   15.43/  18.97 GFLOPS | Progress: (16/20) | 14.34 s
    [Task 12/25]  Current/Best:   15.10/  19.14 GFLOPS | Progress: (20/20) | 16.24 s Done.
+
    [Task 13/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 13/25]  Current/Best:    8.80/  17.40 GFLOPS | Progress: (4/20) | 3.63 s
    [Task 13/25]  Current/Best:   15.70/  21.13 GFLOPS | Progress: (8/20) | 6.21 s
    [Task 13/25]  Current/Best:   19.59/  21.57 GFLOPS | Progress: (12/20) | 9.31 s
    [Task 13/25]  Current/Best:   12.29/  21.57 GFLOPS | Progress: (16/20) | 12.67 s
    [Task 13/25]  Current/Best:   18.86/  21.57 GFLOPS | Progress: (20/20) | 14.98 s Done.
+
    [Task 14/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 14/25]  Current/Best:   13.55/  13.55 GFLOPS | Progress: (4/20) | 3.32 s
    [Task 14/25]  Current/Best:    6.12/  13.55 GFLOPS | Progress: (8/20) | 5.48 s
    [Task 14/25]  Current/Best:   20.92/  20.92 GFLOPS | Progress: (12/20) | 8.13 s
    [Task 14/25]  Current/Best:   17.22/  20.92 GFLOPS | Progress: (16/20) | 10.00 s Done.
+
    [Task 14/25]  Current/Best:   17.07/  20.92 GFLOPS | Progress: (20/20) | 11.81 s
    [Task 15/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 15/25]  Current/Best:   16.13/  17.64 GFLOPS | Progress: (4/20) | 2.58 s
    [Task 15/25]  Current/Best:   14.42/  18.11 GFLOPS | Progress: (8/20) | 4.11 s
    [Task 15/25]  Current/Best:   10.36/  22.31 GFLOPS | Progress: (12/20) | 6.43 s
    [Task 15/25]  Current/Best:   20.43/  22.31 GFLOPS | Progress: (16/20) | 9.49 s
    [Task 15/25]  Current/Best:    9.72/  22.31 GFLOPS | Progress: (20/20) | 10.67 s
    [Task 16/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 16/25]  Current/Best:   20.55/  20.55 GFLOPS | Progress: (4/20) | 2.82 s
    [Task 16/25]  Current/Best:    3.04/  20.55 GFLOPS | Progress: (8/20) | 4.43 s
    [Task 16/25]  Current/Best:   19.59/  20.55 GFLOPS | Progress: (12/20) | 5.64 s
    [Task 16/25]  Current/Best:   17.73/  20.55 GFLOPS | Progress: (16/20) |
  6.99 s
    [Task 16/25]  Current/Best:   10.00/  20.81 GFLOPS | Progress: (20/20) | 9.13 s Done.
+
    [Task 17/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 17/25]  Current/Best:   12.99/  18.70 GFLOPS | Progress: (4/20) | 4.69 s
    [Task 17/25]  Current/Best:   14.30/  23.35 GFLOPS | Progress: (8/20) | 7.55 s
    [Task 17/25]  Current/Best:   16.90/  23.35 GFLOPS | Progress: (12/20) | 9.59 s
    [Task 17/25]  Current/Best:   16.49/  23.35 GFLOPS | Progress: (16/20) | 11.83 s
    [Task 17/25]  Current/Best:   10.03/  23.35 GFLOPS | Progress: (20/20) | 13.98 s Done.
+
    [Task 18/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 18/25]  Current/Best:   11.31/  17.85 GFLOPS | Progress: (4/20) | 3.69 s
    [Task 18/25]  Current/Best:   10.61/  17.85 GFLOPS | Progress: (8/20) | 7.37 s
    [Task 18/25]  Current/Best:   19.22/  19.22 GFLOPS | Progress: (12/20) | 9.29 s
    [Task 18/25]  Current/Best:   10.00/  19.22 GFLOPS | Progress: (16/20) | 13.14 s
    [Task 18/25]  Current/Best:   20.94/  20.94 GFLOPS | Progress: (20/20) | 14.65 s Done.
+
    [Task 19/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 19/25]  Current/Best:    7.14/  20.43 GFLOPS | Progress: (4/20) | 5.96 s
    [Task 19/25]  Current/Best:    2.60/  20.43 GFLOPS | Progress: (8/20) | 9.31 s
    [Task 19/25]  Current/Best:   19.77/  21.90 GFLOPS | Progress: (12/20) | 12.25 s
    [Task 19/25]  Current/Best:   15.09/  21.90 GFLOPS | Progress: (16/20) | 15.27 s
    [Task 19/25]  Current/Best:    2.70/  23.26 GFLOPS | Progress: (20/20) | 18.05 s Done.
+
    [Task 20/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 20/25]  Current/Best:    9.28/  15.00 GFLOPS | Progress: (4/20) | 3.24 s Done.
      Done.
+
    [Task 20/25]  Current/Best:    9.65/  15.00 GFLOPS | Progress: (8/20) | 6.79 s
    [Task 20/25]  Current/Best:    2.32/  16.59 GFLOPS | Progress: (12/20) | 10.86 s
    [Task 20/25]  Current/Best:   12.44/  16.59 GFLOPS | Progress: (16/20) | 14.58 s
    [Task 20/25]  Current/Best:   12.23/  22.20 GFLOPS | Progress: (20/20) | 16.68 s
    [Task 21/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 21/25]  Current/Best:    6.38/  17.63 GFLOPS | Progress: (4/20) | 3.21 s
    [Task 21/25]  Current/Best:   14.55/  17.63 GFLOPS | Progress: (8/20) | 4.81 s
    [Task 21/25]  Current/Best:    1.61/  17.63 GFLOPS | Progress: (12/20) | 6.91 s
    [Task 21/25]  Current/Best:   17.83/  17.83 GFLOPS | Progress: (16/20) | 10.43 s
    [Task 21/25]  Current/Best:    4.44/  17.83 GFLOPS | Progress: (20/20) | 17.73 s
    [Task 22/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 22/25]  Current/Best:    2.70/  17.03 GFLOPS | Progress: (4/20
 ) | 2.59 s
    [Task 22/25]  Current/Best:    8.81/  21.81 GFLOPS | Progress: (8/20) | 4.61 s
    [Task 22/25]  Current/Best:   19.97/  21.81 GFLOPS | Progress: (12/20) | 6.96 s
    [Task 22/25]  Current/Best:   15.22/  21.81 GFLOPS | Progress: (16/20) | 9.06 s
    [Task 22/25]  Current/Best:   14.66/  21.81 GFLOPS | Progress: (20/20) | 10.79 s Done.
+
    [Task 23/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 23/25]  Current/Best:   17.24/  20.30 GFLOPS | Progress: (4/20) | 3.17 s
    [Task 23/25]  Current/Best:   13.78/  20.30 GFLOPS | Progress: (8/20) | 6.58 s
    [Task 23/25]  Current/Best:   20.80/  21.38 GFLOPS | Progress: (12/20) | 8.43 s
    [Task 23/25]  Current/Best:    6.43/  21.38 GFLOPS | Progress: (16/20) | 15.42 s
    [Task 23/25]  Current/Best:    7.78/  21.38 GFLOPS | Progress: (20/20) | 19.67 s Done.
+
    [Task 24/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 24/25]  Current/Best:    8.60/   8.60 GFLOPS | Progress: (4/20) | 11.74 s
    [Task 24/25]  Current/Best:    2.15/   8.60 GFLOPS | Progress: (8/20) | 22.71 s
    [Task 24/25]  Current/Best:    4.43/   8.60 GFLOPS | Progress: (12/20) | 34.19 s Done.
      Done.
-
    [Task 22/25]  Current/Best:    2.70/  17.02 GFLOPS | Progress: (4/20) | 2.69 s
    [Task 22/25]  Current/Best:    8.70/  21.82 GFLOPS | Progress: (8/20) | 4.66 s
    [Task 22/25]  Current/Best:   20.02/  21.82 GFLOPS | Progress: (12/20) | 7.06 s
    [Task 22/25]  Current/Best:   15.38/  21.82 GFLOPS | Progress: (16/20) | 9.16 s
    [Task 22/25]  Current/Best:   14.71/  21.82 GFLOPS | Progress: (20/20) | 10.90 s Done.
-
    [Task 23/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 23/25]  Current/Best:   17.52/  20.36 GFLOPS | Progress: (4/20) | 3.17 s
    [Task 23/25]  Current/Best:   15.79/  20.36 GFLOPS | Progress: (8/20) | 6.63 s
    [Task 23/25]  Current/Best:   21.00/  21.67 GFLOPS | Progress: (12/20) | 8.46 s
    [Task 23/25]  Current/Best:    6.30/  21.67 GFLOPS | Progress: (16/20) | 15.62 s
    [Task 23/25]  Current/Best:    7.66/  21.67 GFLOPS | Progress: (20/20) | 19.86 s Done.
-
    [Task 24/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 24/25]  Current/Best:    8.62/   8.62 GFLOPS | Progress: (4/20) | 11.72 s
    [Task 24/25]  Current/Best:    1.93/   8.62 GFLOPS | Progress: (8/20) | 22.73 s
    [Task 24/25]  Current/Best:    4.47/   8.62 GFLOPS | Progress: (12/20) | 34.21 s
    [Task 24/25]  Current/Best:    7.23/   8.93 GFLOPS | Progress: (16/20) | 39.85 s
    [Task 24/25]  Current/Best:    3.32/   8.98 GFLOPS | Progress: (20/20) | 45.71 s Done.
-
    [Task 25/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 25/25]  Current/Best:    1.55/   2.74 GFLOPS | Progress: (4/20) | 11.54 s
    [Task 25/25]  Current/Best:    5.89/   7.91 GFLOPS | Progress: (8/20) | 22.75 s
    [Task 25/25]  Current/Best:    5.87/   7.91 GFLOPS | Progress: (12/20) | 34.21 s
    [Task 25/25]  Current/Best:    5.66/   8.33 GFLOPS | Progress: (16/20) | 36.06 s
    [Task 25/25]  Current/Best:    2.92/   9.10 GFLOPS | Progress: (20/20) | 46.76 s
+
    [Task 24/25]  Current/Best:    6.53/   8.84 GFLOPS | Progress: (16/20) | 39.88 s
    [Task 24/25]  Current/Best:    3.23/   8.84 GFLOPS | Progress: (20/20) | 45.83 s Done.
+
    [Task 25/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 25/25]  Current/Best:    1.55/   2.95 GFLOPS | Progress: (4/20) | 11.54 s
    [Task 25/25]  Current/Best:    5.72/   7.95 GFLOPS | Progress: (8/20) | 22.76 s
    [Task 25/25]  Current/Best:    5.92/   7.95 GFLOPS | Progress: (12/20) | 34.03 s
    [Task 25/25]  Current/Best:    5.72/   9.47 GFLOPS | Progress: (16/20) | 35.75 s
    [Task 25/25]  Current/Best:    2.85/   9.47 GFLOPS | Progress: (20/20) | 46.43 s
 
 
 The output from this tuning process will look something like this:
@@ -660,8 +660,8 @@ improvement in comparing the optimized model to the unoptimized model.
 
  .. code-block:: none
 
-    optimized: {'mean': 410.3633574900141, 'median': 410.1542377500209, 'std': 0.7243711791744223}
-    unoptimized: {'mean': 493.0805308300023, 'median': 492.9776465500254, 'std': 1.6479009030430605}
+    optimized: {'mean': 411.85385866001525, 'median': 411.8287240999507, 'std': 0.6571885942666992}
+    unoptimized: {'mean': 491.8569577499966, 'median': 491.976347300033, 'std': 0.7611859830537622}
 
 
 
@@ -681,7 +681,7 @@ profiling/benchmarking.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 10 minutes  17.689 seconds)
+   **Total running time of the script:** ( 10 minutes  19.858 seconds)
 
 
 .. _sphx_glr_download_tutorial_autotvm_relay_x86.py:
diff --git a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
index d566a9a50..b066b2f33 100644
--- a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
+++ b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
@@ -235,7 +235,7 @@ device and returns the measured cost. Network overhead is excluded.
 
  .. code-block:: none
 
-    1.258e-07 secs/op
+    1.286e-07 secs/op
 
 
 
diff --git a/docs/_sources/tutorial/intro_topi.rst.txt b/docs/_sources/tutorial/intro_topi.rst.txt
index b07652211..d8dc3771e 100644
--- a/docs/_sources/tutorial/intro_topi.rst.txt
+++ b/docs/_sources/tutorial/intro_topi.rst.txt
@@ -233,7 +233,7 @@ As you can see, scheduled stages of computation have been accumulated and we can
 
  .. code-block:: none
 
-    [stage(a, placeholder(a, 0x1f9028d0)), stage(b, placeholder(b, 0x19a89430)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(mi [...]
+    [stage(a, placeholder(a, 0x5b6e770)), stage(b, placeholder(b, 0x1ae845b0)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min [...]
 
 
 
diff --git a/docs/_sources/tutorial/sg_execution_times.rst.txt b/docs/_sources/tutorial/sg_execution_times.rst.txt
index 7b68567d8..ba941e9fc 100644
--- a/docs/_sources/tutorial/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorial/sg_execution_times.rst.txt
@@ -5,17 +5,17 @@
 
 Computation times
 =================
-**13:02.922** total execution time for **tutorial** files:
+**13:01.626** total execution time for **tutorial** files:
 
-- **10:17.689**: :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``)
-- **00:57.468**: :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``)
-- **00:55.893**: :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``)
-- **00:26.579**: :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``)
-- **00:23.633**: :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``)
-- **00:00.725**: :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``)
-- **00:00.565**: :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``)
-- **00:00.201**: :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``)
-- **00:00.046**: :ref:`sphx_glr_tutorial_introduction.py` (``introduction.py``)
-- **00:00.043**: :ref:`sphx_glr_tutorial_tvmc_python.py` (``tvmc_python.py``)
-- **00:00.042**: :ref:`sphx_glr_tutorial_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``)
-- **00:00.039**: :ref:`sphx_glr_tutorial_install.py` (``install.py``)
+- **10:19.858**: :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``)
+- **00:58.568**: :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``)
+- **00:52.050**: :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``)
+- **00:26.016**: :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``)
+- **00:23.556**: :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``)
+- **00:00.710**: :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``)
+- **00:00.551**: :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``)
+- **00:00.185**: :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``)
+- **00:00.040**: :ref:`sphx_glr_tutorial_introduction.py` (``introduction.py``)
+- **00:00.032**: :ref:`sphx_glr_tutorial_install.py` (``install.py``)
+- **00:00.031**: :ref:`sphx_glr_tutorial_tvmc_python.py` (``tvmc_python.py``)
+- **00:00.031**: :ref:`sphx_glr_tutorial_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``)
diff --git a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
index 43283b1be..c9094100f 100644
--- a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
+++ b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
@@ -344,7 +344,7 @@ compile and run this new schedule with the parallel operation applied:
 
  .. code-block:: none
 
-    parallel: 0.000007
+    parallel: 0.000006
 
 
 
@@ -447,10 +447,10 @@ We can now compare the different schedules
  .. code-block:: none
 
                 Operator                  Timing             Performance
-                   numpy    8.229270006268053e-06                    1.0
-                   naive    5.8474999999999995e-06    0.7105733552971392
-                parallel              6.9645e-06      0.8463083596352161
-                  vector    2.4539800000000002e-05     2.982014198259211
+                   numpy    8.040249995246995e-06                    1.0
+                   naive    5.8134999999999995e-06     0.723049656843588
+                parallel              6.0041e-06      0.7467553874008062
+                  vector             2.45482e-05        3.05316377158816
 
 
 
@@ -839,7 +839,7 @@ matrix multiplication.
 
  .. code-block:: none
 
-    Numpy running time: 0.018120
+    Numpy running time: 0.018308
 
 
 
@@ -897,7 +897,7 @@ optimizations.
 
     /workspace/python/tvm/driver/build_module.py:264: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
       "target_host parameter is going to be deprecated. "
-    none: 3.113451
+    none: 3.236665
 
 
 
@@ -996,7 +996,7 @@ schedule.
 
  .. code-block:: none
 
-    blocking: 0.311937
+    blocking: 0.298833
 
 
 
@@ -1088,7 +1088,7 @@ already cache friendly from our previous optimizations.
 
  .. code-block:: none
 
-    vectorization: 0.341692
+    vectorization: 0.332470
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1160,7 +1160,7 @@ more cache friendly.
 
  .. code-block:: none
 
-    loop permutation: 0.115958
+    loop permutation: 0.118225
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1257,7 +1257,7 @@ optimized schedule.
 
  .. code-block:: none
 
-    array packing: 0.109064
+    array packing: 0.110705
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1348,7 +1348,7 @@ to `C` when all the block results are ready.
 
  .. code-block:: none
 
-    block caching: 0.110371
+    block caching: 0.111500
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1432,7 +1432,7 @@ of thread-level parallelization.
 
  .. code-block:: none
 
-    parallelization: 0.144670
+    parallelization: 0.143513
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1511,13 +1511,13 @@ working, we can compare the results.
  .. code-block:: none
 
                 Operator                  Timing             Performance
-                    none            3.1134511074                     1.0
-                blocking             0.311936983     0.10019010167161232
-           vectorization     0.34169213530000003     0.10974706957429706
-        loop permutation            0.1159584181    0.037244335658392684
-           array packing             0.109063906    0.035029908046662006
-           block caching     0.11037149990000002     0.03544989019987204
-         parallelization            0.1446698268     0.04646606669240802
+                    none      3.2366650888999997                     1.0
+                blocking            0.2988325121     0.09232728870368236
+           vectorization            0.3324702958     0.10272001787895578
+        loop permutation     0.11822451599999999     0.03652664478800904
+           array packing            0.1107052041     0.03420347828994066
+           block caching            0.1114997147    0.034448950273657714
+         parallelization            0.1435131968       0.044339835249613
 
 
 
diff --git a/docs/commit_hash b/docs/commit_hash
index 9f61f38f1..60543fc25 100644
--- a/docs/commit_hash
+++ b/docs/commit_hash
@@ -1 +1 @@
-24b93f56fdbb723cc0f631ce4da0e27d7fb212b1
+b9890dbbebeff95202a7dc65cbce3e808869cd33
diff --git a/docs/contribute/ci.html b/docs/contribute/ci.html
index 528207d09..eb316e6ea 100644
--- a/docs/contribute/ci.html
+++ b/docs/contribute/ci.html
@@ -211,13 +211,6 @@
 <li class="toctree-l4"><a class="reference internal" href="#reproduce-failures">Reproduce Failures</a></li>
 </ul>
 </li>
-<li class="toctree-l3"><a class="reference internal" href="#keeping-ci-green">Keeping CI Green</a><ul>
-<li class="toctree-l4"><a class="reference internal" href="#skip-ci-for-reverts">Skip CI for Reverts</a></li>
-</ul>
-</li>
-<li class="toctree-l3"><a class="reference internal" href="#handling-flaky-failures">Handling Flaky Failures</a></li>
-<li class="toctree-l3"><a class="reference internal" href="#ci-docker-staging"><code class="docutils literal notranslate"><span class="pre">ci-docker-staging</span></code></a></li>
-<li class="toctree-l3"><a class="reference internal" href="#docker-images">Docker Images</a></li>
 <li class="toctree-l3"><a class="reference internal" href="#reporting-issues">Reporting Issues</a></li>
 </ul>
 </li>
@@ -333,21 +326,13 @@
 <span id="ci-guide"></span><h1>Using TVM’s CI<a class="headerlink" href="#using-tvm-s-ci" title="Permalink to this headline">¶</a></h1>
 <div class="contents local topic" id="contents">
 <ul class="simple">
-<li><p><a class="reference internal" href="#debugging-failures" id="id3">Debugging Failures</a></p>
+<li><p><a class="reference internal" href="#debugging-failures" id="id1">Debugging Failures</a></p>
 <ul>
-<li><p><a class="reference internal" href="#jenkins-logs" id="id4">Jenkins Logs</a></p></li>
-<li><p><a class="reference internal" href="#reproduce-failures" id="id5">Reproduce Failures</a></p></li>
+<li><p><a class="reference internal" href="#jenkins-logs" id="id2">Jenkins Logs</a></p></li>
+<li><p><a class="reference internal" href="#reproduce-failures" id="id3">Reproduce Failures</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#keeping-ci-green" id="id6">Keeping CI Green</a></p>
-<ul>
-<li><p><a class="reference internal" href="#skip-ci-for-reverts" id="id7">Skip CI for Reverts</a></p></li>
-</ul>
-</li>
-<li><p><a class="reference internal" href="#handling-flaky-failures" id="id8">Handling Flaky Failures</a></p></li>
-<li><p><a class="reference internal" href="#ci-docker-staging" id="id9"><code class="docutils literal notranslate"><span class="pre">ci-docker-staging</span></code></a></p></li>
-<li><p><a class="reference internal" href="#docker-images" id="id10">Docker Images</a></p></li>
-<li><p><a class="reference internal" href="#reporting-issues" id="id11">Reporting Issues</a></p></li>
+<li><p><a class="reference internal" href="#reporting-issues" id="id4">Reporting Issues</a></p></li>
 </ul>
 </div>
 <p>TVM uses Jenkins for running Linux continuous integration (CI) tests on
@@ -361,10 +346,10 @@ has successfully completed. To diagnose failing steps, click through to the fail
 pipeline stage then to the failing step to see the output logs.</p>
 <a class="reference internal image-reference" href="https://github.com/tlc-pack/web-data/raw/main/images/contribute/ci.png"><img alt="The Jenkins UI for a CI run" src="https://github.com/tlc-pack/web-data/raw/main/images/contribute/ci.png" style="width: 800px;" /></a>
 <div class="section" id="debugging-failures">
-<h2><a class="toc-backref" href="#id3">Debugging Failures</a><a class="headerlink" href="#debugging-failures" title="Permalink to this headline">¶</a></h2>
+<h2><a class="toc-backref" href="#id1">Debugging Failures</a><a class="headerlink" href="#debugging-failures" title="Permalink to this headline">¶</a></h2>
 <p>When CI fails for some reason, there are several methods to diagnose the issue.</p>
 <div class="section" id="jenkins-logs">
-<h3><a class="toc-backref" href="#id4">Jenkins Logs</a><a class="headerlink" href="#jenkins-logs" title="Permalink to this headline">¶</a></h3>
+<h3><a class="toc-backref" href="#id2">Jenkins Logs</a><a class="headerlink" href="#jenkins-logs" title="Permalink to this headline">¶</a></h3>
 <p>The first place to look for a failure is in the CI logs, follow the red Xs on
 the failing job to view the logs. Note:</p>
 <ul class="simple">
@@ -375,99 +360,12 @@ need to scroll up to view the actual failure.</p></li>
 </ul>
 </div>
 <div class="section" id="reproduce-failures">
-<h3><a class="toc-backref" href="#id5">Reproduce Failures</a><a class="headerlink" href="#reproduce-failures" title="Permalink to this headline">¶</a></h3>
+<h3><a class="toc-backref" href="#id3">Reproduce Failures</a><a class="headerlink" href="#reproduce-failures" title="Permalink to this headline">¶</a></h3>
 <p>Most TVM Python tests run under <a class="reference external" href="https://docs.pytest.org/en/6.2.x/"><code class="docutils literal notranslate"><span class="pre">pytest</span></code></a> and can be run as described in <a class="reference internal" href="pull_request.html#pr-testing"><span class="std std-ref">Testing</span></a>.</p>
 </div>
 </div>
-<div class="section" id="keeping-ci-green">
-<h2><a class="toc-backref" href="#id6">Keeping CI Green</a><a class="headerlink" href="#keeping-ci-green" title="Permalink to this headline">¶</a></h2>
-<p>Developers rely on the TVM CI to get signal on their PRs before merging.
-Occasionally breakages slip through and break <code class="docutils literal notranslate"><span class="pre">main</span></code>, which in turn causes
-the same error to show up on an PR that is based on the broken commit(s). Broken
-commits can be identified <a class="reference external" href="https://github.com/apache/tvm/commits/main">through GitHub</a>
-via the commit status icon or via <a class="reference external" href="https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/activity?branch=main">Jenkins</a>.
-In these situations it is possible to either revert the offending commit or
-submit a forward fix to address the issue. It is up to the committer and commit
-author which option to choose, keeping in mind that a broken CI affects all TVM
-developers and should be fixed as soon as possible.</p>
-<div class="section" id="skip-ci-for-reverts">
-<h3><a class="toc-backref" href="#id7">Skip CI for Reverts</a><a class="headerlink" href="#skip-ci-for-reverts" title="Permalink to this headline">¶</a></h3>
-<p>For reverts and trivial forward fixes, adding <code class="docutils literal notranslate"><span class="pre">[skip</span> <span class="pre">ci]</span></code> to the revert’s
-PR title will cause CI to shortcut and only run lint. Committers should
-take care that they only merge CI-skipped PRs to fix a failure on <code class="docutils literal notranslate"><span class="pre">main</span></code> and
-not in cases where the submitter wants to shortcut CI to merge a change faster.
-The PR title is checked when the build is first run (specifically during the lint
-step, so changes after that has run do not affect CI and will require the job to
-be re-triggered by another <code class="docutils literal notranslate"><span class="pre">git</span> <span class="pre">push</span></code>).</p>
-<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Revert HEAD commit, make sure to insert &#39;[skip ci]&#39; at the beginning of</span>
-<span class="c1"># the commit subject</span>
-git revert HEAD
-git checkout -b my_fix
-<span class="c1"># After you have pushed your branch, create a PR as usual.</span>
-git push my_repo
-<span class="c1"># Example: Skip CI on a branch with an existing PR</span>
-<span class="c1"># Adding this commit to an existing branch will cause a new CI run where</span>
-<span class="c1"># Jenkins is skipped</span>
-git commit --allow-empty --message <span class="s2">&quot;[skip ci] Trigger skipped CI&quot;</span>
-git push my_repo
-</pre></div>
-</div>
-</div>
-</div>
-<div class="section" id="handling-flaky-failures">
-<h2><a class="toc-backref" href="#id8">Handling Flaky Failures</a><a class="headerlink" href="#handling-flaky-failures" title="Permalink to this headline">¶</a></h2>
-<p>If you notice a failure on your PR that seems unrelated to your change, you should
-search <a class="reference external" href="https://github.com/apache/tvm/issues?q=is%3Aissue+%5BCI+Problem%5D+Flaky+">recent GitHub issues related to flaky tests</a> and
-<a class="reference external" href="https://github.com/apache/tvm/issues/new?assignees=&amp;labels=&amp;template=ci-problem.md&amp;title=%5BCI+Problem%5D+">file a new issue</a>
-if you don’t see any reports of the failure. If a certain test or class of tests affects
-several PRs or commits on <code class="docutils literal notranslate"><span class="pre">main</span></code> with flaky failures, the test should be disabled via
-<a class="reference external" href="https://docs.pytest.org/en/6.2.x/skipping.html#xfail-mark-test-functions-as-expected-to-fail">pytest’s <code class="docutils literal notranslate"><span class="pre">&#64;xfail</span></code> decorator</a> with <a class="reference external" href="https://docs.pytest.org/en/6.2.x/skipping.html#strict-parameter"><code class="docutils literal notranslate"><span class="pre">strict=True</span></code></a> and the relevant issue linked in the
-disabling PR.</p>
-<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nd">@pytest</span><span class="o">.</span><span class="n">mark</span><span class="o">.</span><span class="n">xfail</span><span class="p">(</span><span class="n">strict</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">reason</span><span class="o">=</span><span class="s2">&quot;Flaky test: https://github.com/apache/tvm/issues/1234&quot;</span><s [...]
-<span class="k">def</span> <span class="nf">test_something_flaky</span><span class="p">():</span>
-    <span class="k">pass</span>
-</pre></div>
-</div>
-</div>
-<div class="section" id="ci-docker-staging">
-<h2><a class="toc-backref" href="#id9"><code class="docutils literal notranslate"><span class="pre">ci-docker-staging</span></code></a><a class="headerlink" href="#ci-docker-staging" title="Permalink to this headline">¶</a></h2>
-<p>The <a class="reference external" href="https://github.com/apache/tvm/tree/ci-docker-staging">ci-docker-staging</a>
-branch is used to test updates to Docker images and <code class="docutils literal notranslate"><span class="pre">Jenkinsfile</span></code> changes. When
-running a build for a normal PR from a forked repository, Jenkins uses the code
-from the PR except for the <code class="docutils literal notranslate"><span class="pre">Jenkinsfile</span></code> itself, which comes from the base branch.
-When branches are built, the <code class="docutils literal notranslate"><span class="pre">Jenkinsfile</span></code> in the branch is used, so a committer
-with write access must push PRs to a branch in apache/tvm to properly test
-<code class="docutils literal notranslate"><span class="pre">Jenkinsfile</span></code> changes. If your PR makes changes to the <code class="docutils literal notranslate"><span class="pre">Jenkinsfile</span></code>, make sure
-to &#64; a <a class="reference external" href="https://github.com/apache/tvm/blob/main/CONTRIBUTORS.md">committer</a>
-and ask them to push your PR as a branch to test the changes.</p>
-</div>
-<div class="section" id="docker-images">
-<span id="id2"></span><h2><a class="toc-backref" href="#id10">Docker Images</a><a class="headerlink" href="#docker-images" title="Permalink to this headline">¶</a></h2>
-<p>Each CI job runs most of its work inside a Docker container, built from files
-in the <a class="reference external" href="https://github.com/apache/tvm/tree/main/docker">docker/</a> folder. These
-files are built nightly in Jenkins via the <a class="reference external" href="https://ci.tlcpack.ai/job/docker-images-ci/">docker-images-ci</a> job.
-The images for these containers are hosted in the <a class="reference external" href="https://hub.docker.com/u/tlcpack">tlcpack Docker Hub</a>
-and referenced at the <a class="reference external" href="https://github.com/apache/tvm/blob/7481a297740f073b193a3f09b3e27f056e8c7f2e/Jenkinsfile#L48-L54">top of the <code class="docutils literal notranslate"><span class="pre">Jenkinsfile</span></code></a>. These can be inspected and run
-locally via standard Docker commands.</p>
-<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Beware: CI images can be several GB in size</span>
-<span class="c1"># Get a bare docker shell in the ci-gpu container</span>
-docker run -it tlcpack/ci-gpu:v0.78 /bin/bash
-</pre></div>
-</div>
-<p><code class="docutils literal notranslate"><span class="pre">docker/bash.sh</span></code> will automatically grab the latest image from the <code class="docutils literal notranslate"><span class="pre">Jenkinsfile</span></code>
-and help in mounting your current directory.</p>
-<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Run the ci_cpu image specified in Jenkinsfile</span>
-<span class="nb">cd</span> tvm
-bash docker/bash.sh ci_cpu
-<span class="c1"># the tvm directory is automatically mounted</span>
-<span class="c1"># example: build tvm (note: this will overrwrite build/)</span>
-$ ./tests/scripts/task_config_build_cpu.sh
-$ ./tests/scripts/task_build.sh build -j32
-</pre></div>
-</div>
-</div>
 <div class="section" id="reporting-issues">
-<h2><a class="toc-backref" href="#id11">Reporting Issues</a><a class="headerlink" href="#reporting-issues" title="Permalink to this headline">¶</a></h2>
+<h2><a class="toc-backref" href="#id4">Reporting Issues</a><a class="headerlink" href="#reporting-issues" title="Permalink to this headline">¶</a></h2>
 <p>Issues with CI should be <a class="reference external" href="https://github.com/apache/tvm/issues/new?assignees=&amp;labels=&amp;template=ci-problem.md&amp;title=%5BCI+Problem%5D+">reported on GitHub</a>
 with a link to the relevant jobs, commits, or PRs.</p>
 </div>
diff --git a/docs/contribute/index.html b/docs/contribute/index.html
index d880067bf..6cbc485ce 100644
--- a/docs/contribute/index.html
+++ b/docs/contribute/index.html
@@ -389,10 +389,6 @@ design choices of the internal.</p></li>
 </li>
 <li class="toctree-l1"><a class="reference internal" href="ci.html">Using TVM’s CI</a><ul>
 <li class="toctree-l2"><a class="reference internal" href="ci.html#debugging-failures">Debugging Failures</a></li>
-<li class="toctree-l2"><a class="reference internal" href="ci.html#keeping-ci-green">Keeping CI Green</a></li>
-<li class="toctree-l2"><a class="reference internal" href="ci.html#handling-flaky-failures">Handling Flaky Failures</a></li>
-<li class="toctree-l2"><a class="reference internal" href="ci.html#ci-docker-staging"><code class="docutils literal notranslate"><span class="pre">ci-docker-staging</span></code></a></li>
-<li class="toctree-l2"><a class="reference internal" href="ci.html#docker-images">Docker Images</a></li>
 <li class="toctree-l2"><a class="reference internal" href="ci.html#reporting-issues">Reporting Issues</a></li>
 </ul>
 </li>
diff --git a/docs/how_to/compile_models/from_mxnet.html b/docs/how_to/compile_models/from_mxnet.html
index ce875d232..c24ca8da5 100644
--- a/docs/how_to/compile_models/from_mxnet.html
+++ b/docs/how_to/compile_models/from_mxnet.html
@@ -401,7 +401,7 @@
 </div>
 <img alt="../../_images/sphx_glr_from_mxnet_001.png" class="sphx-glr-single-img" src="../../_images/sphx_glr_from_mxnet_001.png" />
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip01b8b8fa-eb13-4bfa-b333-c520e5c61568 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip674fcbf0-c9e1-47b7-be5a-bdc8ccacdf0b from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
 x (1, 3, 224, 224)
 </pre></div>
 </div>
diff --git a/docs/how_to/compile_models/from_oneflow.html b/docs/how_to/compile_models/from_oneflow.html
index 016dd149f..c977b768a 100644
--- a/docs/how_to/compile_models/from_oneflow.html
+++ b/docs/how_to/compile_models/from_oneflow.html
@@ -406,56 +406,87 @@ python3 -m pip install -f https://release.oneflow.info <span class="nv">oneflow<
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: &quot;https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip&quot; to /workspace/.oneflow/flowvision_cache/resnet18.zip
 
   0%|          | 0.00/41.5M [00:00&lt;?, ?B/s]
-  0%|          | 16.0k/41.5M [00:00&lt;07:51, 92.3kB/s]
-  0%|          | 48.0k/41.5M [00:00&lt;04:57, 146kB/s]
-  0%|          | 72.0k/41.5M [00:00&lt;05:05, 142kB/s]
-  0%|          | 144k/41.5M [00:00&lt;02:53, 249kB/s]
-  1%|          | 288k/41.5M [00:00&lt;01:34, 457kB/s]
-  1%|1         | 584k/41.5M [00:01&lt;00:48, 879kB/s]
-  2%|2         | 992k/41.5M [00:01&lt;00:31, 1.36MB/s]
-  3%|3         | 1.36M/41.5M [00:01&lt;00:25, 1.66MB/s]
-  4%|4         | 1.76M/41.5M [00:01&lt;00:22, 1.87MB/s]
-  5%|5         | 2.16M/41.5M [00:01&lt;00:20, 2.03MB/s]
-  6%|6         | 2.60M/41.5M [00:01&lt;00:18, 2.19MB/s]
-  7%|7         | 3.05M/41.5M [00:02&lt;00:17, 2.32MB/s]
-  8%|8         | 3.51M/41.5M [00:02&lt;00:16, 2.44MB/s]
- 10%|9         | 3.98M/41.5M [00:02&lt;00:15, 2.53MB/s]
- 11%|#         | 4.48M/41.5M [00:02&lt;00:14, 2.66MB/s]
- 12%|#2        | 5.00M/41.5M [00:02&lt;00:13, 2.78MB/s]
- 13%|#3        | 5.55M/41.5M [00:03&lt;00:12, 2.91MB/s]
- 15%|#4        | 6.12M/41.5M [00:03&lt;00:12, 3.04MB/s]
- 16%|#6        | 6.70M/41.5M [00:03&lt;00:11, 3.16MB/s]
- 18%|#7        | 7.30M/41.5M [00:03&lt;00:10, 3.27MB/s]
- 19%|#9        | 7.93M/41.5M [00:03&lt;00:10, 3.39MB/s]
- 21%|##        | 8.57M/41.5M [00:03&lt;00:09, 3.51MB/s]
- 22%|##2       | 9.23M/41.5M [00:04&lt;00:09, 3.62MB/s]
- 24%|##3       | 9.93M/41.5M [00:04&lt;00:08, 3.76MB/s]
- 26%|##5       | 10.7M/41.5M [00:04&lt;00:08, 3.93MB/s]
- 28%|##7       | 11.4M/41.5M [00:04&lt;00:07, 4.10MB/s]
- 29%|##9       | 12.2M/41.5M [00:04&lt;00:07, 4.29MB/s]
- 32%|###1      | 13.1M/41.5M [00:04&lt;00:06, 4.47MB/s]
- 34%|###3      | 14.0M/41.5M [00:05&lt;00:06, 4.69MB/s]
- 36%|###5      | 14.9M/41.5M [00:05&lt;00:05, 4.91MB/s]
- 38%|###8      | 15.8M/41.5M [00:05&lt;00:05, 5.14MB/s]
- 41%|####      | 16.9M/41.5M [00:05&lt;00:04, 5.38MB/s]
- 43%|####3     | 17.9M/41.5M [00:05&lt;00:04, 5.62MB/s]
- 46%|####5     | 19.0M/41.5M [00:06&lt;00:04, 5.89MB/s]
- 49%|####8     | 20.2M/41.5M [00:06&lt;00:03, 6.18MB/s]
- 52%|#####1    | 21.4M/41.5M [00:06&lt;00:03, 6.48MB/s]
- 55%|#####4    | 22.7M/41.5M [00:06&lt;00:02, 6.80MB/s]
- 58%|#####7    | 24.0M/41.5M [00:06&lt;00:02, 7.13MB/s]
- 61%|######1   | 25.5M/41.5M [00:06&lt;00:02, 7.49MB/s]
- 65%|######4   | 26.9M/41.5M [00:07&lt;00:01, 7.84MB/s]
- 68%|######8   | 28.4M/41.5M [00:07&lt;00:01, 8.09MB/s]
- 72%|#######2  | 29.9M/41.5M [00:07&lt;00:01, 8.27MB/s]
- 76%|#######5  | 31.4M/41.5M [00:07&lt;00:01, 8.39MB/s]
- 79%|#######9  | 32.8M/41.5M [00:07&lt;00:01, 8.48MB/s]
- 83%|########2 | 34.3M/41.5M [00:08&lt;00:00, 8.52MB/s]
- 86%|########6 | 35.8M/41.5M [00:08&lt;00:00, 8.58MB/s]
- 90%|########9 | 37.3M/41.5M [00:08&lt;00:00, 8.60MB/s]
- 93%|#########3| 38.7M/41.5M [00:08&lt;00:00, 8.61MB/s]
- 97%|#########6| 40.2M/41.5M [00:08&lt;00:00, 8.63MB/s]
-100%|##########| 41.5M/41.5M [00:08&lt;00:00, 4.97MB/s]
+  0%|          | 16.0k/41.5M [00:00&lt;07:47, 93.1kB/s]
+  0%|          | 48.0k/41.5M [00:00&lt;04:55, 147kB/s]
+  0%|          | 96.0k/41.5M [00:00&lt;03:26, 211kB/s]
+  0%|          | 184k/41.5M [00:00&lt;02:13, 325kB/s]
+  1%|          | 272k/41.5M [00:00&lt;01:50, 391kB/s]
+  1%|          | 368k/41.5M [00:01&lt;01:36, 447kB/s]
+  1%|1         | 464k/41.5M [00:01&lt;01:29, 482kB/s]
+  1%|1         | 568k/41.5M [00:01&lt;01:22, 520kB/s]
+  2%|1         | 672k/41.5M [00:01&lt;01:17, 552kB/s]
+  2%|1         | 784k/41.5M [00:01&lt;01:13, 582kB/s]
+  2%|2         | 904k/41.5M [00:01&lt;01:09, 616kB/s]
+  2%|2         | 1.01M/41.5M [00:02&lt;01:05, 647kB/s]
+  3%|2         | 1.14M/41.5M [00:02&lt;01:01, 688kB/s]
+  3%|3         | 1.27M/41.5M [00:02&lt;00:58, 718kB/s]
+  3%|3         | 1.41M/41.5M [00:02&lt;00:55, 761kB/s]
+  4%|3         | 1.57M/41.5M [00:02&lt;00:52, 801kB/s]
+  4%|4         | 1.73M/41.5M [00:03&lt;00:49, 838kB/s]
+  5%|4         | 1.88M/41.5M [00:03&lt;00:47, 875kB/s]
+  5%|4         | 2.05M/41.5M [00:03&lt;00:45, 918kB/s]
+  5%|5         | 2.23M/41.5M [00:03&lt;00:43, 948kB/s]
+  6%|5         | 2.41M/41.5M [00:03&lt;00:41, 983kB/s]
+  6%|6         | 2.59M/41.5M [00:03&lt;00:39, 1.02MB/s]
+  7%|6         | 2.80M/41.5M [00:04&lt;00:38, 1.06MB/s]
+  7%|7         | 3.00M/41.5M [00:04&lt;00:36, 1.11MB/s]
+  8%|7         | 3.21M/41.5M [00:04&lt;00:34, 1.15MB/s]
+  8%|8         | 3.43M/41.5M [00:04&lt;00:33, 1.21MB/s]
+  9%|8         | 3.66M/41.5M [00:04&lt;00:31, 1.26MB/s]
+  9%|9         | 3.91M/41.5M [00:04&lt;00:30, 1.31MB/s]
+ 10%|#         | 4.17M/41.5M [00:05&lt;00:28, 1.38MB/s]
+ 11%|#         | 4.44M/41.5M [00:05&lt;00:26, 1.45MB/s]
+ 11%|#1        | 4.73M/41.5M [00:05&lt;00:25, 1.51MB/s]
+ 12%|#2        | 5.02M/41.5M [00:05&lt;00:23, 1.60MB/s]
+ 13%|#2        | 5.34M/41.5M [00:05&lt;00:22, 1.68MB/s]
+ 14%|#3        | 5.66M/41.5M [00:06&lt;00:21, 1.76MB/s]
+ 14%|#4        | 6.01M/41.5M [00:06&lt;00:18, 1.99MB/s]
+ 15%|#5        | 6.36M/41.5M [00:06&lt;00:16, 2.18MB/s]
+ 16%|#6        | 6.65M/41.5M [00:06&lt;00:16, 2.20MB/s]
+ 17%|#6        | 6.87M/41.5M [00:06&lt;00:17, 2.04MB/s]
+ 17%|#7        | 7.11M/41.5M [00:06&lt;00:19, 1.87MB/s]
+ 18%|#8        | 7.52M/41.5M [00:06&lt;00:17, 2.05MB/s]
+ 19%|#9        | 7.94M/41.5M [00:07&lt;00:16, 2.19MB/s]
+ 20%|##        | 8.38M/41.5M [00:07&lt;00:14, 2.32MB/s]
+ 21%|##1       | 8.83M/41.5M [00:07&lt;00:14, 2.43MB/s]
+ 22%|##2       | 9.30M/41.5M [00:07&lt;00:13, 2.55MB/s]
+ 24%|##3       | 9.80M/41.5M [00:07&lt;00:12, 2.68MB/s]
+ 25%|##4       | 10.3M/41.5M [00:07&lt;00:11, 2.79MB/s]
+ 26%|##6       | 10.9M/41.5M [00:08&lt;00:10, 2.93MB/s]
+ 28%|##7       | 11.4M/41.5M [00:08&lt;00:09, 3.33MB/s]
+ 29%|##8       | 12.0M/41.5M [00:08&lt;00:08, 3.63MB/s]
+ 30%|##9       | 12.4M/41.5M [00:08&lt;00:08, 3.66MB/s]
+ 31%|###       | 12.7M/41.5M [00:08&lt;00:09, 3.14MB/s]
+ 32%|###1      | 13.3M/41.5M [00:08&lt;00:09, 3.17MB/s]
+ 34%|###3      | 13.9M/41.5M [00:08&lt;00:07, 3.73MB/s]
+ 35%|###5      | 14.6M/41.5M [00:09&lt;00:07, 3.85MB/s]
+ 37%|###6      | 15.3M/41.5M [00:09&lt;00:07, 3.72MB/s]
+ 39%|###8      | 16.1M/41.5M [00:09&lt;00:06, 4.24MB/s]
+ 41%|####      | 16.9M/41.5M [00:09&lt;00:05, 4.68MB/s]
+ 42%|####2     | 17.5M/41.5M [00:09&lt;00:04, 5.09MB/s]
+ 43%|####3     | 18.0M/41.5M [00:09&lt;00:05, 4.46MB/s]
+ 45%|####4     | 18.6M/41.5M [00:10&lt;00:05, 4.52MB/s]
+ 47%|####6     | 19.5M/41.5M [00:10&lt;00:04, 5.20MB/s]
+ 48%|####8     | 20.1M/41.5M [00:10&lt;00:04, 5.57MB/s]
+ 50%|####9     | 20.7M/41.5M [00:10&lt;00:04, 4.77MB/s]
+ 52%|#####1    | 21.4M/41.5M [00:10&lt;00:04, 5.25MB/s]
+ 54%|#####4    | 22.5M/41.5M [00:10&lt;00:03, 6.51MB/s]
+ 56%|#####5    | 23.1M/41.5M [00:10&lt;00:03, 6.11MB/s]
+ 57%|#####7    | 23.7M/41.5M [00:10&lt;00:03, 5.26MB/s]
+ 60%|#####9    | 24.7M/41.5M [00:11&lt;00:03, 5.61MB/s]
+ 62%|######2   | 25.9M/41.5M [00:11&lt;00:02, 6.65MB/s]
+ 65%|######5   | 27.2M/41.5M [00:11&lt;00:01, 8.09MB/s]
+ 67%|######7   | 28.0M/41.5M [00:11&lt;00:01, 7.52MB/s]
+ 69%|######9   | 28.8M/41.5M [00:11&lt;00:02, 6.51MB/s]
+ 72%|#######2  | 29.9M/41.5M [00:11&lt;00:01, 6.86MB/s]
+ 76%|#######5  | 31.4M/41.5M [00:12&lt;00:01, 7.41MB/s]
+ 79%|#######9  | 32.8M/41.5M [00:12&lt;00:01, 7.83MB/s]
+ 83%|########2 | 34.3M/41.5M [00:12&lt;00:00, 7.93MB/s]
+ 86%|########6 | 35.8M/41.5M [00:12&lt;00:00, 8.17MB/s]
+ 90%|########9 | 37.2M/41.5M [00:12&lt;00:00, 8.33MB/s]
+ 93%|#########3| 38.7M/41.5M [00:12&lt;00:00, 8.44MB/s]
+ 97%|#########6| 40.2M/41.5M [00:13&lt;00:00, 8.52MB/s]
+100%|##########| 41.5M/41.5M [00:13&lt;00:00, 3.31MB/s]
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/compile_models/from_paddle.html b/docs/how_to/compile_models/from_paddle.html
index 55154ac3f..936e40caf 100644
--- a/docs/how_to/compile_models/from_paddle.html
+++ b/docs/how_to/compile_models/from_paddle.html
@@ -469,7 +469,7 @@ A quick solution is</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>TVM prediction top-1 id: 282, class name:  282: &#39;tiger cat&#39;,
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  7.419 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  7.170 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-paddle-py">
 <div class="sphx-glr-download docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/16269b77359771348d507395692524cf/from_paddle.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_paddle.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/from_pytorch.html b/docs/how_to/compile_models/from_pytorch.html
index 86809c7b9..1823793ce 100644
--- a/docs/how_to/compile_models/from_pytorch.html
+++ b/docs/how_to/compile_models/from_pytorch.html
@@ -387,9 +387,10 @@ be unstable.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: &quot;https://download.pytorch.org/models/resnet18-f37072fd.pth&quot; to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
 
   0%|          | 0.00/44.7M [00:00&lt;?, ?B/s]
- 19%|#9        | 8.62M/44.7M [00:00&lt;00:00, 90.3MB/s]
- 71%|#######   | 31.6M/44.7M [00:00&lt;00:00, 179MB/s]
-100%|##########| 44.7M/44.7M [00:00&lt;00:00, 183MB/s]
+  7%|7         | 3.27M/44.7M [00:00&lt;00:01, 33.9MB/s]
+ 15%|#4        | 6.70M/44.7M [00:00&lt;00:01, 35.0MB/s]
+ 73%|#######2  | 32.6M/44.7M [00:00&lt;00:00, 142MB/s]
+100%|##########| 44.7M/44.7M [00:00&lt;00:00, 135MB/s]
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/compile_models/from_tensorflow.html b/docs/how_to/compile_models/from_tensorflow.html
index 47bd7e851..295127083 100644
--- a/docs/how_to/compile_models/from_tensorflow.html
+++ b/docs/how_to/compile_models/from_tensorflow.html
@@ -612,7 +612,6 @@ banana (score = 0.00022)
 desk (score = 0.00019)
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  3.030 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-tensorflow-py">
 <div class="sphx-glr-download docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7f1d3d1b878694c201c614c807cdebc8/from_tensorflow.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_tensorflow.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/sg_execution_times.html b/docs/how_to/compile_models/sg_execution_times.html
index 7f24ba177..71dfe0923 100644
--- a/docs/how_to/compile_models/sg_execution_times.html
+++ b/docs/how_to/compile_models/sg_execution_times.html
@@ -300,17 +300,17 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-compile-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>05:38.347</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
+<p><strong>05:21.764</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
 <ul class="simple">
-<li><p><strong>01:07.419</strong>: <a class="reference internal" href="from_paddle.html#sphx-glr-how-to-compile-models-from-paddle-py"><span class="std std-ref">Compile PaddlePaddle Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_paddle.py</span></code>)</p></li>
-<li><p><strong>01:03.030</strong>: <a class="reference internal" href="from_tensorflow.html#sphx-glr-how-to-compile-models-from-tensorflow-py"><span class="std std-ref">Compile Tensorflow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tensorflow.py</span></code>)</p></li>
-<li><p><strong>00:59.272</strong>: <a class="reference internal" href="from_darknet.html#sphx-glr-how-to-compile-models-from-darknet-py"><span class="std std-ref">Compile YOLO-V2 and YOLO-V3 in DarkNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_darknet.py</span></code>)</p></li>
-<li><p><strong>00:33.312</strong>: <a class="reference internal" href="from_oneflow.html#sphx-glr-how-to-compile-models-from-oneflow-py"><span class="std std-ref">Compile OneFlow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_oneflow.py</span></code>)</p></li>
-<li><p><strong>00:27.046</strong>: <a class="reference internal" href="from_keras.html#sphx-glr-how-to-compile-models-from-keras-py"><span class="std std-ref">Compile Keras Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_keras.py</span></code>)</p></li>
-<li><p><strong>00:24.126</strong>: <a class="reference internal" href="from_tflite.html#sphx-glr-how-to-compile-models-from-tflite-py"><span class="std std-ref">Compile TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tflite.py</span></code>)</p></li>
-<li><p><strong>00:21.479</strong>: <a class="reference internal" href="from_coreml.html#sphx-glr-how-to-compile-models-from-coreml-py"><span class="std std-ref">Compile CoreML Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_coreml.py</span></code>)</p></li>
-<li><p><strong>00:21.297</strong>: <a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></li>
-<li><p><strong>00:18.867</strong>: <a class="reference internal" href="from_pytorch.html#sphx-glr-how-to-compile-models-from-pytorch-py"><span class="std std-ref">Compile PyTorch Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_pytorch.py</span></code>)</p></li>
+<li><p><strong>01:07.170</strong>: <a class="reference internal" href="from_paddle.html#sphx-glr-how-to-compile-models-from-paddle-py"><span class="std std-ref">Compile PaddlePaddle Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_paddle.py</span></code>)</p></li>
+<li><p><strong>00:59.772</strong>: <a class="reference internal" href="from_tensorflow.html#sphx-glr-how-to-compile-models-from-tensorflow-py"><span class="std std-ref">Compile Tensorflow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tensorflow.py</span></code>)</p></li>
+<li><p><strong>00:55.447</strong>: <a class="reference internal" href="from_darknet.html#sphx-glr-how-to-compile-models-from-darknet-py"><span class="std std-ref">Compile YOLO-V2 and YOLO-V3 in DarkNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_darknet.py</span></code>)</p></li>
+<li><p><strong>00:37.304</strong>: <a class="reference internal" href="from_oneflow.html#sphx-glr-how-to-compile-models-from-oneflow-py"><span class="std std-ref">Compile OneFlow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_oneflow.py</span></code>)</p></li>
+<li><p><strong>00:23.793</strong>: <a class="reference internal" href="from_tflite.html#sphx-glr-how-to-compile-models-from-tflite-py"><span class="std std-ref">Compile TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tflite.py</span></code>)</p></li>
+<li><p><strong>00:21.971</strong>: <a class="reference internal" href="from_coreml.html#sphx-glr-how-to-compile-models-from-coreml-py"><span class="std std-ref">Compile CoreML Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_coreml.py</span></code>)</p></li>
+<li><p><strong>00:21.093</strong>: <a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></li>
+<li><p><strong>00:19.394</strong>: <a class="reference internal" href="from_pytorch.html#sphx-glr-how-to-compile-models-from-pytorch-py"><span class="std std-ref">Compile PyTorch Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_pytorch.py</span></code>)</p></li>
+<li><p><strong>00:13.320</strong>: <a class="reference internal" href="from_keras.html#sphx-glr-how-to-compile-models-from-keras-py"><span class="std std-ref">Compile Keras Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_keras.py</span></code>)</p></li>
 <li><p><strong>00:02.499</strong>: <a class="reference internal" href="from_onnx.html#sphx-glr-how-to-compile-models-from-onnx-py"><span class="std std-ref">Compile ONNX Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_onnx.py</span></code>)</p></li>
 </ul>
 </div>
diff --git a/docs/how_to/deploy_models/deploy_model_on_android.html b/docs/how_to/deploy_models/deploy_model_on_android.html
index 7c9b79d50..6d80bc087 100644
--- a/docs/how_to/deploy_models/deploy_model_on_android.html
+++ b/docs/how_to/deploy_models/deploy_model_on_android.html
@@ -627,7 +627,7 @@ to the remote android device.</p>
 Evaluate inference time cost...
 Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  16.0851      15.9495      16.5632      15.8696       0.2554
+  16.0734      15.9684      16.5769      15.6808       0.3410
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
index 4f617f85a..0d238721b 100644
--- a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
+++ b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
@@ -409,16 +409,14 @@ be unstable.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: &quot;https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth&quot; to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
 
   0%|          | 0.00/170M [00:00&lt;?, ?B/s]
-  0%|          | 496k/170M [00:00&lt;00:36, 4.93MB/s]
-  1%|          | 984k/170M [00:00&lt;00:39, 4.54MB/s]
- 13%|#3        | 22.9M/170M [00:00&lt;00:01, 102MB/s]
- 23%|##2       | 38.9M/170M [00:00&lt;00:01, 127MB/s]
- 36%|###6      | 61.8M/170M [00:00&lt;00:00, 167MB/s]
- 50%|#####     | 85.2M/170M [00:00&lt;00:00, 193MB/s]
- 65%|######5   | 110M/170M [00:00&lt;00:00, 216MB/s]
- 79%|#######9  | 134M/170M [00:00&lt;00:00, 227MB/s]
- 92%|#########1| 156M/170M [00:00&lt;00:00, 209MB/s]
-100%|##########| 170M/170M [00:01&lt;00:00, 177MB/s]
+  9%|8         | 14.4M/170M [00:00&lt;00:01, 151MB/s]
+ 22%|##1       | 37.0M/170M [00:00&lt;00:00, 201MB/s]
+ 35%|###5      | 59.8M/170M [00:00&lt;00:00, 219MB/s]
+ 47%|####7     | 80.6M/170M [00:00&lt;00:00, 217MB/s]
+ 62%|######2   | 106M/170M [00:00&lt;00:00, 235MB/s]
+ 78%|#######7  | 132M/170M [00:00&lt;00:00, 246MB/s]
+ 91%|#########1| 155M/170M [00:00&lt;00:00, 247MB/s]
+100%|##########| 170M/170M [00:00&lt;00:00, 234MB/s]
 /usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:3878: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
   for i in range(dim)
 /usr/local/lib/python3.7/dist-packages/torchvision/models/detection/anchor_utils.py:127: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the &#39;trunc&#39; function NOT &#39;floor&#39;). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode=&#39;trunc&#39;), or for actual floor division, use torch.div(a, b, rounding_mode=&#39;floor&#39;).
@@ -516,7 +514,7 @@ torchvision rcnn models.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Get 9 valid boxes
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  4.298 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  1.705 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-object-detection-pytorch-py">
 <div class="sphx-glr-download docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7795da4b258c8feff986668b95ef57ad/deploy_object_detection_pytorch.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_object_detection_pytorch.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized.html b/docs/how_to/deploy_models/deploy_prequantized.html
index 53ef30e89..7b339c306 100644
--- a/docs/how_to/deploy_models/deploy_prequantized.html
+++ b/docs/how_to/deploy_models/deploy_prequantized.html
@@ -450,8 +450,9 @@ training. Other models require a full post training calibration.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: &quot;https://download.pytorch.org/models/mobilenet_v2-b0353104.pth&quot; to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
 
   0%|          | 0.00/13.6M [00:00&lt;?, ?B/s]
- 71%|#######1  | 9.62M/13.6M [00:00&lt;00:00, 101MB/s]
-100%|##########| 13.6M/13.6M [00:00&lt;00:00, 122MB/s]
+ 30%|##9       | 4.04M/13.6M [00:00&lt;00:00, 42.3MB/s]
+ 61%|######1   | 8.31M/13.6M [00:00&lt;00:00, 43.8MB/s]
+100%|##########| 13.6M/13.6M [00:00&lt;00:00, 61.3MB/s]
 </pre></div>
 </div>
 </div>
@@ -545,7 +546,7 @@ output values are identical out of 1000 outputs from mobilenet v2.</p>
 <p class="sphx-glr-script-out">Out:</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  90.2391      90.1584      91.7401      90.0002       0.2421
+  90.3358      90.2240      95.5686      90.0991       0.5598
 </pre></div>
 </div>
 <div class="admonition note">
@@ -584,7 +585,7 @@ This includes support for the VNNI 8 bit dot product instruction (CascadeLake or
 <div class="section" id="deploy-a-quantized-tflite-model">
 <h2>Deploy a quantized TFLite Model<a class="headerlink" href="#deploy-a-quantized-tflite-model" title="Permalink to this headline">¶</a></h2>
 <p>TODO</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  5.584 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  5.752 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-py">
 <div class="sphx-glr-download docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/fb8217c13f4351224c6cf3aacf1a87fc/deploy_prequantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized_tflite.html b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
index 7417e4e36..138a04726 100644
--- a/docs/how_to/deploy_models/deploy_prequantized_tflite.html
+++ b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
@@ -545,7 +545,7 @@ TFLite Top-5 labels: [387 102 386 341 349]
 <p class="sphx-glr-script-out">Out:</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  118.6064     118.5160     125.6847     117.7654      0.7983
+  118.8476     118.7190     128.0987     117.9396      1.0735
 </pre></div>
 </div>
 <div class="admonition note">
@@ -573,7 +573,7 @@ network for ARM CPU</span></a>.</p></li>
 </ul>
 </div></blockquote>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  4.454 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  56.256 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-tflite-py">
 <div class="sphx-glr-download docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/56691c7a27d45da61d112276334640d3/deploy_prequantized_tflite.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized_tflite.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_quantized.html b/docs/how_to/deploy_models/deploy_quantized.html
index 8e5ea1fa6..c1a7430b5 100644
--- a/docs/how_to/deploy_models/deploy_quantized.html
+++ b/docs/how_to/deploy_models/deploy_quantized.html
@@ -482,7 +482,7 @@ for calibration. But the accuracy might be impacted.</p>
   DeprecationWarning,
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  31.885 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  51.334 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-quantized-py">
 <div class="sphx-glr-download docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7810ecf51bfc05f7d5e8a400ac3e815d/deploy_quantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_quantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
index c2c62ef21..b45b8473c 100644
--- a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
+++ b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
@@ -415,22 +415,22 @@ to your device.</p>
 Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
 
   0%|          | 0/132723 [00:00&lt;?, ?KB/s]
-  5%|4         | 6345/132723 [00:00&lt;00:01, 63435.84KB/s]
- 11%|#1        | 15063/132723 [00:00&lt;00:01, 77394.05KB/s]
- 18%|#8        | 23903/132723 [00:00&lt;00:01, 82416.06KB/s]
- 25%|##4       | 32725/132723 [00:00&lt;00:01, 84702.37KB/s]
- 31%|###1      | 41509/132723 [00:00&lt;00:01, 85828.95KB/s]
- 38%|###7      | 50347/132723 [00:00&lt;00:00, 86692.81KB/s]
- 45%|####4     | 59254/132723 [00:00&lt;00:00, 87468.15KB/s]
- 51%|#####1    | 68159/132723 [00:00&lt;00:00, 87968.46KB/s]
- 58%|#####8    | 77041/132723 [00:00&lt;00:00, 88231.64KB/s]
- 65%|######4   | 85921/132723 [00:01&lt;00:00, 88404.39KB/s]
- 71%|#######1  | 94775/132723 [00:01&lt;00:00, 88443.59KB/s]
- 78%|#######8  | 103620/132723 [00:01&lt;00:00, 88359.89KB/s]
- 85%|########4 | 112529/132723 [00:01&lt;00:00, 88578.24KB/s]
- 91%|#########1| 121436/132723 [00:01&lt;00:00, 88717.79KB/s]
- 98%|#########8| 130337/132723 [00:01&lt;00:00, 88802.67KB/s]
-100%|##########| 132723/132723 [00:01&lt;00:00, 86789.35KB/s]
+  5%|4         | 6594/132723 [00:00&lt;00:01, 65922.46KB/s]
+ 11%|#1        | 15107/132723 [00:00&lt;00:01, 77217.54KB/s]
+ 18%|#7        | 23884/132723 [00:00&lt;00:01, 82032.74KB/s]
+ 24%|##4       | 32088/132723 [00:00&lt;00:01, 77000.78KB/s]
+ 30%|###       | 39832/132723 [00:00&lt;00:01, 68793.41KB/s]
+ 37%|###6      | 48487/132723 [00:00&lt;00:01, 74225.49KB/s]
+ 43%|####2     | 56691/132723 [00:00&lt;00:00, 76600.03KB/s]
+ 49%|####9     | 65442/132723 [00:00&lt;00:00, 79902.42KB/s]
+ 56%|#####5    | 74129/132723 [00:00&lt;00:00, 82004.80KB/s]
+ 62%|######2   | 82867/132723 [00:01&lt;00:00, 83621.97KB/s]
+ 69%|######8   | 91504/132723 [00:01&lt;00:00, 84438.72KB/s]
+ 76%|#######5  | 100269/132723 [00:01&lt;00:00, 85400.43KB/s]
+ 82%|########2 | 109030/132723 [00:01&lt;00:00, 86063.02KB/s]
+ 89%|########8 | 117744/132723 [00:01&lt;00:00, 86376.21KB/s]
+ 95%|#########5| 126445/132723 [00:01&lt;00:00, 86564.76KB/s]
+100%|##########| 132723/132723 [00:01&lt;00:00, 81643.73KB/s]
 </pre></div>
 </div>
 <p>Create TVM runtime and do inference
@@ -475,7 +475,7 @@ Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from h
 </pre></div>
 </div>
 <img alt="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" class="sphx-glr-single-img" src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" />
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  39.070 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  38.624 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-ssd-gluoncv-py">
 <div class="sphx-glr-download docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/cccb17d28e5e8b2e94ea8cd5ec59f6ed/deploy_ssd_gluoncv.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_ssd_gluoncv.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/sg_execution_times.html b/docs/how_to/deploy_models/sg_execution_times.html
index 513da2767..8cb51d2cb 100644
--- a/docs/how_to/deploy_models/sg_execution_times.html
+++ b/docs/how_to/deploy_models/sg_execution_times.html
@@ -300,16 +300,16 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-deploy-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>11:16.413</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
+<p><strong>11:23.667</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
 <ul class="simple">
-<li><p><strong>03:04.298</strong>: <a class="reference internal" href="deploy_object_detection_pytorch.html#sphx-glr-how-to-deploy-models-deploy-object-detection-pytorch-py"><span class="std std-ref">Compile PyTorch Object Detection Models</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_object_detection_pytorch.py</span></code>)</p></li>
-<li><p><strong>02:39.070</strong>: <a class="reference internal" href="deploy_ssd_gluoncv.html#sphx-glr-how-to-deploy-models-deploy-ssd-gluoncv-py"><span class="std std-ref">Deploy Single Shot Multibox Detector(SSD) model</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_ssd_gluoncv.py</span></code>)</p></li>
-<li><p><strong>02:04.454</strong>: <a class="reference internal" href="deploy_prequantized_tflite.html#sphx-glr-how-to-deploy-models-deploy-prequantized-tflite-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite)</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized_tflite.py</span></code>)</p></li>
-<li><p><strong>01:31.885</strong>: <a class="reference internal" href="deploy_quantized.html#sphx-glr-how-to-deploy-models-deploy-quantized-py"><span class="std std-ref">Deploy a Quantized Model on Cuda</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_quantized.py</span></code>)</p></li>
-<li><p><strong>01:05.584</strong>: <a class="reference internal" href="deploy_prequantized.html#sphx-glr-how-to-deploy-models-deploy-prequantized-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized.py</span></code>)</p></li>
-<li><p><strong>00:28.683</strong>: <a class="reference internal" href="deploy_model_on_android.html#sphx-glr-how-to-deploy-models-deploy-model-on-android-py"><span class="std std-ref">Deploy the Pretrained Model on Android</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_android.py</span></code>)</p></li>
-<li><p><strong>00:22.231</strong>: <a class="reference internal" href="deploy_model_on_rasp.html#sphx-glr-how-to-deploy-models-deploy-model-on-rasp-py"><span class="std std-ref">Deploy the Pretrained Model on Raspberry Pi</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_rasp.py</span></code>)</p></li>
-<li><p><strong>00:00.207</strong>: <a class="reference internal" href="deploy_sparse.html#sphx-glr-how-to-deploy-models-deploy-sparse-py"><span class="std std-ref">Deploy a Hugging Face Pruned Model on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_sparse.py</span></code>)</p></li>
+<li><p><strong>03:01.705</strong>: <a class="reference internal" href="deploy_object_detection_pytorch.html#sphx-glr-how-to-deploy-models-deploy-object-detection-pytorch-py"><span class="std std-ref">Compile PyTorch Object Detection Models</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_object_detection_pytorch.py</span></code>)</p></li>
+<li><p><strong>02:38.624</strong>: <a class="reference internal" href="deploy_ssd_gluoncv.html#sphx-glr-how-to-deploy-models-deploy-ssd-gluoncv-py"><span class="std std-ref">Deploy Single Shot Multibox Detector(SSD) model</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_ssd_gluoncv.py</span></code>)</p></li>
+<li><p><strong>01:56.256</strong>: <a class="reference internal" href="deploy_prequantized_tflite.html#sphx-glr-how-to-deploy-models-deploy-prequantized-tflite-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite)</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized_tflite.py</span></code>)</p></li>
+<li><p><strong>01:51.334</strong>: <a class="reference internal" href="deploy_quantized.html#sphx-glr-how-to-deploy-models-deploy-quantized-py"><span class="std std-ref">Deploy a Quantized Model on Cuda</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_quantized.py</span></code>)</p></li>
+<li><p><strong>01:05.752</strong>: <a class="reference internal" href="deploy_prequantized.html#sphx-glr-how-to-deploy-models-deploy-prequantized-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized.py</span></code>)</p></li>
+<li><p><strong>00:27.760</strong>: <a class="reference internal" href="deploy_model_on_android.html#sphx-glr-how-to-deploy-models-deploy-model-on-android-py"><span class="std std-ref">Deploy the Pretrained Model on Android</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_android.py</span></code>)</p></li>
+<li><p><strong>00:22.042</strong>: <a class="reference internal" href="deploy_model_on_rasp.html#sphx-glr-how-to-deploy-models-deploy-model-on-rasp-py"><span class="std std-ref">Deploy the Pretrained Model on Raspberry Pi</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_rasp.py</span></code>)</p></li>
+<li><p><strong>00:00.193</strong>: <a class="reference internal" href="deploy_sparse.html#sphx-glr-how-to-deploy-models-deploy-sparse-py"><span class="std std-ref">Deploy a Hugging Face Pruned Model on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_sparse.py</span></code>)</p></li>
 </ul>
 </div>
 
diff --git a/docs/how_to/extend_tvm/bring_your_own_datatypes.html b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
index 8ae2f0ba6..2c1ab78b8 100644
--- a/docs/how_to/extend_tvm/bring_your_own_datatypes.html
+++ b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
@@ -590,7 +590,7 @@ In this alpha state of the Bring Your Own Datatypes framework, we have not imple
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip56f6cf58-0e13-4247-9e1a-4d304384b2ba from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip70153db1-eac3-460b-9858-61569c71e123 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
 </pre></div>
 </div>
 <p>It’s easy to execute MobileNet with native TVM:</p>
diff --git a/docs/how_to/extend_tvm/sg_execution_times.html b/docs/how_to/extend_tvm/sg_execution_times.html
index 40c09c580..458e5e3db 100644
--- a/docs/how_to/extend_tvm/sg_execution_times.html
+++ b/docs/how_to/extend_tvm/sg_execution_times.html
@@ -300,12 +300,12 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-extend-tvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:38.488</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
+<p><strong>00:38.041</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
 <ul class="simple">
-<li><p><strong>00:34.938</strong>: <a class="reference internal" href="bring_your_own_datatypes.html#sphx-glr-how-to-extend-tvm-bring-your-own-datatypes-py"><span class="std std-ref">Bring Your Own Datatypes to TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">bring_your_own_datatypes.py</span></code>)</p></li>
-<li><p><strong>00:02.285</strong>: <a class="reference internal" href="use_pass_instrument.html#sphx-glr-how-to-extend-tvm-use-pass-instrument-py"><span class="std std-ref">How to Use TVM Pass Instrument</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_instrument.py</span></code>)</p></li>
-<li><p><strong>00:01.055</strong>: <a class="reference internal" href="use_pass_infra.html#sphx-glr-how-to-extend-tvm-use-pass-infra-py"><span class="std std-ref">How to Use TVM Pass Infra</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_infra.py</span></code>)</p></li>
-<li><p><strong>00:00.210</strong>: <a class="reference internal" href="low_level_custom_pass.html#sphx-glr-how-to-extend-tvm-low-level-custom-pass-py"><span class="std std-ref">Writing a Customized Pass</span></a> (<code class="docutils literal notranslate"><span class="pre">low_level_custom_pass.py</span></code>)</p></li>
+<li><p><strong>00:34.545</strong>: <a class="reference internal" href="bring_your_own_datatypes.html#sphx-glr-how-to-extend-tvm-bring-your-own-datatypes-py"><span class="std std-ref">Bring Your Own Datatypes to TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">bring_your_own_datatypes.py</span></code>)</p></li>
+<li><p><strong>00:02.256</strong>: <a class="reference internal" href="use_pass_instrument.html#sphx-glr-how-to-extend-tvm-use-pass-instrument-py"><span class="std std-ref">How to Use TVM Pass Instrument</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_instrument.py</span></code>)</p></li>
+<li><p><strong>00:01.037</strong>: <a class="reference internal" href="use_pass_infra.html#sphx-glr-how-to-extend-tvm-use-pass-infra-py"><span class="std std-ref">How to Use TVM Pass Infra</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_infra.py</span></code>)</p></li>
+<li><p><strong>00:00.203</strong>: <a class="reference internal" href="low_level_custom_pass.html#sphx-glr-how-to-extend-tvm-low-level-custom-pass-py"><span class="std std-ref">Writing a Customized Pass</span></a> (<code class="docutils literal notranslate"><span class="pre">low_level_custom_pass.py</span></code>)</p></li>
 </ul>
 </div>
 
diff --git a/docs/how_to/extend_tvm/use_pass_instrument.html b/docs/how_to/extend_tvm/use_pass_instrument.html
index 6ff500eb5..2595bcd33 100644
--- a/docs/how_to/extend_tvm/use_pass_instrument.html
+++ b/docs/how_to/extend_tvm/use_pass_instrument.html
@@ -486,10 +486,10 @@ profile the execution time of each passes.</p>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 6042us [6042us] (44.79%; 44.79%)
-FoldScaleAxis: 7446us [5us] (55.21%; 55.21%)
-        FoldConstant: 7441us [1494us] (55.17%; 99.93%)
-                InferType: 5947us [5947us] (44.09%; 79.92%)
+InferType: 5991us [5991us] (45.53%; 45.53%)
+FoldScaleAxis: 7168us [5us] (54.47%; 54.47%)
+        FoldConstant: 7162us [1462us] (54.43%; 99.92%)
+                InferType: 5700us [5700us] (43.31%; 79.58%)
 </pre></div>
 </div>
 </div>
@@ -512,10 +512,10 @@ Refer to following sections and <a class="reference internal" href="../../refere
 </div>
 <p class="sphx-glr-script-out">Out:</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 5874us [5874us] (43.91%; 43.91%)
-FoldScaleAxis: 7504us [5us] (56.09%; 56.09%)
-        FoldConstant: 7499us [1758us] (56.06%; 99.93%)
-                InferType: 5741us [5741us] (42.92%; 76.56%)
+InferType: 5778us [5778us] (44.81%; 44.81%)
+FoldScaleAxis: 7117us [4us] (55.19%; 55.19%)
+        FoldConstant: 7113us [1474us] (55.16%; 99.94%)
+                InferType: 5639us [5639us] (43.73%; 79.28%)
 </pre></div>
 </div>
 <p>Register empty list to clear existing instruments.</p>
diff --git a/docs/how_to/optimize_operators/opt_conv_cuda.html b/docs/how_to/optimize_operators/opt_conv_cuda.html
index d000b3808..fd3ea363e 100644
--- a/docs/how_to/optimize_operators/opt_conv_cuda.html
+++ b/docs/how_to/optimize_operators/opt_conv_cuda.html
@@ -534,7 +534,7 @@ latency of convolution.</p>
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 54.120575 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 34.780645 ms
 </pre></div>
 </div>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-optimize-operators-opt-conv-cuda-py">
diff --git a/docs/how_to/optimize_operators/opt_conv_tensorcore.html b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
index 74cc36241..43c2d34e9 100644
--- a/docs/how_to/optimize_operators/opt_conv_tensorcore.html
+++ b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
@@ -878,7 +878,7 @@ be able to run on our build server</p>
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 7.868202 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 8.070889 ms
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/optimize_operators/opt_gemm.html b/docs/how_to/optimize_operators/opt_gemm.html
index 59a8c1fad..af8c37b95 100644
--- a/docs/how_to/optimize_operators/opt_gemm.html
+++ b/docs/how_to/optimize_operators/opt_gemm.html
@@ -431,8 +431,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018608
-Baseline: 3.227036
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.017973
+Baseline: 3.184344
 </pre></div>
 </div>
 <p>In TVM, we can always inspect lower level IR to debug or optimize our schedule.
@@ -494,7 +494,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.301383
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.295863
 </pre></div>
 </div>
 <p>Here is the generated IR after blocking.</p>
@@ -563,7 +563,7 @@ vastly.</p>
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.337461
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.334231
 </pre></div>
 </div>
 <p>Here is the generated IR after vectorization.</p>
@@ -626,7 +626,7 @@ the access pattern for A matrix is more cache friendly.</p>
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.117425
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.116509
 </pre></div>
 </div>
 <p>Here is the generated IR after loop permutation.</p>
@@ -711,7 +711,7 @@ flattening.</p>
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.110596
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.113170
 </pre></div>
 </div>
 <p>Here is the generated IR after array packing.</p>
@@ -799,7 +799,7 @@ write to C when all the block results are ready.</p>
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.110941
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.109905
 </pre></div>
 </div>
 <p>Here is the generated IR after blocking.</p>
@@ -891,7 +891,7 @@ write to C when all the block results are ready.</p>
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.145143
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.144260
 </pre></div>
 </div>
 <p>Here is the generated IR after parallelization.</p>
diff --git a/docs/how_to/optimize_operators/sg_execution_times.html b/docs/how_to/optimize_operators/sg_execution_times.html
index 0b0119a28..7a1dfc96a 100644
--- a/docs/how_to/optimize_operators/sg_execution_times.html
+++ b/docs/how_to/optimize_operators/sg_execution_times.html
@@ -300,11 +300,11 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-optimize-operators-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:34.552</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
+<p><strong>00:34.160</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
 <ul class="simple">
-<li><p><strong>00:31.889</strong>: <a class="reference internal" href="opt_gemm.html#sphx-glr-how-to-optimize-operators-opt-gemm-py"><span class="std std-ref">How to optimize GEMM on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_gemm.py</span></code>)</p></li>
-<li><p><strong>00:01.444</strong>: <a class="reference internal" href="opt_conv_tensorcore.html#sphx-glr-how-to-optimize-operators-opt-conv-tensorcore-py"><span class="std std-ref">How to optimize convolution using TensorCores</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_tensorcore.py</span></code>)</p></li>
-<li><p><strong>00:01.219</strong>: <a class="reference internal" href="opt_conv_cuda.html#sphx-glr-how-to-optimize-operators-opt-conv-cuda-py"><span class="std std-ref">How to optimize convolution on GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_cuda.py</span></code>)</p></li>
+<li><p><strong>00:31.505</strong>: <a class="reference internal" href="opt_gemm.html#sphx-glr-how-to-optimize-operators-opt-gemm-py"><span class="std std-ref">How to optimize GEMM on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_gemm.py</span></code>)</p></li>
+<li><p><strong>00:01.451</strong>: <a class="reference internal" href="opt_conv_tensorcore.html#sphx-glr-how-to-optimize-operators-opt-conv-tensorcore-py"><span class="std std-ref">How to optimize convolution using TensorCores</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_tensorcore.py</span></code>)</p></li>
+<li><p><strong>00:01.204</strong>: <a class="reference internal" href="opt_conv_cuda.html#sphx-glr-how-to-optimize-operators-opt-conv-cuda-py"><span class="std std-ref">How to optimize convolution on GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_cuda.py</span></code>)</p></li>
 </ul>
 </div>
 
diff --git a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
index dfff3d063..a09347581 100644
--- a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
+++ b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
@@ -300,14 +300,14 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-tune-with-autoscheduler-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>05:01.083</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
+<p><strong>05:01.349</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
 <ul class="simple">
-<li><p><strong>02:26.752</strong>: <a class="reference internal" href="tune_conv2d_layer_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py"><span class="std std-ref">Auto-scheduling a Convolution Layer for GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_layer_cuda.py</span></code>)</p></li>
-<li><p><strong>01:19.468</strong>: <a class="reference internal" href="tune_network_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-x86-py"><span class="std std-ref">Auto-scheduling a Neural Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_x86.py</span></code>)</p></li>
-<li><p><strong>00:41.019</strong>: <a class="reference internal" href="tune_network_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-cuda-py"><span class="std std-ref">Auto-scheduling a Neural Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_cuda.py</span></code>)</p></li>
-<li><p><strong>00:16.416</strong>: <a class="reference internal" href="tune_sparse_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-sparse-x86-py"><span class="std std-ref">Auto-scheduling Sparse Matrix Multiplication on CPU with Custom Sketch Rule</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_sparse_x86.py</span></code>)</p></li>
-<li><p><strong>00:08.850</strong>: <a class="reference internal" href="tune_network_mali.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-mali-py"><span class="std std-ref">Auto-scheduling a Neural Network for mali GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_mali.py</span></code>)</p></li>
-<li><p><strong>00:08.577</strong>: <a class="reference internal" href="tune_network_arm.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-arm-py"><span class="std std-ref">Auto-scheduling a Neural Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_arm.py</span></code>)</p></li>
+<li><p><strong>02:27.893</strong>: <a class="reference internal" href="tune_conv2d_layer_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py"><span class="std std-ref">Auto-scheduling a Convolution Layer for GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_layer_cuda.py</span></code>)</p></li>
+<li><p><strong>01:18.366</strong>: <a class="reference internal" href="tune_network_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-x86-py"><span class="std std-ref">Auto-scheduling a Neural Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_x86.py</span></code>)</p></li>
+<li><p><strong>00:40.456</strong>: <a class="reference internal" href="tune_network_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-cuda-py"><span class="std std-ref">Auto-scheduling a Neural Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_cuda.py</span></code>)</p></li>
+<li><p><strong>00:17.473</strong>: <a class="reference internal" href="tune_sparse_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-sparse-x86-py"><span class="std std-ref">Auto-scheduling Sparse Matrix Multiplication on CPU with Custom Sketch Rule</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_sparse_x86.py</span></code>)</p></li>
+<li><p><strong>00:08.884</strong>: <a class="reference internal" href="tune_network_mali.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-mali-py"><span class="std std-ref">Auto-scheduling a Neural Network for mali GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_mali.py</span></code>)</p></li>
+<li><p><strong>00:08.277</strong>: <a class="reference internal" href="tune_network_arm.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-arm-py"><span class="std std-ref">Auto-scheduling a Neural Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_arm.py</span></code>)</p></li>
 </ul>
 </div>
 
diff --git a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
index dadc1a299..3c31f1edf 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
@@ -470,166 +470,1035 @@ cooperative fetching, unrolling and operator fusion.</p>
              compute: Buffer(compute_2: Pointer(float32), float32, [25088], [])}
   buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute}
   preflattened_buffer_map = {data_1: data_3: Buffer(data_2, float32, [1, 512, 7, 7], []), kernel_1: kernel_3: Buffer(kernel_2, float32, [512, 512, 3, 3], []), bias_1: bias_3: Buffer(bias_2, float32, [1, 512, 1, 1], []), compute_1: compute_3: Buffer(compute_2, float32, [1, 512, 7, 7], [])} {
-  attr [IterVar(blockIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;blockIdx.x&quot;)] &quot;thread_extent&quot; = 56;
-  allocate(conv2d_nchw: Pointer(local float32), float32, [7]), storage_scope = local;
-  allocate(pad_temp.shared: Pointer(shared float32), float32, [216]), storage_scope = shared;
-  allocate(kernel.shared: Pointer(shared float32), float32, [4608]), storage_scope = shared;
-  attr [IterVar(threadIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64 {
-    conv2d_nchw_1: Buffer(conv2d_nchw, float32, [1], [], scope=&quot;local&quot;, align=4)[0] = 0f32
-    conv2d_nchw_1[1] = 0f32
+  attr [IterVar(blockIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;blockIdx.x&quot;)] &quot;thread_extent&quot; = 64;
+  allocate(conv2d_nchw: Pointer(local float32), float32, [8]), storage_scope = local;
+  allocate(pad_temp.shared: Pointer(shared float32), float32, [1568]), storage_scope = shared;
+  allocate(kernel.shared: Pointer(shared float32), float32, [256]), storage_scope = shared;
+  attr [IterVar(threadIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49 {
+    conv2d_nchw_1: Buffer(conv2d_nchw, float32, [4], [], scope=&quot;local&quot;, align=8)[0] = 0f32
     conv2d_nchw_1[2] = 0f32
-    conv2d_nchw_1[3] = 0f32
     conv2d_nchw_1[4] = 0f32
-    conv2d_nchw_1[5] = 0f32
     conv2d_nchw_1[6] = 0f32
-    for (rc.outer.outer: int32, 0, 64) {
-      let cse_var_2: int32 = (rc.outer.outer*392)
-      let cse_var_1: int32 = (rc.outer.outer*72)
-       {
-        attr [IterVar(threadIdx.x_1: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        pad_temp.shared_1: Buffer(pad_temp.shared, float32, [216], [], scope=&quot;shared&quot;)[threadIdx.x_1] = @tir.if_then_else(((((3 &lt;= floormod(threadIdx.x_1, 27)) &amp;&amp; (floormod(threadIdx.x_1, 27) &lt; 24)) &amp;&amp; (1 &lt;= (floormod(blockIdx.x, 7) + floormod(threadIdx.x_1, 3)))) &amp;&amp; ((floormod(blockIdx.x, 7) + floormod(threadIdx.x_1, 3)) &lt; 8)), data[(((((cse_var_2 + (floordiv(threadIdx.x_1, 27)*49)) + (floordiv(floormod(threadIdx.x_1, 27), 3)*7)) + floormod( [...]
-        attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        pad_temp.shared_1[(threadIdx.x_1 + 64)] = @tir.if_then_else(((((3 &lt;= floormod((threadIdx.x_1 + 64), 27)) &amp;&amp; (floormod((threadIdx.x_1 + 10), 27) &lt; 24)) &amp;&amp; (1 &lt;= (floormod(blockIdx.x, 7) + floormod((threadIdx.x_1 + 1), 3)))) &amp;&amp; ((floormod(blockIdx.x, 7) + floormod((threadIdx.x_1 + 1), 3)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 64), 27)*49)) + (floordiv(floormod((threadIdx.x_1 + 10), 27), 3)*7)) + floormod(blockIdx.x, 7)) + floorm [...]
-        attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        pad_temp.shared_1[(threadIdx.x_1 + 128)] = @tir.if_then_else(((((3 &lt;= floormod((threadIdx.x_1 + 128), 27)) &amp;&amp; (floormod((threadIdx.x_1 + 20), 27) &lt; 24)) &amp;&amp; (1 &lt;= (floormod(blockIdx.x, 7) + floormod((threadIdx.x_1 + 2), 3)))) &amp;&amp; ((floormod(blockIdx.x, 7) + floormod((threadIdx.x_1 + 2), 3)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 128), 27)*49)) + (floordiv(floormod((threadIdx.x_1 + 20), 27), 3)*7)) + floormod(blockIdx.x, 7)) + flo [...]
-        attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        if @tir.likely((threadIdx.x_1 &lt; 24), dtype=bool) {
-          pad_temp.shared_1[(threadIdx.x_1 + 192)] = @tir.if_then_else((((floormod((threadIdx.x_1 + 3), 27) &lt; 24) &amp;&amp; (1 &lt;= (floormod(blockIdx.x, 7) + floormod(threadIdx.x_1, 3)))) &amp;&amp; ((floormod(blockIdx.x, 7) + floormod(threadIdx.x_1, 3)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 192), 27)*49)) + (floormod((floordiv(threadIdx.x_1, 3) + 1), 9)*7)) + floormod(blockIdx.x, 7)) + floormod(threadIdx.x_1, 3)) - 8)], 0f32, dtype=float32)
-        }
-        attr [IterVar(threadIdx.x_2: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        kernel.shared_1: Buffer(kernel.shared, float32, [4608], [], scope=&quot;shared&quot;)[ramp((threadIdx.x_2*4), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 18)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp((threadIdx.x_2*4), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp(threadIdx.x_2, 1, 4), broadcast(3, 4)))]
-        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        kernel.shared_1[ramp(((threadIdx.x_2*4) + 256), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 256), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 256), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 64), 1, 4), broadcast(3, 4)))]
-        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        kernel.shared_1[ramp(((threadIdx.x_2*4) + 512), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 512), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 512), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 128), 1, 4), broadcast(3, 4)))]
-        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        kernel.shared_1[ramp(((threadIdx.x_2*4) + 768), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 768), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 768), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 192), 1, 4), broadcast(3, 4)))]
-        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        kernel.shared_1[ramp(((threadIdx.x_2*4) + 1024), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 1024), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1024), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 256), 1, 4), broadcast(3, 4)))]
-        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        kernel.shared_1[ramp(((threadIdx.x_2*4) + 1280), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 1280), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1280), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 320), 1, 4), broadcast(3, 4)))]
-        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        kernel.shared_1[ramp(((threadIdx.x_2*4) + 1536), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 1536), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1536), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 384), 1, 4), broadcast(3, 4)))]
-        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        kernel.shared_1[ramp(((threadIdx.x_2*4) + 1792), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 1792), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1792), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 448), 1, 4), broadcast(3, 4)))]
-        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        kernel.shared_1[ramp(((threadIdx.x_2*4) + 2048), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 2048), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 2048), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 512), 1, 4), broadcast(3, 4)))]
-        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        kernel.shared_1[ramp(((threadIdx.x_2*4) + 2304), 1, 4)] = kernel[((broadcast(((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 18)*4608)) + cse_var_1) + 147456), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 2304), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 576), 1, 4), broadcast(3, 4)))]
-        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        kernel.shared_1[ramp(((threadIdx.x_2*4) + 2560), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 2560), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 2560), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 640), 1, 4), broadcast(3, 4)))]
-        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        kernel.shared_1[ramp(((threadIdx.x_2*4) + 2816), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 2816), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 2816), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 704), 1, 4), broadcast(3, 4)))]
-        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        kernel.shared_1[ramp(((threadIdx.x_2*4) + 3072), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 3072), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 3072), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 768), 1, 4), broadcast(3, 4)))]
-        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        kernel.shared_1[ramp(((threadIdx.x_2*4) + 3328), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 3328), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 3328), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 832), 1, 4), broadcast(3, 4)))]
-        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        kernel.shared_1[ramp(((threadIdx.x_2*4) + 3584), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 3584), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 3584), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 896), 1, 4), broadcast(3, 4)))]
-        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        kernel.shared_1[ramp(((threadIdx.x_2*4) + 3840), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 3840), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 3840), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 960), 1, 4), broadcast(3, 4)))]
-        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        kernel.shared_1[ramp(((threadIdx.x_2*4) + 4096), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 4096), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 4096), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 1024), 1, 4), broadcast(3, 4)))]
-        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-        kernel.shared_1[ramp(((threadIdx.x_2*4) + 4352), 1, 4)] = kernel[((broadcast((((floordiv(blockIdx.x, 7)*294912) + (floordiv(((threadIdx.x_2*4) + 4352), 72)*4608)) + cse_var_1), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 4352), 1, 4), broadcast(3, 4)), broadcast(24, 4))*broadcast(3, 4))) + floormod(ramp((threadIdx.x_2 + 1088), 1, 4), broadcast(3, 4)))]
-        for (rc.inner: int32, 0, 8) {
-          let cse_var_24: int32 = (rc.inner*27)
-          let cse_var_23: int32 = (cse_var_24 + 10)
-          let cse_var_22: int32 = (cse_var_24 + 11)
-          let cse_var_21: int32 = (cse_var_24 + 12)
-          let cse_var_20: int32 = (cse_var_24 + 13)
-          let cse_var_19: int32 = (cse_var_24 + 14)
-          let cse_var_18: int32 = (cse_var_24 + 15)
-          let cse_var_17: int32 = (cse_var_24 + 16)
-          let cse_var_16: int32 = (cse_var_24 + 17)
-          let cse_var_15: int32 = (cse_var_24 + 18)
-          let cse_var_14: int32 = (cse_var_24 + 20)
-          let cse_var_13: int32 = (cse_var_24 + 21)
-          let cse_var_12: int32 = (cse_var_24 + 9)
-          let cse_var_11: int32 = (cse_var_24 + 8)
-          let cse_var_10: int32 = (cse_var_24 + 7)
-          let cse_var_9: int32 = (cse_var_24 + 6)
-          let cse_var_8: int32 = (cse_var_24 + 5)
-          let cse_var_7: int32 = (cse_var_24 + 4)
-          let cse_var_6: int32 = (cse_var_24 + 3)
-          let cse_var_5: int32 = (cse_var_24 + 23)
-          let cse_var_4: int32 = (cse_var_24 + 19)
-          let cse_var_3: int32 = (cse_var_24 + 22)
-           {
-            conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_24]*kernel.shared_1[((threadIdx.x*72) + (rc.inner*9))]))
-            conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_6]*kernel.shared_1[((threadIdx.x*72) + (rc.inner*9))]))
-            conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_9]*kernel.shared_1[((threadIdx.x*72) + (rc.inner*9))]))
-            conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_12]*kernel.shared_1[((threadIdx.x*72) + (rc.inner*9))]))
-            conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_21]*kernel.shared_1[((threadIdx.x*72) + (rc.inner*9))]))
-            conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_18]*kernel.shared_1[((threadIdx.x*72) + (rc.inner*9))]))
-            conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_15]*kernel.shared_1[((threadIdx.x*72) + (rc.inner*9))]))
-            conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(cse_var_24 + 1)]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 1)]))
-            conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_7]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 1)]))
-            conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_10]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 1)]))
-            conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_23]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 1)]))
-            conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_20]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 1)]))
-            conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_17]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 1)]))
-            conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_4]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 1)]))
-            conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(cse_var_24 + 2)]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 2)]))
-            conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_8]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 2)]))
-            conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_11]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 2)]))
-            conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_22]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 2)]))
-            conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_19]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 2)]))
-            conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_16]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 2)]))
-            conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_14]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 2)]))
-            conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_6]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 3)]))
-            conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_9]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 3)]))
-            conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_12]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 3)]))
-            conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_21]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 3)]))
-            conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_18]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 3)]))
-            conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_15]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 3)]))
-            conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_13]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 3)]))
-            conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_7]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 4)]))
-            conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_10]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 4)]))
-            conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_23]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 4)]))
-            conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_20]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 4)]))
-            conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_17]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 4)]))
-            conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_4]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 4)]))
-            conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_3]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 4)]))
-            conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_8]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 5)]))
-            conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_11]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 5)]))
-            conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_22]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 5)]))
-            conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_19]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 5)]))
-            conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_16]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 5)]))
-            conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_14]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 5)]))
-            conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_5]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 5)]))
-            conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_9]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 6)]))
-            conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_12]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 6)]))
-            conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_21]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 6)]))
-            conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_18]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 6)]))
-            conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_15]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 6)]))
-            conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_13]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 6)]))
-            conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(cse_var_24 + 24)]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 6)]))
-            conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_10]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 7)]))
-            conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_23]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 7)]))
-            conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_20]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 7)]))
-            conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_17]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 7)]))
-            conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_4]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 7)]))
-            conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_3]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 7)]))
-            conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(cse_var_24 + 25)]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 7)]))
-            conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_11]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 8)]))
-            conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_22]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 8)]))
-            conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_19]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 8)]))
-            conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_16]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 8)]))
-            conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_14]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 8)]))
-            conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_5]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 8)]))
-            conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(cse_var_24 + 26)]*kernel.shared_1[(((threadIdx.x*72) + (rc.inner*9)) + 8)]))
+    conv2d_nchw_1[1] = 0f32
+    conv2d_nchw_1[3] = 0f32
+    conv2d_nchw_1[5] = 0f32
+    conv2d_nchw_1[7] = 0f32
+    for (rc.outer.outer: int32, 0, 16) {
+      for (ry.outer.outer: int32, 0, 3) {
+        let cse_var_2: int32 = (rc.outer.outer*288)
+        let cse_var_1: int32 = (ry.outer.outer*3)
+         {
+          attr [IterVar(threadIdx.x_1: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1: Buffer(pad_temp.shared, float32, [1568], [], scope=&quot;shared&quot;)[threadIdx.x_1] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 49)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 41)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 98)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 90)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 147)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 139)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 196)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 188)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 245)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 237)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 294)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 286)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 343)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 335)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 392)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 384)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 441)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 433)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 490)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 482)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 539)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 531)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 588)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 580)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 637)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 629)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 686)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 678)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 735)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 727)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 784)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 776)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 833)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 825)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 882)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 874)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 931)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 923)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 980)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 972)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1029)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1021)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1078)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1070)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1127)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1119)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1176)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1168)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1225)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1217)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1274)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1266)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1323)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1315)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1372)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1364)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1421)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1413)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1470)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1462)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1519)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 7))), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1511)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_2: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          kernel.shared_1: Buffer(kernel.shared, float32, [256], [], scope=&quot;shared&quot;)[threadIdx.x_2] = kernel[(((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 32)*4608)) + cse_var_2) + (floormod(threadIdx.x_2, 32)*9)) + cse_var_1)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          kernel.shared_1[(threadIdx.x_2 + 49)] = kernel[(((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 49), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 17), 32)*9)) + cse_var_1)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          kernel.shared_1[(threadIdx.x_2 + 98)] = kernel[(((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 98), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 2), 32)*9)) + cse_var_1)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          kernel.shared_1[(threadIdx.x_2 + 147)] = kernel[(((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 147), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 19), 32)*9)) + cse_var_1)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          kernel.shared_1[(threadIdx.x_2 + 196)] = kernel[(((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 196), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 4), 32)*9)) + cse_var_1)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          if @tir.likely((threadIdx.x_2 &lt; 11), dtype=bool) {
+            kernel.shared_1[(threadIdx.x_2 + 245)] = kernel[(((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 245), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 21), 32)*9)) + cse_var_1)]
+          }
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[0]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[64]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[128]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[192]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[32]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[96]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[160]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[224]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[1]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[65]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[129]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[193]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[33]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[97]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[161]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[225]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[2]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[66]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[130]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[194]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[34]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[98]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[162]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[226]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[3]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[67]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[131]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[195]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[35]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[99]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[163]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[227]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[4]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[68]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[132]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[196]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[36]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[100]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[164]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[228]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[5]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[69]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[133]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[197]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[37]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[101]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[165]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[229]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[6]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[70]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[134]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[198]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[38]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[102]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[166]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[230]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[7]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[71]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[135]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[199]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[39]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[103]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[167]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[231]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[8]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[72]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[136]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[200]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[40]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[104]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[168]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[232]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[9]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[73]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[137]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[201]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[41]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[105]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[169]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[233]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[10]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[74]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[138]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[202]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[42]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[106]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[170]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[234]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[11]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[75]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[139]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[203]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[43]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[107]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[171]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[235]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[12]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[76]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[140]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[204]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[44]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[108]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[172]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[236]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[13]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[77]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[141]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[205]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[45]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[109]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[173]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[237]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[14]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[78]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[142]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[206]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[46]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[110]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[174]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[238]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[15]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[79]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[143]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[207]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[47]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[111]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[175]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[239]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[16]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[80]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[144]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[208]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[48]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[112]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[176]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[240]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[17]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[81]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[145]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[209]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[49]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[113]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[177]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[241]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[18]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[82]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[146]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[210]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[50]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[114]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[178]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[242]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[19]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[83]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[147]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[211]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[51]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[115]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[179]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[243]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[20]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[84]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[148]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[212]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[52]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[116]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[180]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[244]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[21]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[85]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[149]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[213]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[53]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[117]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[181]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[245]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[22]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[86]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[150]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[214]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[54]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[118]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[182]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[246]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[23]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[87]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[151]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[215]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[55]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[119]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[183]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[247]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[24]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[88]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[152]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[216]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[56]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[120]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[184]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[248]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[25]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[89]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[153]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[217]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[57]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[121]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[185]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[249]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[26]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[90]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[154]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[218]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[58]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[122]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[186]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[250]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[27]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[91]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[155]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[219]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[59]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[123]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[187]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[251]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[28]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[92]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[156]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[220]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[60]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[124]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[188]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[252]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[29]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[93]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[157]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[221]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[61]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[125]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[189]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[253]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[30]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[94]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[158]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[222]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[62]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[126]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[190]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[254]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[31]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[95]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[159]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[223]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[63]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[127]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[191]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[255]))
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[threadIdx.x_1] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) - 7)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 49)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 42)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 98)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 91)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 147)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 140)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 196)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 189)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 245)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 238)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 294)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 287)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 343)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 336)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 392)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 385)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 441)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 434)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 490)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 483)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 539)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 532)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 588)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 581)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 637)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 630)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 686)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 679)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 735)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 728)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 784)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 777)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 833)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 826)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 882)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 875)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 931)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 924)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 980)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 973)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1029)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1022)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1078)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1071)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1127)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1120)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1176)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1169)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1225)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1218)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1274)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1267)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1323)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1316)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1372)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1365)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1421)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1414)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1470)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1463)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1519)] = @tir.if_then_else(((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1512)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          kernel.shared_1[threadIdx.x_2] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 32)*4608)) + cse_var_2) + (floormod(threadIdx.x_2, 32)*9)) + cse_var_1) + 1)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          kernel.shared_1[(threadIdx.x_2 + 49)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 49), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 17), 32)*9)) + cse_var_1) + 1)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          kernel.shared_1[(threadIdx.x_2 + 98)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 98), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 2), 32)*9)) + cse_var_1) + 1)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          kernel.shared_1[(threadIdx.x_2 + 147)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 147), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 19), 32)*9)) + cse_var_1) + 1)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          kernel.shared_1[(threadIdx.x_2 + 196)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 196), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 4), 32)*9)) + cse_var_1) + 1)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          if @tir.likely((threadIdx.x_2 &lt; 11), dtype=bool) {
+            kernel.shared_1[(threadIdx.x_2 + 245)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 245), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 21), 32)*9)) + cse_var_1) + 1)]
+          }
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[0]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[64]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[128]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[192]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[32]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[96]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[160]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[224]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[1]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[65]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[129]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[193]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[33]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[97]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[161]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[225]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[2]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[66]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[130]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[194]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[34]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[98]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[162]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[226]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[3]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[67]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[131]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[195]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[35]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[99]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[163]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[227]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[4]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[68]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[132]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[196]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[36]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[100]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[164]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[228]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[5]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[69]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[133]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[197]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[37]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[101]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[165]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[229]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[6]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[70]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[134]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[198]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[38]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[102]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[166]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[230]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[7]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[71]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[135]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[199]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[39]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[103]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[167]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[231]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[8]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[72]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[136]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[200]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[40]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[104]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[168]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[232]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[9]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[73]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[137]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[201]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[41]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[105]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[169]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[233]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[10]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[74]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[138]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[202]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[42]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[106]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[170]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[234]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[11]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[75]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[139]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[203]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[43]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[107]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[171]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[235]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[12]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[76]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[140]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[204]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[44]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[108]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[172]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[236]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[13]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[77]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[141]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[205]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[45]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[109]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[173]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[237]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[14]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[78]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[142]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[206]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[46]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[110]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[174]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[238]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[15]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[79]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[143]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[207]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[47]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[111]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[175]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[239]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[16]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[80]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[144]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[208]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[48]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[112]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[176]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[240]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[17]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[81]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[145]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[209]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[49]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[113]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[177]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[241]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[18]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[82]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[146]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[210]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[50]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[114]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[178]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[242]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[19]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[83]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[147]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[211]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[51]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[115]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[179]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[243]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[20]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[84]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[148]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[212]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[52]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[116]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[180]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[244]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[21]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[85]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[149]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[213]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[53]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[117]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[181]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[245]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[22]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[86]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[150]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[214]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[54]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[118]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[182]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[246]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[23]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[87]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[151]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[215]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[55]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[119]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[183]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[247]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[24]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[88]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[152]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[216]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[56]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[120]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[184]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[248]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[25]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[89]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[153]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[217]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[57]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[121]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[185]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[249]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[26]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[90]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[154]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[218]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[58]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[122]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[186]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[250]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[27]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[91]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[155]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[219]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[59]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[123]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[187]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[251]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[28]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[92]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[156]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[220]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[60]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[124]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[188]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[252]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[29]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[93]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[157]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[221]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[61]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[125]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[189]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[253]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[30]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[94]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[158]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[222]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[62]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[126]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[190]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[254]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[31]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[95]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[159]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[223]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[63]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[127]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[191]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[255]))
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[threadIdx.x_1] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) - 6)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 49)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 43)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 98)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 92)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 147)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 141)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 196)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 190)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 245)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 239)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 294)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 288)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 343)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 337)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 392)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 386)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 441)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 435)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 490)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 484)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 539)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 533)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 588)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 582)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 637)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 631)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 686)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 680)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 735)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 729)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 784)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 778)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 833)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 827)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 882)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 876)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 931)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 925)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 980)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 974)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1029)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1023)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1078)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1072)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1127)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1121)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1176)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1170)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1225)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1219)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1274)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1268)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1323)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1317)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1372)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1366)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1421)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1415)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1470)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1464)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          pad_temp.shared_1[(threadIdx.x_1 + 1519)] = @tir.if_then_else((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (floormod(threadIdx.x_1, 7) &lt; 6)), data[((((rc.outer.outer*1568) + (ry.outer.outer*7)) + threadIdx.x_1) + 1513)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          kernel.shared_1[threadIdx.x_2] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 32)*4608)) + cse_var_2) + (floormod(threadIdx.x_2, 32)*9)) + cse_var_1) + 2)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          kernel.shared_1[(threadIdx.x_2 + 49)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 49), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 17), 32)*9)) + cse_var_1) + 2)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          kernel.shared_1[(threadIdx.x_2 + 98)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 98), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 2), 32)*9)) + cse_var_1) + 2)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          kernel.shared_1[(threadIdx.x_2 + 147)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 147), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 19), 32)*9)) + cse_var_1) + 2)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          kernel.shared_1[(threadIdx.x_2 + 196)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 196), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 4), 32)*9)) + cse_var_1) + 2)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
+          if @tir.likely((threadIdx.x_2 &lt; 11), dtype=bool) {
+            kernel.shared_1[(threadIdx.x_2 + 245)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 245), 32)*4608)) + cse_var_2) + (floormod((threadIdx.x_2 + 21), 32)*9)) + cse_var_1) + 2)]
           }
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[0]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[64]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[128]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[192]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[32]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[96]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[160]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[threadIdx.x]*kernel.shared_1[224]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[1]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[65]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[129]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[193]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[33]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[97]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[161]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 49)]*kernel.shared_1[225]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[2]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[66]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[130]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[194]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[34]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[98]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[162]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 98)]*kernel.shared_1[226]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[3]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[67]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[131]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[195]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[35]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[99]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[163]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 147)]*kernel.shared_1[227]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[4]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[68]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[132]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[196]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[36]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[100]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[164]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 196)]*kernel.shared_1[228]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[5]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[69]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[133]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[197]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[37]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[101]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[165]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 245)]*kernel.shared_1[229]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[6]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[70]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[134]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[198]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[38]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[102]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[166]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 294)]*kernel.shared_1[230]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[7]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[71]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[135]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[199]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[39]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[103]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[167]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 343)]*kernel.shared_1[231]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[8]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[72]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[136]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[200]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[40]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[104]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[168]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 392)]*kernel.shared_1[232]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[9]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[73]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[137]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[201]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[41]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[105]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[169]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 441)]*kernel.shared_1[233]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[10]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[74]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[138]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[202]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[42]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[106]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[170]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 490)]*kernel.shared_1[234]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[11]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[75]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[139]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[203]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[43]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[107]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[171]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 539)]*kernel.shared_1[235]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[12]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[76]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[140]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[204]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[44]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[108]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[172]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 588)]*kernel.shared_1[236]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[13]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[77]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[141]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[205]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[45]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[109]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[173]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 637)]*kernel.shared_1[237]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[14]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[78]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[142]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[206]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[46]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[110]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[174]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 686)]*kernel.shared_1[238]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[15]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[79]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[143]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[207]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[47]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[111]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[175]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 735)]*kernel.shared_1[239]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[16]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[80]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[144]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[208]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[48]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[112]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[176]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 784)]*kernel.shared_1[240]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[17]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[81]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[145]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[209]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[49]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[113]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[177]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 833)]*kernel.shared_1[241]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[18]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[82]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[146]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[210]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[50]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[114]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[178]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 882)]*kernel.shared_1[242]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[19]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[83]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[147]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[211]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[51]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[115]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[179]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 931)]*kernel.shared_1[243]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[20]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[84]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[148]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[212]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[52]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[116]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[180]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 980)]*kernel.shared_1[244]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[21]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[85]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[149]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[213]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[53]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[117]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[181]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1029)]*kernel.shared_1[245]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[22]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[86]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[150]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[214]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[54]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[118]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[182]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1078)]*kernel.shared_1[246]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[23]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[87]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[151]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[215]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[55]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[119]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[183]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1127)]*kernel.shared_1[247]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[24]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[88]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[152]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[216]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[56]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[120]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[184]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1176)]*kernel.shared_1[248]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[25]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[89]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[153]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[217]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[57]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[121]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[185]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1225)]*kernel.shared_1[249]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[26]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[90]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[154]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[218]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[58]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[122]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[186]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1274)]*kernel.shared_1[250]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[27]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[91]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[155]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[219]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[59]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[123]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[187]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1323)]*kernel.shared_1[251]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[28]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[92]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[156]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[220]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[60]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[124]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[188]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1372)]*kernel.shared_1[252]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[29]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[93]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[157]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[221]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[61]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[125]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[189]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1421)]*kernel.shared_1[253]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[30]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[94]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[158]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[222]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[62]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[126]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[190]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1470)]*kernel.shared_1[254]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[31]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[95]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[159]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[223]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[63]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[127]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[191]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(threadIdx.x + 1519)]*kernel.shared_1[255]))
         }
       }
     }
-    compute[(((floordiv(blockIdx.x, 7)*3136) + (threadIdx.x*49)) + floormod(blockIdx.x, 7))] = max((conv2d_nchw_1[0] + bias[((floordiv(blockIdx.x, 7)*64) + threadIdx.x)]), 0f32)
-    compute[((((floordiv(blockIdx.x, 7)*3136) + (threadIdx.x*49)) + floormod(blockIdx.x, 7)) + 7)] = max((conv2d_nchw_1[1] + bias[((floordiv(blockIdx.x, 7)*64) + threadIdx.x)]), 0f32)
-    compute[((((floordiv(blockIdx.x, 7)*3136) + (threadIdx.x*49)) + floormod(blockIdx.x, 7)) + 14)] = max((conv2d_nchw_1[2] + bias[((floordiv(blockIdx.x, 7)*64) + threadIdx.x)]), 0f32)
... 8077 lines suppressed ...