You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by tq...@apache.org on 2022/12/01 21:37:29 UTC
[tvm-site] branch asf-site updated: deploying docs (apache/tvm@afbfb7aa7e43732cb716f8e443df696110be6afc)
This is an automated email from the ASF dual-hosted git repository.
tqchen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/tvm-site.git
The following commit(s) were added to refs/heads/asf-site by this push:
new f4861a2313 deploying docs (apache/tvm@afbfb7aa7e43732cb716f8e443df696110be6afc)
f4861a2313 is described below
commit f4861a231399f7f2be521c7cfabc5f6f017d7452
Author: tvm-bot <95...@users.noreply.github.com>
AuthorDate: Thu Dec 1 21:37:22 2022 +0000
deploying docs (apache/tvm@afbfb7aa7e43732cb716f8e443df696110be6afc)
---
docs/_images/sphx_glr_micro_train_001.png | Bin 306809 -> 337505 bytes
docs/_images/sphx_glr_micro_train_thumb.png | Bin 22544 -> 23675 bytes
.../how_to/compile_models/from_darknet.rst.txt | 2 +-
.../how_to/compile_models/from_keras.rst.txt | 2 +-
.../how_to/compile_models/from_mxnet.rst.txt | 2 +-
.../how_to/compile_models/from_oneflow.rst.txt | 2 +-
.../how_to/compile_models/from_pytorch.rst.txt | 2 +-
.../how_to/compile_models/from_tensorflow.rst.txt | 2 +-
.../compile_models/sg_execution_times.rst.txt | 22 +-
.../deploy_models/deploy_model_on_adreno.rst.txt | 7 +-
.../deploy_models/deploy_model_on_android.rst.txt | 2 +-
.../deploy_object_detection_pytorch.rst.txt | 4 +-
.../deploy_models/deploy_prequantized.rst.txt | 6 +-
.../deploy_prequantized_tflite.rst.txt | 4 +-
.../how_to/deploy_models/deploy_quantized.rst.txt | 2 +-
.../deploy_models/deploy_ssd_gluoncv.rst.txt | 4 +-
.../deploy_models/sg_execution_times.rst.txt | 22 +-
.../extend_tvm/bring_your_own_datatypes.rst.txt | 2 +-
.../how_to/extend_tvm/sg_execution_times.rst.txt | 6 +-
.../how_to/extend_tvm/use_pass_instrument.rst.txt | 16 +-
.../optimize_operators/opt_conv_cuda.rst.txt | 2 +-
.../optimize_operators/opt_conv_tensorcore.rst.txt | 2 +-
.../how_to/optimize_operators/opt_gemm.rst.txt | 16 +-
.../optimize_operators/sg_execution_times.rst.txt | 8 +-
.../sg_execution_times.rst.txt | 14 +-
.../tune_conv2d_layer_cuda.rst.txt | 1594 +++++++++-----------
.../tune_network_cuda.rst.txt | 4 +-
.../tune_network_x86.rst.txt | 4 +-
.../tune_sparse_x86.rst.txt | 34 +-
.../tune_with_autotvm/sg_execution_times.rst.txt | 8 +-
.../tune_with_autotvm/tune_conv2d_cuda.rst.txt | 672 +++++++--
.../tune_with_autotvm/tune_relay_cuda.rst.txt | 2 +-
.../work_with_microtvm/micro_autotune.rst.txt | 16 +-
.../work_with_microtvm/micro_pytorch.rst.txt | 4 +-
.../how_to/work_with_microtvm/micro_train.rst.txt | 18 +-
.../work_with_microtvm/sg_execution_times.rst.txt | 12 +-
.../work_with_relay/sg_execution_times.rst.txt | 8 +-
.../how_to/work_with_schedules/intrin_math.rst.txt | 2 +-
.../work_with_schedules/sg_execution_times.rst.txt | 16 +-
.../how_to/work_with_schedules/tensorize.rst.txt | 2 +-
.../tutorials/autotvm/sg_execution_times.rst.txt | 4 +-
.../vta/tutorials/autotvm/tune_relay_vta.rst.txt | 2 +-
.../frontend/deploy_classification.rst.txt | 2 +-
.../tutorials/frontend/deploy_detection.rst.txt | 2 +-
.../tutorials/frontend/sg_execution_times.rst.txt | 6 +-
.../tutorials/optimize/sg_execution_times.rst.txt | 6 +-
.../topic/vta/tutorials/sg_execution_times.rst.txt | 6 +-
.../tutorial/auto_scheduler_matmul_x86.rst.txt | 6 +-
docs/_sources/tutorial/autotvm_matmul_x86.rst.txt | 20 +-
docs/_sources/tutorial/autotvm_relay_x86.rst.txt | 60 +-
.../tutorial/cross_compilation_and_rpc.rst.txt | 2 +-
docs/_sources/tutorial/intro_topi.rst.txt | 4 +-
docs/_sources/tutorial/relay_quick_start.rst.txt | 2 +-
docs/_sources/tutorial/sg_execution_times.rst.txt | 18 +-
.../tutorial/tensor_expr_get_started.rst.txt | 45 +-
docs/commit_hash | 2 +-
docs/genindex.html | 2 +
docs/how_to/compile_models/from_darknet.html | 2 +-
docs/how_to/compile_models/from_keras.html | 2 +-
docs/how_to/compile_models/from_mxnet.html | 2 +-
docs/how_to/compile_models/from_oneflow.html | 12 +-
docs/how_to/compile_models/from_pytorch.html | 13 +-
docs/how_to/compile_models/from_tensorflow.html | 2 +-
docs/how_to/compile_models/sg_execution_times.html | 22 +-
.../deploy_models/deploy_model_on_adreno.html | 3 +-
.../deploy_models/deploy_model_on_android.html | 2 +-
.../deploy_object_detection_pytorch.html | 43 +-
docs/how_to/deploy_models/deploy_prequantized.html | 8 +-
.../deploy_models/deploy_prequantized_tflite.html | 4 +-
docs/how_to/deploy_models/deploy_quantized.html | 2 +-
docs/how_to/deploy_models/deploy_ssd_gluoncv.html | 39 +-
docs/how_to/deploy_models/sg_execution_times.html | 22 +-
.../extend_tvm/bring_your_own_datatypes.html | 2 +-
docs/how_to/extend_tvm/sg_execution_times.html | 6 +-
docs/how_to/extend_tvm/use_pass_instrument.html | 16 +-
docs/how_to/optimize_operators/opt_conv_cuda.html | 2 +-
.../optimize_operators/opt_conv_tensorcore.html | 2 +-
docs/how_to/optimize_operators/opt_gemm.html | 16 +-
.../optimize_operators/sg_execution_times.html | 8 +-
.../sg_execution_times.html | 14 +-
.../tune_conv2d_layer_cuda.html | 1594 +++++++++-----------
.../tune_with_autoscheduler/tune_network_cuda.html | 4 +-
.../tune_with_autoscheduler/tune_network_x86.html | 4 +-
.../tune_with_autoscheduler/tune_sparse_x86.html | 34 +-
.../tune_with_autotvm/sg_execution_times.html | 8 +-
.../how_to/tune_with_autotvm/tune_conv2d_cuda.html | 672 +++++++--
docs/how_to/tune_with_autotvm/tune_relay_cuda.html | 2 +-
docs/how_to/work_with_microtvm/micro_autotune.html | 16 +-
docs/how_to/work_with_microtvm/micro_pytorch.html | 5 +-
docs/how_to/work_with_microtvm/micro_train.html | 16 +-
.../work_with_microtvm/sg_execution_times.html | 12 +-
.../how_to/work_with_relay/sg_execution_times.html | 8 +-
docs/how_to/work_with_schedules/intrin_math.html | 2 +-
.../work_with_schedules/sg_execution_times.html | 16 +-
docs/how_to/work_with_schedules/tensorize.html | 2 +-
docs/install/nnpack.html | 12 +-
docs/objects.inv | Bin 24055 -> 24067 bytes
docs/reference/api/doxygen/namespacemembers_c.html | 7 +-
.../api/doxygen/namespacemembers_func_c.html | 7 +-
.../api/doxygen/namespacemembers_func_v.html | 3 +
docs/reference/api/doxygen/namespacemembers_s.html | 2 +-
docs/reference/api/doxygen/namespacemembers_v.html | 3 +
.../reference/api/doxygen/namespacetvm_1_1tir.html | 29 +
.../doxygen/namespacetvm_1_1tir_1_1transform.html | 31 +
docs/reference/api/doxygen/search/all_14.js | 2 +-
docs/reference/api/doxygen/search/all_17.js | 1 +
docs/reference/api/doxygen/search/all_18.js | 2 +-
docs/reference/api/doxygen/search/all_4.js | 1 +
docs/reference/api/doxygen/search/functions_16.js | 1 +
docs/reference/api/doxygen/search/functions_17.js | 2 +-
docs/reference/api/doxygen/search/functions_3.js | 1 +
docs/reference/api/doxygen/tir_2analysis_8h.html | 6 +
.../api/doxygen/tir_2analysis_8h_source.html | 4 +-
docs/reference/api/python/auto_scheduler.html | 4 +-
docs/reference/api/python/target.html | 1 +
docs/reference/api/python/tir.html | 63 +-
.../api/typedoc/classes/bytestreamreader.html | 12 +-
.../api/typedoc/classes/cachedcallstack.html | 34 +-
docs/reference/api/typedoc/classes/dldatatype.html | 12 +-
docs/reference/api/typedoc/classes/dldevice.html | 10 +-
.../reference/api/typedoc/classes/environment.html | 12 +-
docs/reference/api/typedoc/classes/ffilibrary.html | 20 +-
.../api/typedoc/classes/graphexecutor.html | 16 +-
docs/reference/api/typedoc/classes/instance.html | 40 +-
docs/reference/api/typedoc/classes/memory.html | 34 +-
docs/reference/api/typedoc/classes/module.html | 10 +-
docs/reference/api/typedoc/classes/ndarray.html | 22 +-
.../api/typedoc/classes/packedfunccell.html | 6 +-
docs/reference/api/typedoc/classes/rpcserver.html | 14 +-
docs/reference/api/typedoc/classes/scalar.html | 6 +-
.../api/typedoc/classes/webgpucontext.html | 12 +-
docs/reference/api/typedoc/enums/argtypecode.html | 30 +-
.../api/typedoc/enums/aynccallbackcode.html | 4 +-
.../api/typedoc/enums/dldatatypecode.html | 8 +-
.../api/typedoc/enums/rpcserverstate.html | 12 +-
docs/reference/api/typedoc/enums/sizeof.html | 18 +-
docs/reference/api/typedoc/index.html | 112 +-
.../api/typedoc/interfaces/disposable.html | 2 +-
.../api/typedoc/interfaces/functioninfo.html | 6 +-
.../api/typedoc/interfaces/libraryprovider.html | 4 +-
docs/searchindex.js | 2 +-
.../vta/tutorials/autotvm/sg_execution_times.html | 4 +-
.../vta/tutorials/autotvm/tune_relay_vta.html | 2 +-
.../tutorials/frontend/deploy_classification.html | 2 +-
.../vta/tutorials/frontend/deploy_detection.html | 2 +-
.../vta/tutorials/frontend/sg_execution_times.html | 6 +-
.../vta/tutorials/optimize/sg_execution_times.html | 6 +-
docs/topic/vta/tutorials/sg_execution_times.html | 6 +-
docs/tutorial/auto_scheduler_matmul_x86.html | 5 +-
docs/tutorial/autotvm_matmul_x86.html | 20 +-
docs/tutorial/autotvm_relay_x86.html | 272 ++--
docs/tutorial/cross_compilation_and_rpc.html | 2 +-
docs/tutorial/intro_topi.html | 4 +-
docs/tutorial/relay_quick_start.html | 2 +-
docs/tutorial/sg_execution_times.html | 18 +-
docs/tutorial/tensor_expr_get_started.html | 41 +-
156 files changed, 3460 insertions(+), 2947 deletions(-)
diff --git a/docs/_images/sphx_glr_micro_train_001.png b/docs/_images/sphx_glr_micro_train_001.png
index bbd9b6f736..cdece017f1 100644
Binary files a/docs/_images/sphx_glr_micro_train_001.png and b/docs/_images/sphx_glr_micro_train_001.png differ
diff --git a/docs/_images/sphx_glr_micro_train_thumb.png b/docs/_images/sphx_glr_micro_train_thumb.png
index d75c481972..3ea3b2a601 100644
Binary files a/docs/_images/sphx_glr_micro_train_thumb.png and b/docs/_images/sphx_glr_micro_train_thumb.png differ
diff --git a/docs/_sources/how_to/compile_models/from_darknet.rst.txt b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
index d10c443346..c17e069f2a 100644
--- a/docs/_sources/how_to/compile_models/from_darknet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
@@ -315,7 +315,7 @@ The process is no different from other examples.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 13.849 seconds)
+ **Total running time of the script:** ( 1 minutes 13.889 seconds)
.. _sphx_glr_download_how_to_compile_models_from_darknet.py:
diff --git a/docs/_sources/how_to/compile_models/from_keras.rst.txt b/docs/_sources/how_to/compile_models/from_keras.rst.txt
index 39abcd3099..35ff6fe2ee 100644
--- a/docs/_sources/how_to/compile_models/from_keras.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_keras.rst.txt
@@ -228,7 +228,7 @@ Look up prediction top 1 index in 1000 class synset.
.. code-block:: none
Relay top-1 id: 285, class name: Egyptian cat
-
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 1s 942ms/step
+
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 1s 958ms/step
Keras top-1 id: 285, class name: Egyptian cat
diff --git a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
index 2f399ad68d..b680012603 100644
--- a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
@@ -115,7 +115,7 @@ In this section, we download a pretrained imagenet model and classify an image.
.. code-block:: none
- Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip4f6ee52a-43e5-4081-8d15-a6bd47f9b9e7 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+ Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip410ebc78-fa41-484a-a007-b6e78027fcf0 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
x (1, 3, 224, 224)
diff --git a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
index d156ceedcb..866212a82a 100644
--- a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
@@ -116,7 +116,7 @@ Load a pretrained OneFlow model and save model
.. code-block:: none
Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
-
0%| | 0.00/41.5M [00:00<?, ?B/s]
19%|#9 | 7.99M/41.5M [00:00<00:00, 44.6MB/s]
39%|###8 | 16.0M/41.5M [00:00<00:00, 50.2MB/s]
54%|#####3 | 22.3M/41.5M [00:00<00:00, 52.3MB/s]
66%|######6 | 27.4M/41.5M [00:00<00:00, 47.6MB/s]
82%|########2 | 34.1M/41.5M [00:00<00:00, 41.7MB/s]
100%|##########| 41.5M/41.5M [00:00<00:00, 50.3MB/s]
+
0%| | 0.00/41.5M [00:00<?, ?B/s]
20%|#9 | 8.12M/41.5M [00:00<00:00, 85.0MB/s]
39%|###9 | 16.2M/41.5M [00:00<00:00, 48.9MB/s]
63%|######2 | 26.1M/41.5M [00:00<00:00, 45.9MB/s]
77%|#######7 | 32.0M/41.5M [00:00<00:00, 46.4MB/s]
92%|#########2| 38.3M/41.5M [00:00<00:00, 36.8MB/s]
100%|##########| 41.5M/41.5M [00:01<00:00, 41.1MB/s]
diff --git a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
index 4824c795c7..e2babdf61b 100644
--- a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
@@ -98,7 +98,7 @@ Load a pretrained PyTorch model
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=ResNet18_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet18_Weights.DEFAULT` to get the most up-to-date weights.
warnings.warn(msg)
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
-
0%| | 0.00/44.7M [00:00<?, ?B/s]
16%|#5 | 7.12M/44.7M [00:00<00:00, 74.7MB/s]
32%|###2 | 14.3M/44.7M [00:00<00:00, 59.8MB/s]
45%|####5 | 20.2M/44.7M [00:00<00:00, 56.3MB/s]
57%|#####7 | 25.6M/44.7M [00:00<00:00, 52.4MB/s]
72%|#######1 | 32.0M/44.7M [00:00<00:00, 49.5MB/s]
90%|########9 | 40.0M/44.7M [00:00<00:00, 50.1MB/s]
100%|##########| 44.7M/44.7M [00:00<00:00, 55.8MB/s]
+
0%| | 0.00/44.7M [00:00<?, ?B/s]
18%|#7 | 7.99M/44.7M [00:00<00:00, 72.6MB/s]
36%|###6 | 16.1M/44.7M [00:00<00:00, 79.5MB/s]
54%|#####3 | 24.0M/44.7M [00:00<00:00, 80.7MB/s]
76%|#######5 | 33.8M/44.7M [00:00<00:00, 89.2MB/s]
95%|#########4| 42.3M/44.7M [00:00<00:00, 87.1MB/s]
100%|##########| 44.7M/44.7M [00:00<00:00, 87.8MB/s]
diff --git a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
index 1fbb4179f1..143803122d 100644
--- a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
@@ -416,7 +416,7 @@ Run the corresponding model on tensorflow
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 10.537 seconds)
+ **Total running time of the script:** ( 1 minutes 11.783 seconds)
.. _sphx_glr_download_how_to_compile_models_from_tensorflow.py:
diff --git a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
index d18ba0f992..3a89198615 100644
--- a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
@@ -5,26 +5,26 @@
Computation times
=================
-**05:42.910** total execution time for **how_to_compile_models** files:
+**05:47.464** total execution time for **how_to_compile_models** files:
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``) | 01:13.850 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``) | 01:13.889 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:10.537 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:11.783 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``) | 00:45.743 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``) | 00:47.088 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``) | 00:31.118 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``) | 00:32.668 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``) | 00:28.196 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``) | 00:27.836 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``) | 00:26.570 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``) | 00:26.782 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``) | 00:24.682 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``) | 00:24.676 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``) | 00:22.384 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``) | 00:22.396 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``) | 00:17.432 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``) | 00:17.860 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``) | 00:02.400 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``) | 00:02.486 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt b/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt
index 497ed39e4b..60d001e73b 100644
--- a/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt
@@ -723,18 +723,13 @@ well as provides information about the model's performance
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 3348.0071 3346.9349 3355.5216 3344.4685 3.2114
+ 2759.1538 2758.6797 2762.3322 2756.9490 1.7848
-.. rst-class:: sphx-glr-timing
-
- **Total running time of the script:** ( 1 minutes 1.045 seconds)
-
-
.. _sphx_glr_download_how_to_deploy_models_deploy_model_on_adreno.py:
.. only:: html
diff --git a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
index 6622bf4ab1..4a37f095bf 100644
--- a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
@@ -433,7 +433,7 @@ Execute on TVM
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 16.0523 16.0353 16.3517 15.9120 0.1204
+ 16.3971 16.2932 17.4048 15.9455 0.4637
diff --git a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
index 65d0f3f60b..39290a25c2 100644
--- a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
@@ -127,7 +127,7 @@ Load pre-trained maskrcnn from torchvision and do tracing
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=MaskRCNN_ResNet50_FPN_Weights.COCO_V1`. You can also use `weights=MaskRCNN_ResNet50_FPN_Weights.DEFAULT` to get the most up-to-date weights.
warnings.warn(msg)
Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
-
0%| | 0.00/170M [00:00<?, ?B/s]
5%|4 | 7.99M/170M [00:00<00:03, 54.8MB/s]
9%|9 | 16.0M/170M [00:00<00:02, 56.4MB/s]
13%|#3 | 22.3M/170M [00:00<00:02, 57.4MB/s]
16%|#6 | 27.8M/170M [00:00<00:02, 51.8MB/s]
19%|#9 | 32.8M/170M [00:00<00:03, 43.8MB/s]
24%|##3 | 40.0M/170M [00:00<00:02, 51.8MB/s]
27%|##7 | 46.3M/170M [00:00<00:02, 55.6MB/s]
31%|### | 51.8M/170M [00:01<00:02, 53.6MB/s]
34%|###3 | 57.1M/170M [00:01<00:02, 53.4MB/s]
38%|###8 | 65.2M/170M [00:01<00:01, 62.1MB/s]
42%|####2 | 72.0M/170M [00:01<00:01, 57.7MB/s]
48%|####8 | 82.2M/170M [00:01<00:01, 70.7MB/s]
53%|#####2 | 89.2M/170M [00:01<00:01, 67.1MB/s]
57%|#####6 | 96.0M/170M [00:01<00:01, 51.0MB/s]
61%|######1 | 104M/170M [00:01<00:01, 56.3MB/s]
66%|######5 | 112M/170M [00:02<00:01, 59.4MB/s]
71%|####### | 120M/170M [00:02<00:00, 64.9MB/s]
75%|#######5 | 128M/170M [00:02<00:00, 64.5MB/s]
80%|######## | 136M/170M [00:02<00:00, 62.2MB/s]
85%|########4 | 144M/170M [00:02<00:00, 64.9MB/s]
89%|########9 | 152M/170M [00:02<00:00, 63.2MB/s]
94%|#########4| 160M/170M [00:02<00:00, 64.2MB/s]
99%|#########8| 168M/170M [00:02<00:00, 66.7MB/s]
100%|##########| 170M/170M [00:02<00:00, 60.3MB/s]
+
0%| | 0.00/170M [00:00<?, ?B/s]
5%|4 | 7.99M/170M [00:00<00:02, 77.7MB/s]
9%|9 | 16.1M/170M [00:00<00:01, 82.1MB/s]
14%|#4 | 24.4M/170M [00:00<00:01, 84.0MB/s]
19%|#9 | 32.4M/170M [00:00<00:02, 66.5MB/s]
26%|##5 | 43.7M/170M [00:00<00:01, 82.5MB/s]
35%|###4 | 58.6M/170M [00:00<00:01, 103MB/s]
41%|#### | 69.0M/170M [00:00<00:01, 94.8MB/s]
49%|####8 | 82.5M/170M [00:00<00:00, 108MB/s]
55%|#####4 | 93.2M/170M [00:01<00:00, 105MB/s]
61%|###### | 104M/170M [00:01<00:00, 105MB/s]
67%|######6 | 114M/170M [00:01<00:00, 105MB/s]
75%|#######5 | 128M/170M [00:01<00:00, 99.5MB/s]
81%|########1 | 138M/170M [00:01<00:00, 99.3MB/s]
87%|########6 | 147M/170M [00:01<00:00, 85.8MB/s]
92%|#########1| 156M/170M [00:01<00:00, 82.9MB/s]
97%|#########6| 164M/170M [00:01<00:00, 66.8MB/s]
100%|##########| 170M/170M [00:02<00:00, 88.1MB/s]
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/nn/functional.py:3897: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
for i in range(dim)
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/detection/anchor_utils.py:124: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
@@ -296,7 +296,7 @@ Get boxes with score larger than 0.9
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 3 minutes 16.530 seconds)
+ **Total running time of the script:** ( 3 minutes 18.669 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_object_detection_pytorch.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
index 936f68799c..39a6432880 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
@@ -236,7 +236,7 @@ training. Other models require a full post training calibration.
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=MobileNet_V2_Weights.IMAGENET1K_V1`. You can also use `weights=MobileNet_V2_Weights.DEFAULT` to get the most up-to-date weights.
warnings.warn(msg)
Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
-
0%| | 0.00/13.6M [00:00<?, ?B/s]
59%|#####8 | 7.99M/13.6M [00:00<00:00, 57.7MB/s]
100%|##########| 13.6M/13.6M [00:00<00:00, 58.6MB/s]
+
0%| | 0.00/13.6M [00:00<?, ?B/s]
74%|#######4 | 10.1M/13.6M [00:00<00:00, 106MB/s]
100%|##########| 13.6M/13.6M [00:00<00:00, 89.6MB/s]
@@ -418,7 +418,7 @@ Here we give an example of how to measure performance of TVM compiled models.
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 90.2720 90.1835 92.6822 90.0486 0.3489
+ 90.5629 90.4978 93.3052 90.1424 0.3764
@@ -467,7 +467,7 @@ TODO
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 5.994 seconds)
+ **Total running time of the script:** ( 1 minutes 6.934 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_prequantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
index 386af6bf2d..1907d6e88d 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
@@ -432,7 +432,7 @@ Here we give an example of how to measure performance of TVM compiled models.
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 120.0999 119.9757 126.8035 119.1603 0.9080
+ 120.3317 120.1745 124.9000 119.0818 0.8623
@@ -469,7 +469,7 @@ Here we give an example of how to measure performance of TVM compiled models.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 2 minutes 22.706 seconds)
+ **Total running time of the script:** ( 2 minutes 22.579 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_prequantized_tflite.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
index 2cd030644f..dedbc4b49b 100644
--- a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
@@ -253,7 +253,7 @@ We create a Relay VM to build and execute the model.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 35.278 seconds)
+ **Total running time of the script:** ( 1 minutes 34.966 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_quantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
index fd6a066fd2..168f30791d 100644
--- a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
@@ -166,7 +166,7 @@ Convert and compile model for CPU.
data: None
input_sym_arg_type = in_param.infer_type()[0]
Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
-
0%| | 0/132723 [00:00<?, ?KB/s]
5%|4 | 6201/132723 [00:00<00:02, 62004.89KB/s]
10%|# | 13858/132723 [00:00<00:01, 70545.93KB/s]
16%|#5 | 20913/132723 [00:00<00:02, 49584.40KB/s]
21%|##1 | 28461/132723 [00:00<00:01, 57721.10KB/s]
27%|##7 | 35957/132723 [00:00<00:01, 63061.21KB/s]
33%|###2 | 43478/132723 [00:00<00:01, 66786.23KB/s]
38%|###8 | 51085/132723 [00:00<00:01, 69612.52KB/s]
44%|####4 | 58678/132723 [00:00<00:01, 71526.17KB/s]
50%|####9 | 66229/132723 [00:00<00:00, 72727.22KB/s]
56%|#####5 | 73767/132723 [00:01<00:00, 73523.38KB/s]
61%|######1 | 81449/132723 [00:01<00:00, 74510.35KB/s]
67%|######7 | 89153/132723 [00:01<00:00, 75264.21KB/s]
73%|#######2 | 96785/132723 [00:01<00:00, 75570.84KB/s]
79%|#######8 | 104499/132723 [00:01<00:00, 76033.41KB/s]
85%|########4 | 112194/132723 [00:01<00:00, 76306.90KB/s]
90%|#########
| 119841/132723 [00:01<00:00, 75937.68KB/s]
96%|#########6| 127449/132723 [00:01<00:00, 75977.86KB/s]
100%|##########| 132723/132723 [00:01<00:00, 71249.89KB/s]
+
0%| | 0/132723 [00:00<?, ?KB/s]
4%|4 | 5781/132723 [00:00<00:02, 57801.10KB/s]
10%|# | 13645/132723 [00:00<00:01, 70049.01KB/s]
16%|#5 | 20650/132723 [00:00<00:01, 61793.77KB/s]
21%|##1 | 28503/132723 [00:00<00:01, 67903.95KB/s]
27%|##7 | 36334/132723 [00:00<00:01, 71470.53KB/s]
33%|###2 | 43576/132723 [00:01<00:04, 20581.95KB/s]
37%|###7 | 49140/132723 [00:01<00:03, 23410.10KB/s]
42%|####2 | 55772/132723 [00:01<00:02, 29341.70KB/s]
48%|####7 | 63166/132723 [00:01<00:01, 36695.89KB/s]
52%|#####2 | 69102/132723 [00:01<00:01, 40470.44KB/s]
58%|#####7 | 76351/132723 [00:01<00:01, 42878.79KB/s]
62%|######1 | 81905/132723 [00:02<00:01, 45231.81KB/s]
68%|######7 | 89612/132723 [00:02<00:00, 52676.35KB/s]
74%|#######3 | 97674/132723 [00:02<00:00, 59672.59KB/s]
80%|#######9 | 105559/132723 [00:02<00:00, 64723.86KB/s]
86%|########5
| 113506/132723 [00:02<00:00, 68753.44KB/s]
92%|#########1| 121466/132723 [00:02<00:00, 71799.03KB/s]
98%|#########7| 129408/132723 [00:02<00:00, 73975.47KB/s]
100%|##########| 132723/132723 [00:02<00:00, 48532.86KB/s]
@@ -242,7 +242,7 @@ Display result
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 3 minutes 0.344 seconds)
+ **Total running time of the script:** ( 3 minutes 3.566 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_ssd_gluoncv.py:
diff --git a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
index 1b2ab9bf49..92082a0b7a 100644
--- a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
@@ -5,26 +5,26 @@
Computation times
=================
-**13:47.084** total execution time for **how_to_deploy_models** files:
+**13:47.636** total execution time for **how_to_deploy_models** files:
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 03:16.530 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 03:18.669 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``) | 03:00.344 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``) | 03:03.566 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``) | 02:22.706 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``) | 02:22.579 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``) | 01:35.278 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``) | 01:34.966 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``) | 01:05.994 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``) | 01:06.934 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_adreno.py` (``deploy_model_on_adreno.py``) | 01:01.045 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_adreno.py` (``deploy_model_on_adreno.py``) | 00:54.349 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``) | 00:35.542 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``) | 00:36.467 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_nano.py` (``deploy_model_on_nano.py``) | 00:25.075 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_nano.py` (``deploy_model_on_nano.py``) | 00:25.309 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``) | 00:24.564 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``) | 00:24.791 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_sparse.py` (``deploy_sparse.py``) | 00:00.006 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_sparse.py` (``deploy_sparse.py``) | 00:00.007 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
index f892a3b040..3e67362dc4 100644
--- a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
@@ -472,7 +472,7 @@ First let us define two helper functions to get the mobilenet model and a cat im
.. code-block:: none
- Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip89771a59-08c8-4305-8923-fd45f2a3d2ef from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+ Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip470f7e27-c603-4af7-aee9-57c5b98a0281 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
diff --git a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
index 87f3e38d39..c95a3e7333 100644
--- a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
Computation times
=================
-**00:47.078** total execution time for **how_to_extend_tvm** files:
+**00:48.070** total execution time for **how_to_extend_tvm** files:
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:43.618 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:44.559 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``) | 00:02.416 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``) | 00:02.467 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``) | 00:01.037 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
index 86ffcc32dd..44aed3d933 100644
--- a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
@@ -216,10 +216,10 @@ profile the execution time of each passes.
.. code-block:: none
Printing results of timing profile...
- InferType: 7197us [7197us] (46.55%; 46.55%)
- FoldScaleAxis: 8264us [7us] (53.45%; 53.45%)
- FoldConstant: 8258us [1692us] (53.41%; 99.92%)
- InferType: 6566us [6566us] (42.47%; 79.51%)
+ InferType: 7278us [7278us] (46.51%; 46.51%)
+ FoldScaleAxis: 8371us [7us] (53.49%; 53.49%)
+ FoldConstant: 8364us [1711us] (53.44%; 99.91%)
+ InferType: 6653us [6653us] (42.51%; 79.54%)
@@ -258,10 +258,10 @@ Refer to following sections and :py:func:`tvm.instrument.pass_instrument` for th
.. code-block:: none
Printing results of timing profile...
- InferType: 6591us [6591us] (44.98%; 44.98%)
- FoldScaleAxis: 8062us [5us] (55.02%; 55.02%)
- FoldConstant: 8057us [1661us] (54.99%; 99.94%)
- InferType: 6396us [6396us] (43.65%; 79.38%)
+ InferType: 6722us [6722us] (44.87%; 44.87%)
+ FoldScaleAxis: 8259us [5us] (55.13%; 55.13%)
+ FoldConstant: 8253us [1693us] (55.09%; 99.94%)
+ InferType: 6561us [6561us] (43.80%; 79.49%)
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
index dfbbfed748..2a29fe90df 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
@@ -340,7 +340,7 @@ latency of convolution.
.. code-block:: none
- Convolution: 54.383968 ms
+ Convolution: 43.233119 ms
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
index d485e73b51..bdf0c403e1 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
@@ -657,7 +657,7 @@ be able to run on our build server
.. code-block:: none
- conv2d with tensor core: 12.437085 ms
+ conv2d with tensor core: 13.369456 ms
diff --git a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
index 5ea22d7736..961fac3cac 100644
--- a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
@@ -143,8 +143,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
.. code-block:: none
- Numpy running time: 0.018901
- Baseline: 3.340735
+ Numpy running time: 0.018572
+ Baseline: 3.238095
@@ -238,7 +238,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
.. code-block:: none
- Opt1: 0.302373
+ Opt1: 0.298859
@@ -340,7 +340,7 @@ In this tutorial, we chose to vectorize the inner loop row data since it is cach
.. code-block:: none
- Opt2: 0.343603
+ Opt2: 0.344994
@@ -435,7 +435,7 @@ the access pattern for A matrix is more cache friendly.
.. code-block:: none
- Opt3: 0.117663
+ Opt3: 0.116641
@@ -559,7 +559,7 @@ flattening.
.. code-block:: none
- Opt4: 0.109533
+ Opt4: 0.109901
@@ -680,7 +680,7 @@ write to C when all the block results are ready.
.. code-block:: none
- Opt5: 0.110918
+ Opt5: 0.110611
@@ -804,7 +804,7 @@ Furthermore, we can also utilize multi-core processors to do the thread-level pa
.. code-block:: none
- Opt6: 0.146906
+ Opt6: 0.147471
diff --git a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
index 607773609a..424141fd75 100644
--- a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
Computation times
=================
-**00:34.828** total execution time for **how_to_optimize_operators** files:
+**00:34.659** total execution time for **how_to_optimize_operators** files:
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``) | 00:32.278 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``) | 00:31.940 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.482 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.560 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``) | 00:01.068 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``) | 00:01.159 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
index 12217c17ca..48af387ba2 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
@@ -5,18 +5,18 @@
Computation times
=================
-**08:51.166** total execution time for **how_to_tune_with_autoscheduler** files:
+**08:56.904** total execution time for **how_to_tune_with_autoscheduler** files:
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 05:29.345 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 05:32.158 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``) | 01:31.730 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``) | 01:32.047 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``) | 01:00.346 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``) | 01:01.196 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``) | 00:26.583 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``) | 00:28.031 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``) | 00:11.939 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``) | 00:12.104 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``) | 00:11.223 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``) | 00:11.369 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
index d162f8ae28..8eb4b73810 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
@@ -239,11 +239,11 @@ cooperative fetching, unrolling and operator fusion.
bias: Buffer(bias_2: Pointer(float32), float32, [1, 512, 1, 1], []),
compute: Buffer(compute_2: Pointer(float32), float32, [1, 512, 7, 7], [])}
buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute} {
- attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 28;
+ attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 16;
allocate(conv2d_nchw: Pointer(local float32), float32, [14]), storage_scope = local;
- allocate(pad_temp.shared: Pointer(shared float32), float32, [72]), storage_scope = shared;
- allocate(kernel.shared: Pointer(shared float32), float32, [3072]), storage_scope = shared;
- attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
+ allocate(pad_temp.shared: Pointer(shared float32), float32, [1296]), storage_scope = shared;
+ allocate(kernel.shared: Pointer(shared float32), float32, [4608]), storage_scope = shared;
+ attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112 {
conv2d_nchw_1: Buffer(conv2d_nchw, float32, [14], [], scope="local", align=32)[0] = 0f32
conv2d_nchw_1[1] = 0f32
conv2d_nchw_1[2] = 0f32
@@ -258,463 +258,381 @@ cooperative fetching, unrolling and operator fusion.
conv2d_nchw_1[11] = 0f32
conv2d_nchw_1[12] = 0f32
conv2d_nchw_1[13] = 0f32
- for (rc.outer.outer: int32, 0, 64) {
- for (ry.outer.outer: int32, 0, 3) {
- let cse_var_2: int32 = (rc.outer.outer*72)
- let cse_var_1: int32 = (ry.outer.outer*3)
- {
- attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1: Buffer(pad_temp.shared, float32, [72], [], scope="shared")[(threadIdx.x_1*4)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1*4), 9))) && (floormod((threadIdx.x_1*4), 9) < 8)), data_3: Buffer(data_2, float32, [25088], [])[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1*4), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + fl [...]
- }
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1[((threadIdx.x_1*4) + 1)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 1), 9))) && (floormod(((threadIdx.x_1*4) + 1), 9) < 8)), data_3[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 1), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 1), 9)) - 8)], 0f32, dtype=float32)
- }
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1[((threadIdx.x_1*4) + 2)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 2), 9))) && (floormod(((threadIdx.x_1*4) + 2), 9) < 8)), data_3[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 2), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 2), 9)) - 8)], 0f32, dtype=float32)
- }
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1[((threadIdx.x_1*4) + 3)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 3), 9))) && (floormod(((threadIdx.x_1*4) + 3), 9) < 8)), data_3[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 3), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 3), 9)) - 8)], 0f32, dtype=float32)
- }
- }
- attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1: Buffer(kernel.shared, float32, [3072], [], scope="shared")[threadIdx.x_2] = kernel_3: Buffer(kernel_2, float32, [2359296], [])[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 64)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 64), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 128)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 128), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 192)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 36864)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 256)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 256), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 320)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 320), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 384)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 73728)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 448)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 448), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 512)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 512), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 576)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 110592)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 640)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 640), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 704)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 704), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 768)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 147456)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 832)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 832), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 896)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 896), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 960)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 184320)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1024)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1024), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1088)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1088), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1152)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 221184)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1216)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1216), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1280)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1280), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 258048)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1408)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1408), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1472)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1472), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1536)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 294912)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1600)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1600), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1664)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1664), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1728)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 331776)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1792)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1792), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1856)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1856), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1920)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 368640)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1984)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1984), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2048)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2048), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2112)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 405504)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2176)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2176), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2240)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2240), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2304)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 442368)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2368)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2368), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2432)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2432), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2496)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 479232)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2560)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2560), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2624)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2624), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2688)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 516096)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2752)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2752), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2816)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2816), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2880)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 552960)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2944)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2944), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 3008)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 3008), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[0]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[1]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[2]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[3]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[4]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[5]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[6]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[0]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+ for (rc.outer.outer: int32, 0, 32) {
+ let cse_var_2: int32 = (rc.outer.outer*784)
+ let cse_var_1: int32 = (rc.outer.outer*144)
+ {
+ attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1: Buffer(pad_temp.shared, float32, [1296], [], scope="shared")[threadIdx.x_1] = @tir.if_then_else(((((9 <= floormod(threadIdx.x_1, 81)) && (floormod(threadIdx.x_1, 81) < 72)) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data_3: Buffer(data_2, float32, [25088], [])[((((cse_var_2 + (floordiv(threadIdx.x_1, 81)*49)) + (floordiv(floormod(threadIdx.x_1, 81), 9)*7)) + floormod(threadIdx.x_1, 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 112)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 31), 81)) && (floormod((threadIdx.x_1 + 31), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 4), 9))) && (floormod((threadIdx.x_1 + 4), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 112), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 31), 81), 9)*7)) + floormod((threadIdx.x_1 + 4), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 224)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 62), 81)) && (floormod((threadIdx.x_1 + 62), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 8), 9))) && (floormod((threadIdx.x_1 + 8), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 224), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 62), 81), 9)*7)) + floormod((threadIdx.x_1 + 8), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 336)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 12), 81)) && (floormod((threadIdx.x_1 + 12), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 3), 9))) && (floormod((threadIdx.x_1 + 3), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 336), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 12), 81), 9)*7)) + floormod((threadIdx.x_1 + 3), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 448)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 43), 81)) && (floormod((threadIdx.x_1 + 43), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 7), 9))) && (floormod((threadIdx.x_1 + 7), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 448), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 43), 81), 9)*7)) + floormod((threadIdx.x_1 + 7), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 560)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 74), 81)) && (floormod((threadIdx.x_1 + 74), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 2), 9))) && (floormod((threadIdx.x_1 + 2), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 560), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 74), 81), 9)*7)) + floormod((threadIdx.x_1 + 2), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 672)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 24), 81)) && (floormod((threadIdx.x_1 + 24), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 6), 9))) && (floormod((threadIdx.x_1 + 6), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 672), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 24), 81), 9)*7)) + floormod((threadIdx.x_1 + 6), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 784)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 55), 81)) && (floormod((threadIdx.x_1 + 55), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 1), 9))) && (floormod((threadIdx.x_1 + 1), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 784), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 55), 81), 9)*7)) + floormod((threadIdx.x_1 + 1), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 896)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 5), 81)) && (floormod((threadIdx.x_1 + 5), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 5), 9))) && (floormod((threadIdx.x_1 + 5), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 896), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 5), 81), 9)*7)) + floormod((threadIdx.x_1 + 5), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 1008)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 9) + 4), 9)) && (floormod((threadIdx.x_1 + 36), 81) < 72)) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1008), 81)*49)) + (floormod((floordiv(threadIdx.x_1, 9) + 4), 9)*7)) + floormod(threadIdx.x_1, 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 1120)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 67), 81)) && (floormod((threadIdx.x_1 + 67), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 4), 9))) && (floormod((threadIdx.x_1 + 4), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1120), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 67), 81), 9)*7)) + floormod((threadIdx.x_1 + 4), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ if @tir.likely((threadIdx.x_1 < 64), dtype=bool) {
+ pad_temp.shared_1[(threadIdx.x_1 + 1232)] = @tir.if_then_else((((threadIdx.x_1 < 55) && (1 <= floormod((threadIdx.x_1 + 8), 9))) && (floormod((threadIdx.x_1 + 8), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1232), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 17), 81), 9)*7)) + floormod((threadIdx.x_1 + 8), 9)) - 8)], 0f32, dtype=float32)
+ }
+ attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1: Buffer(kernel.shared, float32, [4608], [], scope="shared")[threadIdx.x_2] = kernel_3: Buffer(kernel_2, float32, [2359296], [])[(((blockIdx.x*147456) + cse_var_1) + threadIdx.x_2)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 112)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 112), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 112), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 224)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 224), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 336)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 336), 144)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 16), 48)*3)) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 448)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 448), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 560)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 560), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 128), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 672)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 672), 144)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 32), 48)*3)) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 784)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 784), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 896)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 896), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 1008)] = kernel_3[((((blockIdx.x*147456) + cse_var_1) + threadIdx.x_2) + 32256)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 1120)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1120), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 112), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 1232)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1232), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1344), 144)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 16), 48)*3)) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 1456)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1456), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 1568)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1568), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 128), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 1680)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1680), 144)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 32), 48)*3)) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 1792)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1792), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 1904)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1904), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 2016)] = kernel_3[((((blockIdx.x*147456) + cse_var_1) + threadIdx.x_2) + 64512)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 2128)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2128), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 112), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 2240)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2240), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 2352)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2352), 144)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 16), 48)*3)) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 2464)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2464), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 2576)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2576), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 128), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 2688)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2688), 144)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 32), 48)*3)) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 2800)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2800), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 2912)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2912), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 3024)] = kernel_3[((((blockIdx.x*147456) + cse_var_1) + threadIdx.x_2) + 96768)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 3136)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3136), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 112), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 3248)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3248), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 3360)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3360), 144)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 16), 48)*3)) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 3472)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3472), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 3584)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3584), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 128), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 3696)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3696), 144)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 32), 48)*3)) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 3808)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3808), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 3920)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3920), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 4032)] = kernel_3[((((blockIdx.x*147456) + cse_var_1) + threadIdx.x_2) + 129024)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 4144)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 4144), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 112), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 4256)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 4256), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 4368)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 4368), 144)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 16), 48)*3)) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 4480)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 4480), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
+ kernel.shared_1[(threadIdx.x_2 + 4592)] = kernel_3[(((((blockIdx.x*147456) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 128), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3)) + 142848)]
+ }
+ for (rc.outer.inner: int32, 0, 8) {
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((rc.outer.inner*162) + floormod(threadIdx.x, 7))]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 9)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 18)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 27)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 36)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 45)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 54)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[((rc.outer.inner*162) + floormod(threadIdx.x, 7))]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 9)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 18)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 27)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 36)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 45)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 54)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 1)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 10)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 37)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 46)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 55)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 1)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 10)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 37)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 46)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 55)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 29)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 38)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 47)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 56)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 29)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 38)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 47)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 56)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 81)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 90)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 99)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 108)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 117)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 135)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 81)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 90)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 99)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 108)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 117)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 135)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 82)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 109)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 118)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 127)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 136)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 82)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 109)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 118)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 127)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 136)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 83)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 110)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 119)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 128)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 137)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 83)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 110)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 119)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 128)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 137)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 9)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 18)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 27)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 36)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 45)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 54)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 9)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 18)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 27)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 36)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 45)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 54)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 10)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 37)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 46)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 55)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 64)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 10)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 37)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 46)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 55)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 64)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 29)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 38)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 47)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 56)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 65)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 29)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 38)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 47)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 56)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 65)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 90)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 99)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 108)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 117)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 135)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 144)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 90)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 99)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 108)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 117)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 135)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 144)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 109)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 118)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 127)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 136)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 145)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 109)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 118)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 127)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 136)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 145)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 110)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 119)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 128)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 137)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 146)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 110)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 119)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 128)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 137)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 146)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 18)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 27)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 36)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 45)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 54)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 72)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 18)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 27)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 36)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 45)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 54)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 72)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 37)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 46)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 55)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 64)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 73)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 37)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 46)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 55)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 64)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 73)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 29)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 38)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 47)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 56)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 65)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 74)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 29)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 38)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 47)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 56)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 65)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 74)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 99)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 108)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 117)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 135)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 144)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 153)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 99)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 108)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 117)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 135)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 144)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 153)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 109)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 118)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 127)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 136)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 145)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 154)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 109)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 118)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 127)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 136)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 145)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 154)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 110)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 119)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 128)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 137)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 146)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 155)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 110)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 119)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 128)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 137)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 146)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 155)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
}
}
}
for (i1.inner: int32, 0, 2) {
- for (i3.inner: int32, 0, 7) {
- compute_3: Buffer(compute_2, float32, [25088], [])[(((((floordiv(blockIdx.x, 7)*6272) + (threadIdx.x*98)) + (i1.inner*49)) + (floormod(blockIdx.x, 7)*7)) + i3.inner)] = max((conv2d_nchw_1[((i1.inner*7) + i3.inner)] + bias_3: Buffer(bias_2, float32, [512], [])[(((floordiv(blockIdx.x, 7)*128) + (threadIdx.x*2)) + i1.inner)]), 0f32)
+ for (i2.inner: int32, 0, 7) {
+ compute_3: Buffer(compute_2, float32, [25088], [])[(((((blockIdx.x*1568) + (floordiv(threadIdx.x, 7)*98)) + (i1.inner*49)) + (i2.inner*7)) + floormod(threadIdx.x, 7))] = max((conv2d_nchw_1[((i1.inner*7) + i2.inner)] + bias_3: Buffer(bias_2, float32, [512], [])[(((blockIdx.x*32) + (floordiv(threadIdx.x, 7)*2)) + i1.inner)]), 0f32)
}
}
}
@@ -770,7 +688,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 0.359 ms
+ Execution time of this operator: 0.219 ms
@@ -818,36 +736,36 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
conv2d_nchw_nn_o_o_i, conv2d_nchw_nn_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_i, factor=1)
conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1)
conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
- conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
- conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2)
- conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=64)
+ conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=2)
+ conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
+ conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=16)
conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
- conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
+ conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=7)
conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
- conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
- conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
+ conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
+ conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
- conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=4)
+ conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=8)
conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
- conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
- conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
- conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=3)
+ conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=3)
+ conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=3)
+ conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=1)
s[conv2d_nchw].reorder(conv2d_nchw_nn_o_o_o_o, conv2d_nchw_ff_o_o_o_o, conv2d_nchw_yy_o_o_o_o, conv2d_nchw_xx_o_o_o_o, conv2d_nchw_nn_o_o_o_i, conv2d_nchw_ff_o_o_o_i, conv2d_nchw_yy_o_o_o_i, conv2d_nchw_xx_o_o_o_i, conv2d_nchw_nn_o_o_i, conv2d_nchw_ff_o_o_i, conv2d_nchw_yy_o_o_i, conv2d_nchw_xx_o_o_i, conv2d_nchw_rc_o_o, conv2d_nchw_ry_o_o, conv2d_nchw_rx_o_o, conv2d_nchw_rc_o_i, conv2d_nchw_ry_o_i, conv2d_nchw_rx_o_i, conv2d_nchw_nn_o_i, conv2d_nchw_ff_o_i, conv2d_nchw_yy_o_i, conv2 [...]
compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
- compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=64)
+ compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=16)
compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
- compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
+ compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=7)
compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
- compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
- compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
+ compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
+ compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -867,12 +785,12 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
- kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+ kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=112)
s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
- pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
+ pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
- pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+ pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=112)
s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 512)
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
@@ -892,10 +810,10 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
#define int64_t long long
#define uint64_t unsigned long long
#endif
- extern "C" __global__ void __launch_bounds__(64) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+ extern "C" __global__ void __launch_bounds__(112) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
float conv2d_nchw[14];
- __shared__ float pad_temp_shared[72];
- __shared__ float kernel_shared[3072];
+ __shared__ float pad_temp_shared[1296];
+ __shared__ float kernel_shared[4608];
conv2d_nchw[0] = 0.000000e+00f;
conv2d_nchw[1] = 0.000000e+00f;
conv2d_nchw[2] = 0.000000e+00f;
@@ -910,411 +828,325 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
conv2d_nchw[11] = 0.000000e+00f;
conv2d_nchw[12] = 0.000000e+00f;
conv2d_nchw[13] = 0.000000e+00f;
- for (int rc_outer_outer = 0; rc_outer_outer < 64; ++rc_outer_outer) {
- for (int ry_outer_outer = 0; ry_outer_outer < 3; ++ry_outer_outer) {
- __syncthreads();
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[(((int)threadIdx.x) * 4)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) * 4) % 9))) && (((((int)threadIdx.x) * 4) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) * 4) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) * 4) % 9)) - 8)] : 0.000000e+00f);
- }
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 1)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 1) % 9))) && ((((((int)threadIdx.x) * 4) + 1) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 1) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 1) % 9)) - 8)] : 0.000000e+00f);
- }
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 2)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 2) % 9))) && ((((((int)threadIdx.x) * 4) + 2) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 2) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 2) % 9)) - 8)] : 0.000000e+00f);
- }
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 3)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 3) % 9))) && ((((((int)threadIdx.x) * 4) + 3) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 3) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 3) % 9)) - 8)] : 0.000000e+00f);
- }
- kernel_shared[((int)threadIdx.x)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
- kernel_shared[(((int)threadIdx.x) + 64)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 64) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 128)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 128) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 192)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 36864)];
- kernel_shared[(((int)threadIdx.x) + 256)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 256) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 320)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 320) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 384)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 73728)];
- kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 448) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 512)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 512) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 576)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 110592)];
- kernel_shared[(((int)threadIdx.x) + 640)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 640) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 704)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 704) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 768)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 147456)];
- kernel_shared[(((int)threadIdx.x) + 832)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 832) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 896) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 960)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 184320)];
- kernel_shared[(((int)threadIdx.x) + 1024)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1024) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1088)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1088) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1152)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 221184)];
- kernel_shared[(((int)threadIdx.x) + 1216)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1216) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1280)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1280) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 258048)];
- kernel_shared[(((int)threadIdx.x) + 1408)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1408) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1472)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1472) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1536)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 294912)];
- kernel_shared[(((int)threadIdx.x) + 1600)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1600) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1664)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1664) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1728)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 331776)];
- kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1792) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1856)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1856) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1920)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 368640)];
- kernel_shared[(((int)threadIdx.x) + 1984)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1984) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2048)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2048) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2112)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 405504)];
- kernel_shared[(((int)threadIdx.x) + 2176)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2176) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2240) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2304)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 442368)];
- kernel_shared[(((int)threadIdx.x) + 2368)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2368) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2432)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2432) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2496)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 479232)];
- kernel_shared[(((int)threadIdx.x) + 2560)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2560) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2624)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2624) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 516096)];
- kernel_shared[(((int)threadIdx.x) + 2752)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2752) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2816)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2816) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2880)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 552960)];
- kernel_shared[(((int)threadIdx.x) + 2944)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2944) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 3008)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3008) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- __syncthreads();
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[0] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[1] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[2] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[3] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[4] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[5] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[6] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[0] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ for (int rc_outer_outer = 0; rc_outer_outer < 32; ++rc_outer_outer) {
+ __syncthreads();
+ pad_temp_shared[((int)threadIdx.x)] = (((((9 <= (((int)threadIdx.x) % 81)) && ((((int)threadIdx.x) % 81) < 72)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((int)threadIdx.x) / 81) * 49)) + (((((int)threadIdx.x) % 81) / 9) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 112)] = (((((9 <= ((((int)threadIdx.x) + 31) % 81)) && (((((int)threadIdx.x) + 31) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 112) / 81) * 49)) + ((((((int)threadIdx.x) + 31) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 224)] = (((((9 <= ((((int)threadIdx.x) + 62) % 81)) && (((((int)threadIdx.x) + 62) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 224) / 81) * 49)) + ((((((int)threadIdx.x) + 62) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 336)] = (((((9 <= ((((int)threadIdx.x) + 12) % 81)) && (((((int)threadIdx.x) + 12) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 3) % 9))) && (((((int)threadIdx.x) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 336) / 81) * 49)) + ((((((int)threadIdx.x) + 12) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 3) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 448)] = (((((9 <= ((((int)threadIdx.x) + 43) % 81)) && (((((int)threadIdx.x) + 43) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 448) / 81) * 49)) + ((((((int)threadIdx.x) + 43) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 560)] = (((((9 <= ((((int)threadIdx.x) + 74) % 81)) && (((((int)threadIdx.x) + 74) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 2) % 9))) && (((((int)threadIdx.x) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 560) / 81) * 49)) + ((((((int)threadIdx.x) + 74) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 672)] = (((((9 <= ((((int)threadIdx.x) + 24) % 81)) && (((((int)threadIdx.x) + 24) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 672) / 81) * 49)) + ((((((int)threadIdx.x) + 24) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 784)] = (((((9 <= ((((int)threadIdx.x) + 55) % 81)) && (((((int)threadIdx.x) + 55) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 1) % 9))) && (((((int)threadIdx.x) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 784) / 81) * 49)) + ((((((int)threadIdx.x) + 55) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 896)] = (((((9 <= ((((int)threadIdx.x) + 5) % 81)) && (((((int)threadIdx.x) + 5) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 5) % 9))) && (((((int)threadIdx.x) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 896) / 81) * 49)) + ((((((int)threadIdx.x) + 5) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1008)] = (((((1 <= (((((int)threadIdx.x) / 9) + 4) % 9)) && (((((int)threadIdx.x) + 36) % 81) < 72)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1008) / 81) * 49)) + ((((((int)threadIdx.x) / 9) + 4) % 9) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1120)] = (((((9 <= ((((int)threadIdx.x) + 67) % 81)) && (((((int)threadIdx.x) + 67) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1120) / 81) * 49)) + ((((((int)threadIdx.x) + 67) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
+ if (((int)threadIdx.x) < 64) {
+ pad_temp_shared[(((int)threadIdx.x) + 1232)] = ((((((int)threadIdx.x) < 55) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1232) / 81) * 49)) + (((((int)threadIdx.x) + 17) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
+ }
+ kernel_shared[((int)threadIdx.x)] = kernel[(((((int)blockIdx.x) * 147456) + (rc_outer_outer * 144)) + ((int)threadIdx.x))];
+ kernel_shared[(((int)threadIdx.x) + 112)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 112) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 112) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 224)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 224) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 80) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 336)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 336) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 16) % 48) * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 448) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 560)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 560) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 128) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 672)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 672) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 32) % 48) * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 784)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 784) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 64) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 896) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1008)] = kernel[((((((int)blockIdx.x) * 147456) + (rc_outer_outer * 144)) + ((int)threadIdx.x)) + 32256)];
+ kernel_shared[(((int)threadIdx.x) + 1120)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1120) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 112) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1232)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1232) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 80) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1344) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 16) % 48) * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1456)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1456) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1568)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1568) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 128) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1680)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1680) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 32) % 48) * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1792) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 64) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1904)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1904) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2016)] = kernel[((((((int)blockIdx.x) * 147456) + (rc_outer_outer * 144)) + ((int)threadIdx.x)) + 64512)];
+ kernel_shared[(((int)threadIdx.x) + 2128)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2128) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 112) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2240) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 80) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2352)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2352) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 16) % 48) * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2464)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2464) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2576)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2576) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 128) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2688) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 32) % 48) * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2800)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2800) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 64) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2912)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2912) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 3024)] = kernel[((((((int)blockIdx.x) * 147456) + (rc_outer_outer * 144)) + ((int)threadIdx.x)) + 96768)];
+ kernel_shared[(((int)threadIdx.x) + 3136)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3136) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 112) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 3248)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3248) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 80) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 3360)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3360) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 16) % 48) * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 3472)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3472) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 3584)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3584) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 128) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 3696)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3696) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 32) % 48) * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 3808)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3808) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 64) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 3920)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3920) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 4032)] = kernel[((((((int)blockIdx.x) * 147456) + (rc_outer_outer * 144)) + ((int)threadIdx.x)) + 129024)];
+ kernel_shared[(((int)threadIdx.x) + 4144)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 4144) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 112) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 4256)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 4256) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 80) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 4368)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 4368) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 16) % 48) * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 4480)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 4480) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ if (((int)threadIdx.x) < 16) {
+ kernel_shared[(((int)threadIdx.x) + 4592)] = kernel[(((((((int)blockIdx.x) * 147456) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 128) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3)) + 142848)];
+ }
+ __syncthreads();
+ for (int rc_outer_inner = 0; rc_outer_inner < 8; ++rc_outer_inner) {
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 162) + (((int)threadIdx.x) % 7))] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 9)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 18)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 27)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 36)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 45)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 54)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((rc_outer_inner * 162) + (((int)threadIdx.x) % 7))] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 9)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 27)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 36)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 45)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 54)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 37)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 46)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 55)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 37)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 46)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 55)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 29)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 38)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 47)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 56)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 29)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 38)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 47)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 56)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 81)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 90)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 99)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 108)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 117)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 135)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 81)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 90)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 99)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 108)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 117)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 135)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 82)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 109)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 118)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 127)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 136)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 82)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 109)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 118)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 127)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 136)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 110)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 119)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 128)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 137)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 110)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 119)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 128)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 137)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 9)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 27)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 36)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 45)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 54)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 9)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 27)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 36)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 45)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 54)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 37)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 46)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 55)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 37)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 46)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 55)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 29)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 38)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 47)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 56)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 29)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 38)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 47)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 56)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 90)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 99)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 108)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 117)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 135)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 144)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 90)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 99)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 108)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 117)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 135)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 144)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 109)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 118)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 127)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 136)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 145)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 109)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 118)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 127)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 136)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 145)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 110)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 119)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 128)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 137)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 146)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 110)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 119)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 128)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 137)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 146)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 27)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 36)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 45)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 54)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 72)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 27)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 36)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 45)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 54)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 72)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 37)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 46)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 55)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 73)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 37)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 46)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 55)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 73)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 29)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 38)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 47)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 56)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 74)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 29)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 38)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 47)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 56)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 74)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 99)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 108)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 117)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 135)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 144)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 153)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 99)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 108)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 117)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 135)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 144)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 153)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 109)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 118)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 127)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 136)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 145)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 154)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 109)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 118)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 127)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 136)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 145)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 154)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 110)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 119)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 128)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 137)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 146)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 155)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 110)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 119)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 128)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 137)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 146)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 155)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
}
}
for (int i1_inner = 0; i1_inner < 2; ++i1_inner) {
- for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
- compute[((((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 98)) + (i1_inner * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[((i1_inner * 7) + i3_inner)] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) * 2)) + i1_inner)]), 0.000000e+00f);
+ for (int i2_inner = 0; i2_inner < 7; ++i2_inner) {
+ compute[(((((((int)blockIdx.x) * 1568) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + (i2_inner * 7)) + (((int)threadIdx.x) % 7))] = max((conv2d_nchw[((i1_inner * 7) + i2_inner)] + bias[(((((int)blockIdx.x) * 32) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
}
}
}
@@ -1377,7 +1209,7 @@ In the example below we resume the status and do more 5 trials.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 5 minutes 29.345 seconds)
+ **Total running time of the script:** ( 5 minutes 32.158 seconds)
.. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
index b38214d915..3a9b441327 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
@@ -643,7 +643,7 @@ so we can read the log file and load the best schedules.
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 7.8755 7.8683 7.8899 7.8681 0.0102
+ 7.8762 7.8741 7.8957 7.8589 0.0151
@@ -671,7 +671,7 @@ Other Tips
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 0.346 seconds)
+ **Total running time of the script:** ( 1 minutes 1.196 seconds)
.. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_cuda.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
index 66a4f71a33..6ffb372162 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
@@ -662,7 +662,7 @@ so we can read the log file and load the best schedules.
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 762.2645 761.4779 763.9916 761.3242 1.2228
+ 755.9799 756.7106 757.0075 754.2215 1.2493
@@ -690,7 +690,7 @@ Other Tips
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 31.730 seconds)
+ **Total running time of the script:** ( 1 minutes 32.047 seconds)
.. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_x86.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
index c18de8d5bd..da0c8b4253 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
@@ -386,27 +386,27 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [128, 512], []),
compute: Buffer(compute_2: Pointer(float32), float32, [128, 512], [])}
buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute} {
- for (i0.outer: int32, 0, 32) "parallel" {
- allocate(compute_3: Pointer(global float32), float32, [64]), storage_scope = global;
- for (i1.outer: int32, 0, 32) {
- for (i.inner.init: int32, 0, 4) {
- for (j.init: int32, 0, 16) {
- compute_4: Buffer(compute_3, float32, [64], [])[((i.inner.init*16) + j.init)] = 0f32
+ for (i0.outer.i1.outer.fused: int32, 0, 128) "parallel" {
+ allocate(compute_3: Pointer(global float32), float32, [512]), storage_scope = global {
+ for (nb_j.inner: int32, 0, 2) {
+ for (i.inner.init: int32, 0, 16) {
+ for (j.init: int32, 0, 16) {
+ compute_4: Buffer(compute_3, float32, [512], [])[(((i.inner.init*32) + (nb_j.inner*16)) + j.init)] = 0f32
+ }
}
- }
- for (elem_idx: int32, 0, (placeholder_15: Buffer(placeholder_13, int32, [33], [])[(i1.outer + 1)] - placeholder_15[i1.outer])) {
- for (i.inner: int32, 0, 4) {
- for (j: int32, 0, 16) {
- if @tir.likely((elem_idx < (placeholder_15[(i1.outer + 1)] - placeholder_15[i1.outer])), dtype=bool) {
- let cse_var_1: int32 = ((i.inner*16) + j)
- compute_4[cse_var_1] = (compute_4[cse_var_1] + (placeholder_16: Buffer(placeholder_11, float32, [78656], [])[(((placeholder_15[i1.outer]*16) + (elem_idx*16)) + j)]*max(placeholder_17: Buffer(placeholder_10, float32, [32768], [])[(((i0.outer*1024) + (i.inner*256)) + placeholder_18: Buffer(placeholder_12, int32, [4916], [])[(placeholder_15[i1.outer] + elem_idx)])], 0f32)))
+ for (elem_idx: int32, 0, let cse_var_1: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner) in (placeholder_15: Buffer(placeholder_13, int32, [33], [])[(cse_var_1 + 1)] - placeholder_15[cse_var_1])) {
+ for (i.inner: int32, 0, 16) {
+ for (j: int32, 0, 16) {
+ let cse_var_3: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)
+ let cse_var_2: int32 = (((i.inner*32) + (nb_j.inner*16)) + j)
+ compute_4[cse_var_2] = (compute_4[cse_var_2] + (placeholder_16: Buffer(placeholder_11, float32, [78656], [])[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + j)]*max(placeholder_17: Buffer(placeholder_10, float32, [32768], [])[(((floordiv(i0.outer.i1.outer.fused, 16)*4096) + (i.inner*256)) + placeholder_18: Buffer(placeholder_12, int32, [4916], [])[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
}
}
}
}
- for (i0.inner: int32, 0, 4) {
- let cse_var_2: int32 = (((i0.outer*2048) + (i0.inner*512)) + (i1.outer*16))
- compute_5: Buffer(compute_2, float32, [65536], [])[ramp(cse_var_2, 1, 16)] = max((compute_4[ramp((i0.inner*16), 1, 16)] + placeholder_19: Buffer(placeholder_14, float32, [65536], [])[ramp(cse_var_2, 1, 16)]), broadcast(0f32, 16))
+ for (i0.inner: int32, 0, 16) {
+ let cse_var_4: int32 = (((floordiv(i0.outer.i1.outer.fused, 16)*8192) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32))
+ compute_5: Buffer(compute_2, float32, [65536], [])[ramp(cse_var_4, 1, 32)] = max((compute_4[ramp((i0.inner*32), 1, 32)] + placeholder_19: Buffer(placeholder_14, float32, [65536], [])[ramp(cse_var_4, 1, 32)]), broadcast(0f32, 32))
}
}
}
@@ -462,7 +462,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 1.285 ms
+ Execution time of this operator: 1.500 ms
diff --git a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
index 212aed5295..eb22421b58 100644
--- a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
@@ -5,16 +5,16 @@
Computation times
=================
-**00:35.661** total execution time for **how_to_tune_with_autotvm** files:
+**00:36.157** total execution time for **how_to_tune_with_autotvm** files:
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``) | 00:35.626 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``) | 00:36.122 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``) | 00:00.020 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``) | 00:00.005 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_arm.py` (``tune_relay_arm.py``) | 00:00.005 | 0.0 MB |
-+--------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_mobile_gpu.py` (``tune_relay_mobile_gpu.py``) | 00:00.005 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_arm.py` (``tune_relay_arm.py``) | 00:00.005 | 0.0 MB |
++--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
index 797b3ab6ab..0617aaaf42 100644
--- a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
@@ -266,9 +266,9 @@ for this template
device available
Get devices for measurement successfully!
No: 1 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -301,11 +301,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -329,7 +329,7 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -357,11 +357,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -385,15 +385,13 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 2, 8]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 256]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,381188
- No: 2 GFLOPS: 21.21/21.21 result: MeasureResult(costs=(0.0109166595,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.188716411590576, timestamp=1669924312.333331) [('tile_f', [-1, 4, 2, 8]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1009945
- No: 3 GFLOPS: 1.81/21.21 result: MeasureResult(costs=(0.12811106849999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.0552613735198975, timestamp=1669924315.252629) [('tile_f', [-1, 1, 1, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 64, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,8771180
- No: 4 GFLOPS: 0.00/21.21 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 2, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5725173
+ No: 2 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -426,11 +424,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -454,7 +452,7 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -482,11 +480,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -510,13 +508,13 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 32, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 256]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3862850
- No: 5 GFLOPS: 0.00/21.21 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 128, 1, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 32]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1502662
+ No: 3 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -549,11 +547,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -577,7 +575,7 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -605,11 +603,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -633,14 +631,13 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 256, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 32, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7764339
- No: 6 GFLOPS: 54.84/54.84 result: MeasureResult(costs=(0.0042212298,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4288978576660156, timestamp=1669924317.857191) [('tile_f', [-1, 8, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5739593
- No: 7 GFLOPS: 0.00/54.84 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 1, 256]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3398117
+ No: 4 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -673,11 +670,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -701,7 +698,7 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -729,11 +726,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -757,13 +754,13 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 4, 32]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9255595
- No: 8 GFLOPS: 0.00/54.84 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 64, 1, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 64]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5775362
+ No: 5 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -796,11 +793,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -824,7 +821,7 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -852,11 +849,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -880,17 +877,13 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 2, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 32, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5283447
- No: 9 GFLOPS: 7.45/54.84 result: MeasureResult(costs=(0.03108157525,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.272521734237671, timestamp=1669924323.2844048) [('tile_f', [-1, 4, 4, 2]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,272214
- No: 10 GFLOPS: 248.13/248.13 result: MeasureResult(costs=(0.0009329768670520231,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.2568655014038086, timestamp=1669924324.2515388) [('tile_f', [-1, 1, 4, 2]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2338892
- No: 11 GFLOPS: 24.74/248.13 result: MeasureResult(costs=(0.009356567363636364,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.166534662246704, timestamp=1669924324.9288423) [('tile_f', [-1, 4, 2, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,14190
- No: 12 GFLOPS: 25.82/248.13 result: MeasureResult(costs=(0.008965818333333334,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.8921427726745605, timestamp=1669924325.6391897) [('tile_f', [-1, 2, 1, 2]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2009316
- No: 13 GFLOPS: 0.00/248.13 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 1, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 2, 128]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,950458
+ No: 6 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -923,11 +916,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -951,7 +944,7 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -979,11 +972,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1007,13 +1000,13 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 8, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 16, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4175501
- No: 14 GFLOPS: 0.00/248.13 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 16, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 512, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,1774279
+ No: 7 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -1046,11 +1039,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1074,7 +1067,7 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -1102,11 +1095,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1130,13 +1123,13 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 16, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 16]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8452306
- No: 15 GFLOPS: 0.00/248.13 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 4, 64]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 64]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4421107
+ No: 8 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -1169,11 +1162,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1197,7 +1190,7 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -1225,11 +1218,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1253,13 +1246,15 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 128, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 64]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3073315
- No: 16 GFLOPS: 0.00/248.13 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 64, 2, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6054196
+ No: 9 GFLOPS: 62.21/62.21 result: MeasureResult(costs=(0.003721080674418605,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.198905944824219, timestamp=1669925187.6235414) [('tile_f', [-1, 1, 2, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9397530
+ No: 10 GFLOPS: 24.14/62.21 result: MeasureResult(costs=(0.009589893636363636,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.2840280532836914, timestamp=1669925188.342286) [('tile_f', [-1, 2, 8, 16]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,5071620
+ No: 11 GFLOPS: 0.00/62.21 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -1292,11 +1287,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1320,7 +1315,7 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -1348,11 +1343,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1376,13 +1371,13 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 2, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 64, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4476532
- No: 17 GFLOPS: 0.00/248.13 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 2, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 16]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3813106
+ No: 12 GFLOPS: 0.00/62.21 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -1415,11 +1410,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1443,7 +1438,7 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -1471,11 +1466,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1499,13 +1494,13 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 1, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9056140
- No: 18 GFLOPS: 0.00/248.13 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 1, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 512, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10292979
+ No: 13 GFLOPS: 0.00/62.21 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -1538,11 +1533,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1566,7 +1561,7 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -1594,11 +1589,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1622,13 +1617,13 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 2, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 256, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9937729
- No: 19 GFLOPS: 0.00/248.13 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 256, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,8934870
+ No: 14 GFLOPS: 0.00/62.21 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -1661,11 +1656,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1689,7 +1684,7 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -1717,11 +1712,11 @@ for this template
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1745,10 +1740,381 @@ for this template
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 256, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 128, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9512853
- No: 20 GFLOPS: 1.64/248.13 result: MeasureResult(costs=(0.141428955,), error_no=MeasureErrorNo.NO_ERROR, all_cost=5.23434591293335, timestamp=1669924331.2717454) [('tile_f', [-1, 8, 4, 16]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2363418
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 2, 8]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 128]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,375028
+ No: 15 GFLOPS: 291.58/291.58 result: MeasureResult(costs=(0.0007939590640394088,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.30177903175354, timestamp=1669925189.9380887) [('tile_f', [-1, 1, 2, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6250210
+ No: 16 GFLOPS: 0.00/291.58 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+ func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+ func = build(s, args, target_host=task.target_host, runtime=runtime)
+ File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+ input_mod = lower(inputs, args, name=name, binds=binds)
+ File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+ return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+ File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+ tvm._ffi.base.TVMError: Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1731
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1671
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1646
+ 13: operator()
+ at ../src/driver/driver_api.cc:388
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:374
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:269
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:453
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1750
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1694
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1618
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+ Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1731
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1671
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1646
+ 13: operator()
+ at ../src/driver/driver_api.cc:388
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:374
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:269
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:453
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1750
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1694
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1618
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 1, 8]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 256, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6648761
+ No: 17 GFLOPS: 76.88/291.58 result: MeasureResult(costs=(0.0030112359999999996,), error_no=MeasureErrorNo.NO_ERROR, all_cost=8.845435857772827, timestamp=1669925198.9765935) [('tile_f', [-1, 1, 1, 2]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9312435
+ No: 18 GFLOPS: 0.00/291.58 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+ func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+ func = build(s, args, target_host=task.target_host, runtime=runtime)
+ File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+ input_mod = lower(inputs, args, name=name, binds=binds)
+ File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+ return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+ File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+ tvm._ffi.base.TVMError: Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1731
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1671
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1646
+ 13: operator()
+ at ../src/driver/driver_api.cc:388
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:374
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:269
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:453
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1750
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1694
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1618
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+ Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1731
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1671
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1646
+ 13: operator()
+ at ../src/driver/driver_api.cc:388
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:374
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:269
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:453
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1750
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1694
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1618
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 64, 2, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 16, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1210896
+ No: 19 GFLOPS: 1.35/291.58 result: MeasureResult(costs=(0.17110185225000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.638166666030884, timestamp=1669925201.5347054) [('tile_f', [-1, 1, 2, 256]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3718218
+ No: 20 GFLOPS: 0.00/291.58 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+ func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+ func = build(s, args, target_host=task.target_host, runtime=runtime)
+ File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+ input_mod = lower(inputs, args, name=name, binds=binds)
+ File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+ return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+ File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+ tvm._ffi.base.TVMError: Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1731
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1671
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1646
+ 13: operator()
+ at ../src/driver/driver_api.cc:388
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:374
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:269
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:453
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1750
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1694
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1618
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+ Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1731
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1671
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1646
+ 13: operator()
+ at ../src/driver/driver_api.cc:388
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:374
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:269
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:453
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1750
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1694
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1618
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 128, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5860631
@@ -1803,9 +2169,9 @@ and measure running time.
Finish loading 20 records
Best config:
- [('tile_f', [-1, 1, 4, 2]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2338892
+ [('tile_f', [-1, 1, 2, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6250210
Finish loading 20 records
- Time cost of this operator: 0.001360
+ Time cost of this operator: 0.001184
diff --git a/docs/_sources/how_to/tune_with_autotvm/tune_relay_cuda.rst.txt b/docs/_sources/how_to/tune_with_autotvm/tune_relay_cuda.rst.txt
index 48272354eb..b676946b63 100644
--- a/docs/_sources/how_to/tune_with_autotvm/tune_relay_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/tune_relay_cuda.rst.txt
@@ -185,7 +185,7 @@ Before tuning, we apply some configurations.
.. code-block:: none
- /workspace/python/tvm/target/target.py:393: UserWarning: Try specifying cuda arch by adding 'arch=sm_xx' to your target.
+ /workspace/python/tvm/target/target.py:397: UserWarning: Try specifying cuda arch by adding 'arch=sm_xx' to your target.
warnings.warn("Try specifying cuda arch by adding 'arch=sm_xx' to your target.")
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
index 11c7870d04..49bcd7473f 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
@@ -329,10 +329,10 @@ Timing the untuned program
########## Build without Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
- tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 311.3 98.719 (1, 2, 10, 10, 3) 2 1 [311.3]
- tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.078 0.976 (1, 6, 10, 10) 1 1 [3.078]
- tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.963 0.305 (1, 1, 10, 10, 3) 1 1 [0.963]
- Total_time - 315.341 - - - - -
+ tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 310.6 98.708 (1, 2, 10, 10, 3) 2 1 [310.6]
+ tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.077 0.978 (1, 6, 10, 10) 1 1 [3.077]
+ tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.988 0.314 (1, 1, 10, 10, 3) 1 1 [0.988]
+ Total_time - 314.665 - - - - -
@@ -397,10 +397,10 @@ Timing the tuned program
########## Build with Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
- tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 100.2 97.338 (1, 6, 10, 10, 1) 2 1 [100.2]
- tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.768 1.718 (1, 6, 10, 10) 1 1 [1.768]
- tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.972 0.944 (1, 1, 10, 10, 3) 1 1 [0.972]
- Total_time - 102.94 - - - - -
+ tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 102.9 97.476 (1, 6, 10, 10, 1) 2 1 [102.9]
+ tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.806 1.711 (1, 6, 10, 10) 1 1 [1.806]
+ tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.859 0.813 (1, 3, 10, 10, 1) 1 1 [0.859]
+ Total_time - 105.564 - - - - -
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt
index 4594c65b81..96dcf62f75 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt
@@ -109,7 +109,7 @@ download a cat image and preprocess it to use as the model input.
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/ao/quantization/utils.py:281: UserWarning: must run observer before calling calculate_qparams. Returning default values.
"must run observer before calling calculate_qparams. " +
Downloading: "https://download.pytorch.org/models/quantized/mobilenet_v2_qnnpack_37f702c5.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2_qnnpack_37f702c5.pth
-
0%| | 0.00/3.42M [00:00<?, ?B/s]
84%|########4 | 2.88M/3.42M [00:00<00:00, 30.2MB/s]
100%|##########| 3.42M/3.42M [00:00<00:00, 34.4MB/s]
+
0%| | 0.00/3.42M [00:00<?, ?B/s]
100%|##########| 3.42M/3.42M [00:00<00:00, 107MB/s]
/workspace/python/tvm/relay/frontend/pytorch_utils.py:47: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
return LooseVersion(torch_ver) > ver
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/setuptools/_distutils/version.py:346: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
@@ -314,7 +314,7 @@ Look up prediction top 1 index in 1000 class synset.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 3.182 seconds)
+ **Total running time of the script:** ( 1 minutes 3.611 seconds)
.. _sphx_glr_download_how_to_work_with_microtvm_micro_pytorch.py:
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
index a2aa467919..1106035288 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
@@ -225,7 +225,7 @@ take about **2 minutes** to download the Stanford Cars, while COCO 2017 validati
.. code-block:: none
- '/tmp/tmpryi74w6s/images/random'
+ '/tmp/tmpa_1psqlr/images/random'
@@ -316,7 +316,7 @@ objects to other stuff? We can display some examples from our datasets using ``m
.. image-sg:: /how_to/work_with_microtvm/images/sphx_glr_micro_train_001.png
- :alt: [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0]
+ :alt: [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]
:srcset: /how_to/work_with_microtvm/images/sphx_glr_micro_train_001.png
:class: sphx-glr-single-img
@@ -325,8 +325,8 @@ objects to other stuff? We can display some examples from our datasets using ``m
.. code-block:: none
- /tmp/tmpryi74w6s/images/target contains 8144 images
- /tmp/tmpryi74w6s/images/random contains 5000 images
+ /tmp/tmpa_1psqlr/images/target contains 8144 images
+ /tmp/tmpa_1psqlr/images/random contains 5000 images
@@ -501,13 +501,13 @@ the time on our validation set).
.. code-block:: none
Epoch 1/3
- 328/328 - 47s - loss: 0.2985 - accuracy: 0.9078 - val_loss: 0.2031 - val_accuracy: 0.9358 - 47s/epoch - 143ms/step
+ 328/328 - 47s - loss: 0.2388 - accuracy: 0.9216 - val_loss: 0.1168 - val_accuracy: 0.9607 - 47s/epoch - 143ms/step
Epoch 2/3
- 328/328 - 43s - loss: 0.1057 - accuracy: 0.9601 - val_loss: 0.1237 - val_accuracy: 0.9603 - 43s/epoch - 132ms/step
+ 328/328 - 44s - loss: 0.1131 - accuracy: 0.9590 - val_loss: 0.1981 - val_accuracy: 0.9275 - 44s/epoch - 133ms/step
Epoch 3/3
- 328/328 - 43s - loss: 0.0696 - accuracy: 0.9750 - val_loss: 0.0888 - val_accuracy: 0.9717 - 43s/epoch - 131ms/step
+ 328/328 - 43s - loss: 0.0740 - accuracy: 0.9729 - val_loss: 0.0972 - val_accuracy: 0.9668 - 43s/epoch - 132ms/step
- <keras.callbacks.History object at 0x7fbb09d14410>
+ <keras.callbacks.History object at 0x7fe1a804d650>
@@ -864,7 +864,7 @@ Arduino tutorial for how to do that `on GitHub <https://github.com/guberti/tvm-a
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 4 minutes 41.016 seconds)
+ **Total running time of the script:** ( 4 minutes 40.874 seconds)
.. _sphx_glr_download_how_to_work_with_microtvm_micro_train.py:
diff --git a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
index f4f926f4bf..f33d427cf9 100644
--- a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
@@ -5,18 +5,18 @@
Computation times
=================
-**06:45.843** total execution time for **how_to_work_with_microtvm** files:
+**06:47.466** total execution time for **how_to_work_with_microtvm** files:
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``) | 04:41.016 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``) | 04:40.874 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_pytorch.py` (``micro_pytorch.py``) | 01:03.182 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_pytorch.py` (``micro_pytorch.py``) | 01:03.611 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``) | 00:49.591 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``) | 00:50.653 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py` (``micro_aot.py``) | 00:08.293 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py` (``micro_aot.py``) | 00:08.536 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``) | 00:03.758 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``) | 00:03.791 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_microtvm_micro_reference_vm.py` (``micro_reference_vm.py``) | 00:00.001 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
index 12618a9ae8..ff21e800c4 100644
--- a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
Computation times
=================
-**00:44.000** total execution time for **how_to_work_with_relay** files:
+**00:39.619** total execution time for **how_to_work_with_relay** files:
+----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_pipeline_executor.py` (``using_pipeline_executor.py``) | 00:32.381 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_pipeline_executor.py` (``using_pipeline_executor.py``) | 00:33.066 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``) | 00:10.108 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``) | 00:04.808 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``) | 00:01.504 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``) | 00:01.739 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_relay_using_relay_viz.py` (``using_relay_viz.py``) | 00:00.007 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
index 05abf38ea3..bfc08f2879 100644
--- a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
@@ -261,7 +261,7 @@ The following example customizes CUDA lowering rule for :code:`exp`.
.. code-block:: none
- <function my_cuda_math_rule at 0x7fbaf205d950>
+ <function my_cuda_math_rule at 0x7fe221957710>
diff --git a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
index 00915fc1f1..7b91cc3c84 100644
--- a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
@@ -5,22 +5,22 @@
Computation times
=================
-**00:07.619** total execution time for **how_to_work_with_schedules** files:
+**00:04.891** total execution time for **how_to_work_with_schedules** files:
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``) | 00:05.239 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``) | 00:02.341 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``) | 00:01.064 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``) | 00:01.189 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``) | 00:00.562 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``) | 00:00.582 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``) | 00:00.542 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``) | 00:00.567 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``) | 00:00.115 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``) | 00:00.114 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``) | 00:00.050 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``) | 00:00.029 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``) | 00:00.030 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``) | 00:00.019 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``) | 00:00.020 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
index 3fc8b9a9b1..d165ea9c53 100644
--- a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
@@ -343,7 +343,7 @@ The importing needs to happen before the tensorized GEMV being executed.
B: Buffer(B_2: Pointer(float32), float32, [512, 64], []),
C: Buffer(C_2: Pointer(float32), float32, [1024, 512], [])}
buffer_map = {A_1: A, B_1: B, C_1: C} {
- attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpf7dqtn78/input0.cc'\nsource_filename = \"/tmp/tmpf7dqtn78/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = alloca float*, align 8\n %8 = alloca float*, align 8\n %9 = alloca floa [...]
+ attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpbt8ked8y/input0.cc'\nsource_filename = \"/tmp/tmpbt8ked8y/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = alloca float*, align 8\n %8 = alloca float*, align 8\n %9 = alloca floa [...]
for (i, 0, 1024) {
for (j.outer: int32, 0, 32) {
@tir.call_extern("gemv_update", @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_2, ((i*512) + (j.outer*16)), 16, 2, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_2, (i*64), 64, 1, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_2, (j.outer*1024), 1024, 1, dtype=handle), 16, 64, 64, dtype=int32)
diff --git a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
index 4ecd64f830..69a45e75ff 100644
--- a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:26.300** total execution time for **topic_vta_tutorials_autotvm** files:
+**00:26.353** total execution time for **topic_vta_tutorials_autotvm** files:
+---------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:26.294 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:26.347 | 0.0 MB |
+---------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``) | 00:00.006 | 0.0 MB |
+---------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/autotvm/tune_relay_vta.rst.txt b/docs/_sources/topic/vta/tutorials/autotvm/tune_relay_vta.rst.txt
index ef00d71070..e909bd8040 100644
--- a/docs/_sources/topic/vta/tutorials/autotvm/tune_relay_vta.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/autotvm/tune_relay_vta.rst.txt
@@ -536,7 +536,7 @@ Finally, we launch tuning jobs and evaluate the end-to-end performance.
.. code-block:: none
Extract tasks...
- /workspace/python/tvm/target/target.py:277: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
+ /workspace/python/tvm/target/target.py:281: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
Extracted 10 conv2d tasks:
(1, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1)
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
index 275a432b27..872fe56d04 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
@@ -289,7 +289,7 @@ The compilation steps are:
DeprecationWarning,
/workspace/vta/tutorials/frontend/deploy_classification.py:213: DeprecationWarning: legacy graph executor behavior of producing json / lib / params will be removed in the next release. Please see documents of tvm.contrib.graph_executor.GraphModule for the new recommended usage.
relay_prog, target=tvm.target.Target(target, host=env.target_host), params=params
- resnet18_v1 inference graph built in 28.94s!
+ resnet18_v1 inference graph built in 29.77s!
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
index 6d04420280..7c90c68d38 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
@@ -333,7 +333,7 @@ The compilation steps are:
/workspace/python/tvm/relay/build_module.py:348: DeprecationWarning: Please use input parameter mod (tvm.IRModule) instead of deprecated parameter mod (tvm.relay.function.Function)
DeprecationWarning,
- yolov3-tiny inference graph built in 19.59s!
+ yolov3-tiny inference graph built in 19.71s!
diff --git a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
index e2b7bc83c1..ff0f722d0b 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**01:41.238** total execution time for **topic_vta_tutorials_frontend** files:
+**01:41.940** total execution time for **topic_vta_tutorials_frontend** files:
+------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``) | 00:52.315 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``) | 00:52.230 | 0.0 MB |
+------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:48.923 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:49.709 | 0.0 MB |
+------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
index 26e0202f73..fe2d01728e 100644
--- a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:03.156** total execution time for **topic_vta_tutorials_optimize** files:
+**00:03.147** total execution time for **topic_vta_tutorials_optimize** files:
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``) | 00:02.714 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``) | 00:02.669 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.442 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.478 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
index c4e550024c..8493b76db0 100644
--- a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:00.782** total execution time for **topic_vta_tutorials** files:
+**00:00.877** total execution time for **topic_vta_tutorials** files:
+---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.420 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.478 | 0.0 MB |
+---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.362 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.398 | 0.0 MB |
+---------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
index d934867674..4db846ecff 100644
--- a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
@@ -325,7 +325,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 94.283 ms
+ Execution time of this operator: 95.584 ms
@@ -425,7 +425,7 @@ resume the status and do more 5 trials.
Resume search:
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/xgboost/training.py:17: UserWarning: Old style callback is deprecated. See: https://xgboost.readthedocs.io/en/latest/python/callbacks.html
warnings.warn(f'Old style callback is deprecated. See: {link}', UserWarning)
- *E*E
+
@@ -443,7 +443,7 @@ operations.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 22.343 seconds)
+ **Total running time of the script:** ( 1 minutes 28.413 seconds)
.. _sphx_glr_download_tutorial_auto_scheduler_matmul_x86.py:
diff --git a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
index e5e44c05cd..f2ea568956 100644
--- a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
@@ -450,16 +450,16 @@ reduce variance, we take 5 measurements and average them.
waiting for device...
device available
Get devices for measurement successfully!
- No: 1 GFLOPS: 12.78/12.78 result: MeasureResult(costs=(0.021009644,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5190932750701904, timestamp=1669922888.4878695) [('tile_y', [-1, 128]), ('tile_x', [-1, 256])],None,87
- No: 2 GFLOPS: 13.36/13.36 result: MeasureResult(costs=(0.020093497800000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5232143402099609, timestamp=1669922889.0150733) [('tile_y', [-1, 256]), ('tile_x', [-1, 64])],None,68
- No: 3 GFLOPS: 0.51/13.36 result: MeasureResult(costs=(0.5271519793999999,), error_no=MeasureErrorNo.NO_ERROR, all_cost=8.584887504577637, timestamp=1669922898.3585112) [('tile_y', [-1, 128]), ('tile_x', [-1, 1])],None,7
- No: 4 GFLOPS: 4.16/13.36 result: MeasureResult(costs=(0.0645141038,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.198930025100708, timestamp=1669922900.3078878) [('tile_y', [-1, 16]), ('tile_x', [-1, 16])],None,44
- No: 5 GFLOPS: 3.63/13.36 result: MeasureResult(costs=(0.0740431838,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.3319170475006104, timestamp=1669922901.7687304) [('tile_y', [-1, 8]), ('tile_x', [-1, 8])],None,33
- No: 6 GFLOPS: 1.30/13.36 result: MeasureResult(costs=(0.2066784706,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.4441657066345215, timestamp=1669922905.9817302) [('tile_y', [-1, 2]), ('tile_x', [-1, 1])],None,1
- No: 7 GFLOPS: 11.50/13.36 result: MeasureResult(costs=(0.0233461152,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5285658836364746, timestamp=1669922906.5179656) [('tile_y', [-1, 128]), ('tile_x', [-1, 32])],None,57
- No: 8 GFLOPS: 12.30/13.36 result: MeasureResult(costs=(0.0218255396,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5507256984710693, timestamp=1669922907.077611) [('tile_y', [-1, 8]), ('tile_x', [-1, 256])],None,83
- No: 9 GFLOPS: 1.54/13.36 result: MeasureResult(costs=(0.1743730842,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.918978691101074, timestamp=1669922910.1096945) [('tile_y', [-1, 32]), ('tile_x', [-1, 4])],None,25
- No: 10 GFLOPS: 3.15/13.36 result: MeasureResult(costs=(0.08522196679999999,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4893817901611328, timestamp=1669922911.6363122) [('tile_y', [-1, 2]), ('tile_x', [-1, 8])],None,31
+ No: 1 GFLOPS: 10.94/10.94 result: MeasureResult(costs=(0.0245335456,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5536513328552246, timestamp=1669923766.7386873) [('tile_y', [-1, 2]), ('tile_x', [-1, 256])],None,81
+ No: 2 GFLOPS: 11.69/11.69 result: MeasureResult(costs=(0.022963718,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5923411846160889, timestamp=1669923767.312237) [('tile_y', [-1, 32]), ('tile_x', [-1, 32])],None,55
+ No: 3 GFLOPS: 12.35/12.35 result: MeasureResult(costs=(0.021730554399999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.520205020904541, timestamp=1669923768.5764506) [('tile_y', [-1, 8]), ('tile_x', [-1, 256])],None,83
+ No: 4 GFLOPS: 3.92/12.35 result: MeasureResult(costs=(0.0685608216,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.2628114223480225, timestamp=1669923770.5878031) [('tile_y', [-1, 64]), ('tile_x', [-1, 16])],None,46
+ No: 5 GFLOPS: 9.95/12.35 result: MeasureResult(costs=(0.0269886844,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6414477825164795, timestamp=1669923771.3489761) [('tile_y', [-1, 512]), ('tile_x', [-1, 128])],None,79
+ No: 6 GFLOPS: 12.63/12.63 result: MeasureResult(costs=(0.021254071,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.516197681427002, timestamp=1669923771.859938) [('tile_y', [-1, 32]), ('tile_x', [-1, 128])],None,75
+ No: 7 GFLOPS: 10.85/12.63 result: MeasureResult(costs=(0.024742156199999997,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6028096675872803, timestamp=1669923773.178632) [('tile_y', [-1, 4]), ('tile_x', [-1, 128])],None,72
+ No: 8 GFLOPS: 10.27/12.63 result: MeasureResult(costs=(0.026141716200000005,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6350319385528564, timestamp=1669923773.802411) [('tile_y', [-1, 4]), ('tile_x', [-1, 64])],None,62
+ No: 9 GFLOPS: 9.04/12.63 result: MeasureResult(costs=(0.029707870600000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.598355770111084, timestamp=1669923774.5139868) [('tile_y', [-1, 4]), ('tile_x', [-1, 32])],None,52
+ No: 10 GFLOPS: 2.72/12.63 result: MeasureResult(costs=(0.0988073784,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.700753927230835, timestamp=1669923776.2644334) [('tile_y', [-1, 2]), ('tile_x', [-1, 8])],None,31
diff --git a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
index 49845bd699..deef26ea54 100644
--- a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
@@ -320,7 +320,7 @@ standard deviation.
.. code-block:: none
- {'mean': 515.244299470005, 'median': 515.3066395999986, 'std': 1.5667276729237114}
+ {'mean': 516.2943852100125, 'median': 515.8442538499912, 'std': 1.816301196366439}
@@ -554,29 +554,31 @@ the tuning data to.
.. code-block:: none
-
[Task 1/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 1/25] Current/Best: 7.61/ 17.45 GFLOPS | Progress: (4/20) | 10.50 s
[Task 1/25] Current/Best: 10.80/ 18.68 GFLOPS | Progress: (8/20) | 13.52 s
[Task 1/25] Current/Best: 7.57/ 18.68 GFLOPS | Progress: (12/20) | 16.90 s
[Task 1/25] Current/Best: 19.56/ 19.56 GFLOPS | Progress: (16/20) | 19.46 s
[Task 1/25] Current/Best: 15.24/ 19.56 GFLOPS | Progress: (20/20) | 22.15 s Done.
-
[Task 2/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 2/25] Current/Best: 16.77/ 17.77 GFLOPS | Progress: (4/20) | 2.63 s
[Task 2/25] Current/Best: 4.50/ 17.77 GFLOPS | Progress: (8/20) | 3.94 s
[Task 2/25] Current/Best: 9.69/ 17.77 GFLOPS | Progress: (12/20) | 5.36 s
[Task 2/25] Current/Best: 5.41/ 17.77 GFLOPS | Progress: (16/20) | 6.60 s
[Task 2/25] Current/Best: 11.63/ 19.08 GFLOPS | Progress: (20/20) | 7.80 s Done.
-
[Task 3/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 3/25] Current/Best: 12.23/ 14.87 GFLOPS | Progress: (4/20) | 3.51 s
[Task 3/25] Current/Best: 15.31/ 15.78 GFLOPS | Progress: (8/20) | 6.08 s
[Task 3/25] Current/Best: 3.14/ 17.15 GFLOPS | Progress: (12/20) | 8.72 s
[Task 3/25] Current/Best: 8.87/ 18.26 GFLOPS | Progress: (16/20) | 11.63 s
[Task 3/25] Current/Best: 21.16/ 21.16 GFLOPS | Progress: (20/20) | 13.61 s Done.
-
[Task 4/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 4/25] Current/Best: 12.18/ 18.10 GFLOPS | Progress: (4/20) | 2.99 s
[Task 4/25] Current/Best: 22.22/ 22.22 GFLOPS | Progress: (8/20) | 5.18 s
[Task 4/25] Current/Best: 10.07/ 22.22 GFLOPS | Progress: (12/20) | 9.42 s
[Task 4/25] Current/Best: 12.38/ 22.22 GFLOPS | Progress: (16/20) | 11.29 s
[Task 4/25] Current/Best: 15.15/ 22.22 GFLOPS | Progress: (20/20) | 15.12 s Done.
-
[Task 5/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 5/25] Current/Best: 8.19/ 11.64 GFLOPS | Progress: (4/20) | 3.80 s
[Task 5/25] Current/Best: 4.75/ 14.30 GFLOPS | Progress: (8/20) | 5.83 s
[Task 5/25] Current/Best: 3.73/ 21.91 GFLOPS | Progress: (12/20) | 8.04 s
[Task 5/25] Current/Best: 22.87/ 22.87 GFLOPS | Progress: (16/20) | 9.76 s
[Task 5/25] Current/Best: 6.11/ 22.87 GFLOPS | Progress: (20/20) | 11.37 s Done.
-
[Task 6/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 6/25] Current/Best: 17.79/ 22.19 GFLOPS | Progress: (4/20) | 3.31 s
[Task 6/25] Current/Best: 9.74/ 22.19 GFLOPS | Progress: (8/20) | 5.34 s
[Task 6/25] Current/Best: 20.52/ 22.19 GFLOPS | Progress: (12/20) | 8.35 s
[Task 6/25] Current/Best: 2.61/ 22.19 GFLOPS | Progress: (16/20) | 10.63 s
[Task 6/25] Current/Best: 4.36/ 22.19 GFLOPS | Progress: (20/20) | 12.68 s Done.
-
[Task 7/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 7/25] Current/Best: 13.71/ 17.84 GFLOPS | Progress: (4/20) | 3.85 s
[Task 7/25] Current/Best: 16.28/ 17.84 GFLOPS | Progress: (8/20) | 5.70 s
[Task 7/25] Current/Best: 16.12/ 18.65 GFLOPS | Progress: (12/20) | 7.42 s
[Task 7/25] Current/Best: 18.82/ 18.82 GFLOPS | Progress: (16/20) | 9.14 s
[Task 7/25] Current/Best: 8.61/ 18.82 GFLOPS | Progress: (20/20) | 11.00 s Done.
-
[Task 8/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 8/25] Current/Best: 9.46/ 12.18 GFLOPS | Progress: (4/20) | 13.90 s
[Task 8/25] Current/Best: 15.32/ 15.32 GFLOPS | Progress: (8/20) | 19.38 s
[Task 8/25] Current/Best: 7.41/ 15.32 GFLOPS | Progress: (12/20) | 26.68 s
[Task 8/25] Current/Best: 5.03/ 15.32 GFLOPS | Progress: (16/20) | 32.91 s
[Task 8/25] Current/Best: 14.18/ 20.19 GFLOPS | Progress: (20/20) | 35.30 s
[Task 9/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 9/25] Current/Best: 5.17/ 18.86 GFLOPS | Progress: (4/20) | 4.78 s
[Task 9/25] Current/Best: 14.40/ 18.86 GFLOPS | Progress: (8/20) | 6.46 s
[Task 9/25] Current/Best: 17.13/ 18.86 GFLOPS | Progress: (12/20) | 11.02 s
[Task 9/25] Current/Best: 8.00/ 18.86 GFLOPS | Progress: (16/20) | 15.69 s
[Task 9/25] Current/Best: 16.11/ 20.89 GFLOPS | Progress: (20/
20) | 22.39 s Done.
-
[Task 10/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 10/25] Current/Best: 12.87/ 16.40 GFLOPS | Progress: (4/20) | 3.18 s
[Task 10/25] Current/Best: 10.86/ 16.40 GFLOPS | Progress: (8/20) | 5.08 s
[Task 10/25] Current/Best: 14.39/ 20.02 GFLOPS | Progress: (12/20) | 6.27 s
[Task 10/25] Current/Best: 13.26/ 20.02 GFLOPS | Progress: (16/20) | 7.62 s
[Task 10/25] Current/Best: 13.39/ 20.02 GFLOPS | Progress: (20/20) | 10.17 s Done.
-
[Task 11/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 11/25] Current/Best: 1.57/ 11.39 GFLOPS | Progress: (4/20) | 5.28 s
[Task 11/25] Current/Best: 8.87/ 15.17 GFLOPS | Progress: (8/20) | 7.63 s
[Task 11/25] Current/Best: 10.06/ 19.01 GFLOPS | Progress: (12/20) | 9.70 s
[Task 11/25] Current/Best: 11.45/ 22.15 GFLOPS | Progress: (16/20) | 12.83 s
[Task 11/25] Current/Best: 7.82/ 22.15 GFLOPS | Progress: (20/20) | 15.10 s Done.
-
[Task 12/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 12/25] Current/Best: 14.46/ 20.64 GFLOPS | Progress: (4/20) | 3.59 s
[Task 12/25] Current/Best: 12.82/ 20.64 GFLOPS | Progress: (8/20) | 5.80 s
[Task 12/25] Current/Best: 3.03/ 20.64 GFLOPS | Progress: (12/20) | 8.87 s
[Task 12/25] Current/Best: 13.68/ 20.64 GFLOPS | Progress: (16/20) | 11.93 s
[Task 12/25] Current/Best: 11.34/ 20.64 GFLOPS | Progress: (20/20) | 17.19 s Done.
-
[Task 13/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 13/25] Current/Best: 4.34/ 18.37 GFLOPS | Progress: (4/20) | 3.59 s
[Task 13/25] Current/Best: 8.56/ 21.09 GFLOPS | Progress: (8/20) | 5.51 s
[Task 13/25] Current/Best: 12.99/ 21.09 GFLOPS | Progress: (12/20) | 7.52 s
[Task 13/25] Current/Best: 17.78/ 21.09 GFLOPS | Progress: (16/20) | 10.24 s
[Task 13/25] Current/Best: 4.55/ 22.44 GFLOPS | Progress: (20/20) | 12.55 s Done.
-
[Task 14/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 14/25] Current/Best: 2.82/ 13.22 GFLOPS | Progress: (4/20) | 5.26 s
[Task 14/25] Current/Best: 15.87/ 21.06 GFLOPS | Progress: (8/20) | 6.90 s
[Task 14/25] Current/Best: 9.38/ 21.06 GFLOPS | Progress: (12/20) | 9.67 s
[Task 14/25] Current/Best: 11.67/ 21.06 GFLOPS | Progress: (16/20) | 13.49 s Done.
-
[Task 14/25] Current/Best: 8.95/ 21.06 GFLOPS | Progress: (20/20) | 19.37 s Done.
-
[Task 15/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 15/25] Current/Best: 15.89/ 19.73 GFLOPS | Progress: (4/20) | 2.81 s
[Task 15/25] Current/Best: 20.51/ 20.51 GFLOPS | Progress: (8/20) | 4.68 s
[Task 15/25] Current/Best: 12.12/ 22.84 GFLOPS | Progress: (12/20) | 7.62 s
[Task 15/25] Current/Best: 19.88/ 22.84 GFLOPS | Progress: (16/20) | 15.65 s
[Task 15/25] Current/Best: 8.45/ 22.84 GFLOPS | Progress: (20/20) | 17.23 s
[Task 16/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 16/25] Current/Best: 19.36/ 19.36 GFLOPS | Progress: (4/20) | 4.83 s
[Task 16/25] Current/Best: 14.00/ 19.36 GFLOPS | Progress: (8/20) | 6.42 s
[Task 16/25] Current/Best: 11.71/ 19.88 GFLOPS | Progress: (12/20) | 7.67 s
[Task 16/25] Current/Best: 9.54/ 19.88 GFLOPS | Progress: (16/20) | 9.47 s
[Task 16/25] Current/Best: 12.67/ 19.88 GFLOPS | Progress: (20/20) |
12.50 s Done.
-
[Task 17/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 17/25] Current/Best: 7.98/ 22.45 GFLOPS | Progress: (4/20) | 4.40 s
[Task 17/25] Current/Best: 15.36/ 22.45 GFLOPS | Progress: (8/20) | 6.71 s
[Task 17/25] Current/Best: 11.89/ 22.45 GFLOPS | Progress: (12/20) | 9.69 s
[Task 17/25] Current/Best: 16.65/ 22.45 GFLOPS | Progress: (16/20) | 11.83 s
[Task 17/25] Current/Best: 19.19/ 22.89 GFLOPS | Progress: (20/20) | 13.36 s Done.
-
[Task 18/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 18/25] Current/Best: 17.61/ 17.61 GFLOPS | Progress: (4/20) | 8.48 s
[Task 18/25] Current/Best: 21.07/ 21.07 GFLOPS | Progress: (8/20) | 10.53 s
[Task 18/25] Current/Best: 14.94/ 21.07 GFLOPS | Progress: (12/20) | 13.58 s
[Task 18/25] Current/Best: 11.99/ 21.07 GFLOPS | Progress: (16/20) | 15.61 s
[Task 18/25] Current/Best: 7.57/ 21.07 GFLOPS | Progress: (20/20) | 20.47 s Done.
-
[Task 19/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 19/25] Current/Best: 9.37/ 18.07 GFLOPS | Progress: (4/20) | 3.59 s
[Task 19/25] Current/Best: 12.02/ 18.07 GFLOPS | Progress: (8/20) | 8.84 s
[Task 19/25] Current/Best: 12.11/ 20.82 GFLOPS | Progress: (12/20) | 11.83 s
[Task 19/25] Current/Best: 5.36/ 20.82 GFLOPS | Progress: (16/20) | 14.02 s
[Task 19/25] Current/Best: 4.70/ 20.82 GFLOPS | Progress: (20/20) | 18.17 s Done.
-
[Task 20/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 20/25] Current/Best: 1.57/ 16.60 GFLOPS | Progress: (4/20) | 5.94 s
[Task 20/25] Current/Best: 9.46/ 16.60 GFLOPS | Progress: (8/20) | 8.92 s
[Task 20/25] Current/Best: 8.76/ 16.60 GFLOPS | Progress: (12/20) | 10.95 s
[Task 20/25] Current/Best: 5.20/ 17.43 GFLOPS | Progress: (16/20) | 15.30 s Done.
-
[Task 20/25] Current/Best: 9.23/ 18.22 GFLOPS | Progress: (20/20) | 18.37 s
[Task 21/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 21/25] Current/Best: 8.74/ 8.74 GFLOPS | Progress: (4/20) | 4.05 s
[Task 21/25] Current/Best: 12.06/ 16.10 GFLOPS | Progress: (8/20) | 7.40 s
[Task 21/25] Current/Best: 2.71/ 16.10 GFLOPS | Progress: (12/20) | 10.75 s
[Task 21/25] Current/Best: 6.86/ 21.95 GFLOPS | Progress: (16/20) | 12.91 s
[Task 21/25] Current/Best: 6.27/ 21.95 GFLOPS | Progress: (20/20) | 15.03 s
[Task 22/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 22/25] Current/Best: 5.13/ 21.06 GFLOPS | Progress: (4/20) | 3.43 s
[Task 22/25] Current/Best: 6.45/ 21.06 GFLOPS | Progress: (8/20) | 8.26 s
[Task 22/25] Current/Best: 14.37/ 21.06 GFLOPS | Progress: (12/20) | 9.73 s
[Task 22/25] Current/Best: 13.99/ 21.06 GFLOPS | Progress: (16/20)
| 11.42 s
[Task 22/25] Current/Best: 16.08/ 21.06 GFLOPS | Progress: (20/20) | 12.85 s Done.
-
[Task 23/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 23/25] Current/Best: 17.14/ 20.25 GFLOPS | Progress: (4/20) | 3.37 s
[Task 23/25] Current/Best: 20.76/ 20.76 GFLOPS | Progress: (8/20) | 5.99 s
[Task 23/25] Current/Best: 9.98/ 20.76 GFLOPS | Progress: (12/20) | 8.29 s
[Task 23/25] Current/Best: 18.07/ 20.76 GFLOPS | Progress: (16/20) | 10.07 s
[Task 23/25] Current/Best: 10.84/ 23.75 GFLOPS | Progress: (20/20) | 12.97 s Done.
-
[Task 24/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 24/25] Current/Best: 2.99/ 3.29 GFLOPS | Progress: (4/20) | 12.26 s
[Task 24/25] Current/Best: 8.26/ 10.09 GFLOPS | Progress: (8/20) | 22.73 s Done.
-
[Task 24/25] Current/Best: 0.97/ 10.09 GFLOPS | Progress: (12/20) | 34.64 s
[Task 24/25] Current/Best: 5.43/ 10.09 GFLOPS | Progress: (16/20) | 46.00 s
[Task 24/25] Current/Best: 3.56/ 10.09 GFLOPS | Progress: (20/20) | 57.65 s
[Task 25/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 25/25] Current/Best: 1.55/ 6.18 GFLOPS | Progress: (4/20) | 3.39 s
[Task 25/25] Current/Best: 8.94/ 8.94 GFLOPS | Progress: (8/20) | 14.07 s
[Task 25/25] Current/Best: 9.44/ 9.44 GFLOPS | Progress: (12/20) | 24.80 s
[Task 25/25] Current/Best: 8.45/ 9.44 GFLOPS | Progress: (16/20) | 35.53 s
[Task 25/25] Current/Best: 7.78/ 9.44 GFLOPS | Progress: (20/20) | 46.20 s
+
[Task 1/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 1/25] Current/Best: 14.58/ 14.58 GFLOPS | Progress: (4/20) | 6.99 s
[Task 1/25] Current/Best: 12.95/ 14.58 GFLOPS | Progress: (8/20) | 9.93 s
[Task 1/25] Current/Best: 9.07/ 22.22 GFLOPS | Progress: (12/20) | 12.53 s
[Task 1/25] Current/Best: 12.50/ 22.22 GFLOPS | Progress: (16/20) | 15.84 s
[Task 1/25] Current/Best: 3.21/ 22.22 GFLOPS | Progress: (20/20) | 18.30 s Done.
+
[Task 2/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 2/25] Current/Best: 16.94/ 17.03 GFLOPS | Progress: (4/20) | 2.90 s
[Task 2/25] Current/Best: 5.72/ 18.37 GFLOPS | Progress: (8/20) | 4.02 s
[Task 2/25] Current/Best: 17.17/ 18.37 GFLOPS | Progress: (12/20) | 5.25 s
[Task 2/25] Current/Best: 11.64/ 18.37 GFLOPS | Progress: (16/20) | 7.12 s
[Task 2/25] Current/Best: 17.97/ 18.37 GFLOPS | Progress: (20/20) | 8.38 s Done.
+
[Task 3/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 3/25] Current/Best: 3.18/ 15.42 GFLOPS | Progress: (4/20) | 4.51 s
[Task 3/25] Current/Best: 6.58/ 15.42 GFLOPS | Progress: (8/20) | 7.35 s
[Task 3/25] Current/Best: 12.88/ 23.12 GFLOPS | Progress: (12/20) | 9.22 s
[Task 3/25] Current/Best: 14.49/ 23.42 GFLOPS | Progress: (16/20) | 11.19 s
[Task 3/25] Current/Best: 10.48/ 23.42 GFLOPS | Progress: (20/20) | 13.06 s Done.
+
[Task 4/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 4/25] Current/Best: 12.52/ 15.32 GFLOPS | Progress: (4/20) | 3.27 s
[Task 4/25] Current/Best: 16.82/ 18.75 GFLOPS | Progress: (8/20) | 4.90 s
[Task 4/25] Current/Best: 13.73/ 19.53 GFLOPS | Progress: (12/20) | 6.80 s
[Task 4/25] Current/Best: 7.01/ 20.10 GFLOPS | Progress: (16/20) | 10.85 s
[Task 4/25] Current/Best: 8.81/ 21.11 GFLOPS | Progress: (20/20) | 18.57 s Done.
+
[Task 5/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 5/25] Current/Best: 15.96/ 15.96 GFLOPS | Progress: (4/20) | 3.00 s
[Task 5/25] Current/Best: 4.19/ 15.96 GFLOPS | Progress: (8/20) | 5.77 s
[Task 5/25] Current/Best: 4.35/ 18.99 GFLOPS | Progress: (12/20) | 7.75 s
[Task 5/25] Current/Best: 4.68/ 18.99 GFLOPS | Progress: (16/20) | 9.39 s
[Task 5/25] Current/Best: 13.59/ 18.99 GFLOPS | Progress: (20/20) | 11.16 s Done.
+
[Task 6/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 6/25] Current/Best: 20.45/ 20.45 GFLOPS | Progress: (4/20) | 4.02 s
[Task 6/25] Current/Best: 10.45/ 23.04 GFLOPS | Progress: (8/20) | 5.56 s
[Task 6/25] Current/Best: 11.71/ 23.04 GFLOPS | Progress: (12/20) | 9.05 s
[Task 6/25] Current/Best: 10.92/ 23.04 GFLOPS | Progress: (16/20) | 10.90 s
[Task 6/25] Current/Best: 12.98/ 23.04 GFLOPS | Progress: (20/20) | 13.59 s Done.
+
[Task 7/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 7/25] Current/Best: 14.02/ 19.23 GFLOPS | Progress: (4/20) | 3.96 s
[Task 7/25] Current/Best: 11.45/ 20.12 GFLOPS | Progress: (8/20) | 6.38 s
[Task 7/25] Current/Best: 15.50/ 20.12 GFLOPS | Progress: (12/20) | 8.76 s
[Task 7/25] Current/Best: 15.83/ 20.12 GFLOPS | Progress: (16/20) | 11.03 s
[Task 7/25] Current/Best: 15.97/ 20.12 GFLOPS | Progress: (20/20) | 13.74 s Done.
+
[Task 8/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 8/25] Current/Best: 12.07/ 20.51 GFLOPS | Progress: (4/20) | 5.36 s
[Task 8/25] Current/Best: 2.90/ 20.51 GFLOPS | Progress: (8/20) | 9.18 s
[Task 8/25] Current/Best: 12.14/ 20.51 GFLOPS | Progress: (12/20) | 12.05 s
[Task 8/25] Current/Best: 8.99/ 20.51 GFLOPS | Progress: (16/20) | 23.33 s
[Task 8/25] Current/Best: 22.95/ 22.95 GFLOPS | Progress: (20/20) | 26.00 s
[Task 9/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 9/25] Current/Best: 6.03/ 19.23 GFLOPS | Progress: (4/20) | 8.88 s
[Task 9/25] Current/Best: 19.35/ 19.35 GFLOPS | Progress: (8/20) | 10.21 s
[Task 9/25] Current/Best: 12.09/ 19.35 GFLOPS | Progress: (12/20) | 11.81 s
[Task 9/25] Current/Best: 4.81/ 19.35 GFLOPS | Progress: (16/20) | 16.97 s
[Task 9/25] Current/Best: 14.40/ 19.35 GFLOPS | Progress: (20/2
0) | 19.32 s Done.
+
[Task 10/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 10/25] Current/Best: 14.97/ 15.19 GFLOPS | Progress: (4/20) | 2.89 s
[Task 10/25] Current/Best: 4.06/ 20.03 GFLOPS | Progress: (8/20) | 4.65 s
[Task 10/25] Current/Best: 5.44/ 20.03 GFLOPS | Progress: (12/20) | 6.98 s
[Task 10/25] Current/Best: 13.12/ 20.03 GFLOPS | Progress: (16/20) | 9.55 s
[Task 10/25] Current/Best: 12.39/ 20.03 GFLOPS | Progress: (20/20) | 11.94 s Done.
+
[Task 11/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 11/25] Current/Best: 16.31/ 16.31 GFLOPS | Progress: (4/20) | 4.09 s
[Task 11/25] Current/Best: 18.37/ 21.45 GFLOPS | Progress: (8/20) | 5.89 s
[Task 11/25] Current/Best: 12.69/ 21.97 GFLOPS | Progress: (12/20) | 8.00 s
[Task 11/25] Current/Best: 6.72/ 21.97 GFLOPS | Progress: (16/20) | 11.51 s
[Task 11/25] Current/Best: 17.85/ 21.97 GFLOPS | Progress: (20/20) | 13.52 s Done.
+
[Task 12/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 12/25] Current/Best: 20.46/ 20.46 GFLOPS | Progress: (4/20) | 3.85 s
[Task 12/25] Current/Best: 20.77/ 20.77 GFLOPS | Progress: (8/20) | 6.18 s
[Task 12/25] Current/Best: 14.99/ 21.65 GFLOPS | Progress: (12/20) | 12.17 s
[Task 12/25] Current/Best: 21.79/ 21.79 GFLOPS | Progress: (16/20) | 15.77 s
[Task 12/25] Current/Best: 14.20/ 21.79 GFLOPS | Progress: (20/20) | 18.79 s Done.
+
[Task 13/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 13/25] Current/Best: 14.57/ 20.60 GFLOPS | Progress: (4/20) | 3.84 s
[Task 13/25] Current/Best: 14.02/ 21.00 GFLOPS | Progress: (8/20) | 5.78 s
[Task 13/25] Current/Best: 15.17/ 21.53 GFLOPS | Progress: (12/20) | 8.51 s
[Task 13/25] Current/Best: 11.33/ 21.53 GFLOPS | Progress: (16/20) | 11.66 s
[Task 13/25] Current/Best: 3.11/ 21.53 GFLOPS | Progress: (20/20) | 15.88 s Done.
+
[Task 14/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 14/25] Current/Best: 10.47/ 11.90 GFLOPS | Progress: (4/20) | 3.82 s
[Task 14/25] Current/Best: 15.54/ 15.54 GFLOPS | Progress: (8/20) | 6.05 s
[Task 14/25] Current/Best: 15.69/ 16.63 GFLOPS | Progress: (12/20) | 8.03 s
[Task 14/25] Current/Best: 22.20/ 22.20 GFLOPS | Progress: (16/20) | 10.12 s
[Task 14/25] Current/Best: 14.54/ 22.20 GFLOPS | Progress: (20/20) | 12.22 s
[Task 15/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 15/25] Current/Best: 17.41/ 17.41 GFLOPS | Progress: (4/20) | 3.51 s
[Task 15/25] Current/Best: 13.49/ 17.41 GFLOPS | Progress: (8/20) | 9.81 s
[Task 15/25] Current/Best: 17.21/ 17.41 GFLOPS | Progress: (12/20) | 11.64 s
[Task 15/25] Current/Best: 13.27/ 20.73 GFLOPS | Progress: (16/20) | 14.19 s
[Task 15/25] Current/Best: 18.29/ 21.40 GFLOPS | Progress: (20/20)
| 15.31 s Done.
+
[Task 16/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 16/25] Current/Best: 4.74/ 15.38 GFLOPS | Progress: (4/20) | 3.48 s
[Task 16/25] Current/Best: 10.47/ 18.29 GFLOPS | Progress: (8/20) | 5.17 s
[Task 16/25] Current/Best: 5.85/ 18.29 GFLOPS | Progress: (12/20) | 6.56 s
[Task 16/25] Current/Best: 16.52/ 18.29 GFLOPS | Progress: (16/20) | 9.26 s
[Task 16/25] Current/Best: 7.57/ 20.08 GFLOPS | Progress: (20/20) | 12.12 s Done.
+
[Task 17/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 17/25] Current/Best: 16.36/ 17.92 GFLOPS | Progress: (4/20) | 3.47 s Done.
+ Done.
+
[Task 17/25] Current/Best: 16.07/ 22.22 GFLOPS | Progress: (8/20) | 5.46 s
[Task 17/25] Current/Best: 10.30/ 22.22 GFLOPS | Progress: (12/20) | 7.94 s
[Task 17/25] Current/Best: 11.85/ 22.22 GFLOPS | Progress: (16/20) | 10.16 s
[Task 17/25] Current/Best: 3.08/ 22.22 GFLOPS | Progress: (20/20) | 12.58 s Done.
+
[Task 18/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 18/25] Current/Best: 13.30/ 19.05 GFLOPS | Progress: (4/20) | 3.29 s
[Task 18/25] Current/Best: 11.79/ 19.05 GFLOPS | Progress: (8/20) | 6.73 s
[Task 18/25] Current/Best: 5.05/ 20.57 GFLOPS | Progress: (12/20) | 12.26 s
[Task 18/25] Current/Best: 16.08/ 20.57 GFLOPS | Progress: (16/20) | 15.27 s
[Task 18/25] Current/Best: 10.17/ 20.57 GFLOPS | Progress: (20/20) | 18.69 s Done.
+
[Task 19/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 19/25] Current/Best: 12.52/ 19.43 GFLOPS | Progress: (4/20) | 3.96 s
[Task 19/25] Current/Best: 7.75/ 20.69 GFLOPS | Progress: (8/20) | 6.44 s
[Task 19/25] Current/Best: 9.01/ 20.69 GFLOPS | Progress: (12/20) | 13.37 s
[Task 19/25] Current/Best: 19.92/ 20.69 GFLOPS | Progress: (16/20) | 15.63 s
[Task 19/25] Current/Best: 11.27/ 20.69 GFLOPS | Progress: (20/20) | 18.64 s Done.
+
[Task 20/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 20/25] Current/Best: 16.27/ 16.27 GFLOPS | Progress: (4/20) | 3.10 s
[Task 20/25] Current/Best: 9.89/ 16.27 GFLOPS | Progress: (8/20) | 7.19 s
[Task 20/25] Current/Best: 10.50/ 16.27 GFLOPS | Progress: (12/20) | 10.11 s
[Task 20/25] Current/Best: 8.08/ 16.27 GFLOPS | Progress: (16/20) | 11.62 s
[Task 20/25] Current/Best: 11.06/ 16.27 GFLOPS | Progress: (20/20) | 15.24 s
[Task 21/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 21/25] Current/Best: 6.46/ 17.90 GFLOPS | Progress: (4/20) | 2.89 s
[Task 21/25] Current/Best: 8.92/ 20.19 GFLOPS | Progress: (8/20) | 5.33 s
[Task 21/25] Current/Best: 17.78/ 20.19 GFLOPS | Progress: (12/20) | 6.69 s
[Task 21/25] Current/Best: 16.52/ 20.19 GFLOPS | Progress: (16/20) | 9.23 s Done.
+
[Task 21/25] Current/Best: 5.40/ 20.19 GFLOPS | Progress: (20/20) | 10.67 s Done.
+
[Task 22/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 22/25] Current/Best: 8.15/ 14.75 GFLOPS | Progress: (4/20) | 3.20 s
[Task 22/25] Current/Best: 14.33/ 14.75 GFLOPS | Progress: (8/20) | 5.46 s
[Task 22/25] Current/Best: 11.13/ 19.80 GFLOPS | Progress: (12/20) | 7.34 s
[Task 22/25] Current/Best: 21.39/ 21.39 GFLOPS | Progress: (16/20) | 8.79 s
[Task 22/25] Current/Best: 2.69/ 21.39 GFLOPS | Progress: (20/20) | 10.93 s Done.
+
[Task 23/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 23/25] Current/Best: 20.88/ 20.88 GFLOPS | Progress: (4/20) | 4.09 s
[Task 23/25] Current/Best: 11.87/ 20.88 GFLOPS | Progress: (8/20) | 7.84 s
[Task 23/25] Current/Best: 12.29/ 20.88 GFLOPS | Progress: (12/20) | 11.03 s
[Task 23/25] Current/Best: 5.36/ 20.88 GFLOPS | Progress: (16/20) | 14.62 s
[Task 23/25] Current/Best: 18.65/ 20.88 GFLOPS | Progress: (20/20) | 16.67 s Done.
+
[Task 24/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 24/25] Current/Best: 3.77/ 10.21 GFLOPS | Progress: (4/20) | 9.12 s
[Task 24/25] Current/Best: 5.95/ 10.21 GFLOPS | Progress: (8/20) | 10.43 s
[Task 24/25] Current/Best: 1.65/ 10.21 GFLOPS | Progress: (12/20) | 20.96 s
[Task 24/25] Current/Best: 3.39/ 10.21 GFLOPS | Progress: (16/20) | 32.60 s
[Task 24/25] Current/Best: 3.06/ 10.21 GFLOPS | Progress: (20/20) | 35.55 s
[Task 25/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 25/25] Current/Best: 1.55/ 5.58 GFLOPS | Progress: (4/20) | 12.07 s Done.
+
[Task 25/25] Current/Best: 5.93/ 5.93 GFLOPS | Progress: (8/20) | 22.79 s
[Task 25/25] Current/Best: 8.33/ 9.70 GFLOPS | Progress: (12/20) | 24.68 s
[Task 25/25] Current/Best: 5.60/ 9.70 GFLOPS | Progress: (16/20) | 29.66 s
[Task 25/25] Current/Best: 8.97/ 9.70 GFLOPS | Progress: (20/20) | 31.91 s
@@ -672,8 +674,8 @@ Verify that the optimized model runs and produces the same results:
.. code-block:: none
- class='n02123045 tabby, tabby cat' with probability=0.621103
- class='n02123159 tiger cat' with probability=0.356379
+ class='n02123045 tabby, tabby cat' with probability=0.621102
+ class='n02123159 tiger cat' with probability=0.356380
class='n02124075 Egyptian cat' with probability=0.019712
class='n02129604 tiger, Panthera tigris' with probability=0.001215
class='n04040759 radiator' with probability=0.000262
@@ -730,8 +732,8 @@ improvement in comparing the optimized model to the unoptimized model.
.. code-block:: none
- optimized: {'mean': 400.2628620400037, 'median': 399.3192601999908, 'std': 3.869613145973296}
- unoptimized: {'mean': 515.244299470005, 'median': 515.3066395999986, 'std': 1.5667276729237114}
+ optimized: {'mean': 416.0831625300034, 'median': 416.72678354998425, 'std': 2.0900702015451893}
+ unoptimized: {'mean': 516.2943852100125, 'median': 515.8442538499912, 'std': 1.816301196366439}
@@ -754,7 +756,7 @@ profiling/benchmarking.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 11 minutes 16.323 seconds)
+ **Total running time of the script:** ( 10 minutes 24.624 seconds)
.. _sphx_glr_download_tutorial_autotvm_relay_x86.py:
diff --git a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
index ee4ef016f7..7d7e32aa6a 100644
--- a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
+++ b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
@@ -270,7 +270,7 @@ device and returns the measured cost. Network overhead is excluded.
.. code-block:: none
- 1.246e-07 secs/op
+ 1.261e-07 secs/op
diff --git a/docs/_sources/tutorial/intro_topi.rst.txt b/docs/_sources/tutorial/intro_topi.rst.txt
index fa1e53f921..027e221c6b 100644
--- a/docs/_sources/tutorial/intro_topi.rst.txt
+++ b/docs/_sources/tutorial/intro_topi.rst.txt
@@ -211,7 +211,7 @@ we can schedule the following series of operations ending with :code:`topi.sum`
.. code-block:: none
- /workspace/python/tvm/target/target.py:393: UserWarning: Try specifying cuda arch by adding 'arch=sm_xx' to your target.
+ /workspace/python/tvm/target/target.py:397: UserWarning: Try specifying cuda arch by adding 'arch=sm_xx' to your target.
warnings.warn("Try specifying cuda arch by adding 'arch=sm_xx' to your target.")
@main = primfn(a_1: handle, b_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
@@ -260,7 +260,7 @@ As you can see, scheduled stages of computation have been accumulated and we can
.. code-block:: none
- [stage(a, placeholder(a, 0x14cd01b0)), stage(b, placeholder(b, 0x9470f00)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min [...]
+ [stage(a, placeholder(a, 0x2d9f38b0)), stage(b, placeholder(b, 0x27751c30)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(mi [...]
diff --git a/docs/_sources/tutorial/relay_quick_start.rst.txt b/docs/_sources/tutorial/relay_quick_start.rst.txt
index f292a73060..ab6521b26b 100644
--- a/docs/_sources/tutorial/relay_quick_start.rst.txt
+++ b/docs/_sources/tutorial/relay_quick_start.rst.txt
@@ -257,7 +257,7 @@ in this example. Then the machine code will be generated as the module library.
.. code-block:: none
- /workspace/python/tvm/target/target.py:393: UserWarning: Try specifying cuda arch by adding 'arch=sm_xx' to your target.
+ /workspace/python/tvm/target/target.py:397: UserWarning: Try specifying cuda arch by adding 'arch=sm_xx' to your target.
warnings.warn("Try specifying cuda arch by adding 'arch=sm_xx' to your target.")
diff --git a/docs/_sources/tutorial/sg_execution_times.rst.txt b/docs/_sources/tutorial/sg_execution_times.rst.txt
index 78c596cc35..524c34684f 100644
--- a/docs/_sources/tutorial/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorial/sg_execution_times.rst.txt
@@ -5,24 +5,24 @@
Computation times
=================
-**14:42.507** total execution time for **tutorial** files:
+**13:41.670** total execution time for **tutorial** files:
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``) | 11:16.323 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``) | 10:24.624 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 01:22.343 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 01:28.413 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``) | 01:01.274 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``) | 00:59.103 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``) | 00:33.566 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``) | 00:33.920 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``) | 00:27.458 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``) | 00:14.012 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``) | 00:00.758 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``) | 00:00.767 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``) | 00:00.630 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``) | 00:00.665 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.148 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.158 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorial_introduction.py` (``introduction.py``) | 00:00.005 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
index 427dfd285f..e7fd6d9fe1 100644
--- a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
+++ b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
@@ -294,7 +294,7 @@ helper function to run a profile of the TVM generated code.
.. code-block:: none
- Numpy running time: 0.000008
+ Numpy running time: 0.000007
naive: 0.000007
@@ -499,10 +499,10 @@ We can now compare the different schedules
.. code-block:: none
Operator Timing Performance
- numpy 7.5780500037581075e-06 1.0
- naive 6.682e-06 0.8817571798399669
- parallel 6.9544e-06 0.9177031025859124
- vector 2.4529899999999997e-05 3.2369672920916495
+ numpy 7.183859997894615e-06 1.0
+ naive 6.7121e-06 0.9343305690766699
+ parallel 7.156800000000001e-06 0.9962332230997619
+ vector 2.4543999999999997e-05 3.416547650871975
@@ -923,7 +923,7 @@ matrix multiplication.
.. code-block:: none
- Numpy running time: 0.018188
+ Numpy running time: 0.018787
@@ -981,7 +981,7 @@ optimizations.
.. code-block:: none
- none: 3.445719
+ none: 3.241931
@@ -1083,7 +1083,7 @@ schedule.
.. code-block:: none
- blocking: 0.292759
+ blocking: 0.309492
@@ -1178,7 +1178,7 @@ already cache friendly from our previous optimizations.
.. code-block:: none
- vectorization: 0.330592
+ vectorization: 0.342133
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
@@ -1251,7 +1251,7 @@ more cache friendly.
.. code-block:: none
- loop permutation: 0.119376
+ loop permutation: 0.132350
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
@@ -1349,7 +1349,7 @@ optimized schedule.
.. code-block:: none
- array packing: 0.110733
+ array packing: 0.109146
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
@@ -1441,7 +1441,7 @@ to `C` when all the block results are ready.
.. code-block:: none
- block caching: 0.110880
+ block caching: 0.110517
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
@@ -1526,7 +1526,7 @@ of thread-level parallelization.
.. code-block:: none
- parallelization: 0.146990
+ parallelization: 0.146629
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
@@ -1606,13 +1606,13 @@ working, we can compare the results.
.. code-block:: none
Operator Timing Performance
- none 3.4457191219000003 1.0
- blocking 0.29275942920000003 0.08496323085050818
- vectorization 0.3305922811 0.09594289882737408
- loop permutation 0.11937611099999998 0.034644759708149085
- array packing 0.1107327744 0.03213633220891811
- block caching 0.1108800838 0.03217908363316036
- parallelization 0.1469895571 0.04265860097701424
+ none 3.2419309578 1.0
+ blocking 0.3094924632 0.09546547018694809
+ vectorization 0.34213314930000005 0.10553375557762473
+ loop permutation 0.13235001230000001 0.04082443889854267
+ array packing 0.1091457088 0.03366688255263374
+ block caching 0.1105170477 0.03408988320189204
+ parallelization 0.1466288206 0.045228853577900824
@@ -1652,11 +1652,6 @@ operations with tunable parameters that allows you to automatically optimize
the computation for specific platforms.
-.. rst-class:: sphx-glr-timing
-
- **Total running time of the script:** ( 1 minutes 1.274 seconds)
-
-
.. _sphx_glr_download_tutorial_tensor_expr_get_started.py:
.. only:: html
diff --git a/docs/commit_hash b/docs/commit_hash
index 0b11976390..2315f1af2f 100644
--- a/docs/commit_hash
+++ b/docs/commit_hash
@@ -1 +1 @@
-bf16b42edb94d016fd03ee68cf664d99c5f97e61
+afbfb7aa7e43732cb716f8e443df696110be6afc
diff --git a/docs/genindex.html b/docs/genindex.html
index 1e663ac34d..1021349aed 100644
--- a/docs/genindex.html
+++ b/docs/genindex.html
@@ -4806,6 +4806,8 @@
<li><a href="reference/api/python/tir.html#tvm.tir.vectorlow">vectorlow() (in module tvm.tir)</a>
</li>
<li><a href="reference/api/python/tir.html#tvm.tir.transform.VerifyMemory">VerifyMemory() (in module tvm.tir.transform)</a>
+</li>
+ <li><a href="reference/api/python/tir.html#tvm.tir.transform.VerifyVTCMLimit">VerifyVTCMLimit() (in module tvm.tir.transform)</a>
</li>
<li><a href="reference/api/python/autotvm.html#tvm.autotvm.task.space.VirtualAxis">VirtualAxis (class in tvm.autotvm.task.space)</a>
</li>
diff --git a/docs/how_to/compile_models/from_darknet.html b/docs/how_to/compile_models/from_darknet.html
index badd2de71e..16712313a2 100644
--- a/docs/how_to/compile_models/from_darknet.html
+++ b/docs/how_to/compile_models/from_darknet.html
@@ -585,7 +585,7 @@ class:['truck 0.9266'] left:471 top:83 right:689 bottom:169
class:['bicycle 0.9984'] left:111 top:113 right:577 bottom:447
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 13.849 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 13.889 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-darknet-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7716f96385bd5abb6e822041e285be54/from_darknet.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_darknet.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/from_keras.html b/docs/how_to/compile_models/from_keras.html
index 193696ef31..b1bb52252b 100644
--- a/docs/how_to/compile_models/from_keras.html
+++ b/docs/how_to/compile_models/from_keras.html
@@ -506,7 +506,7 @@ pip install -U tensorflow --user
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Relay top-1 id: 285, class name: Egyptian cat
1/1 [==============================] - ETA: 0s
-1/1 [==============================] - 1s 942ms/step
+1/1 [==============================] - 1s 958ms/step
Keras top-1 id: 285, class name: Egyptian cat
</pre></div>
</div>
diff --git a/docs/how_to/compile_models/from_mxnet.html b/docs/how_to/compile_models/from_mxnet.html
index c250ecc1a4..2a2d324b0d 100644
--- a/docs/how_to/compile_models/from_mxnet.html
+++ b/docs/how_to/compile_models/from_mxnet.html
@@ -440,7 +440,7 @@ to download the full example code</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"x"</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#tuple" title="builtins.tuple" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">x</span><span class="o">.</span><span class="n">shape</span></a><span class="p">)</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip4f6ee52a-43e5-4081-8d15-a6bd47f9b9e7 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip410ebc78-fa41-484a-a007-b6e78027fcf0 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
x (1, 3, 224, 224)
</pre></div>
</div>
diff --git a/docs/how_to/compile_models/from_oneflow.html b/docs/how_to/compile_models/from_oneflow.html
index d1a5291e42..7d6ea7861d 100644
--- a/docs/how_to/compile_models/from_oneflow.html
+++ b/docs/how_to/compile_models/from_oneflow.html
@@ -448,12 +448,12 @@ Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdo
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
0%| | 0.00/41.5M [00:00<?, ?B/s]
- 19%|#9 | 7.99M/41.5M [00:00<00:00, 44.6MB/s]
- 39%|###8 | 16.0M/41.5M [00:00<00:00, 50.2MB/s]
- 54%|#####3 | 22.3M/41.5M [00:00<00:00, 52.3MB/s]
- 66%|######6 | 27.4M/41.5M [00:00<00:00, 47.6MB/s]
- 82%|########2 | 34.1M/41.5M [00:00<00:00, 41.7MB/s]
-100%|##########| 41.5M/41.5M [00:00<00:00, 50.3MB/s]
+ 20%|#9 | 8.12M/41.5M [00:00<00:00, 85.0MB/s]
+ 39%|###9 | 16.2M/41.5M [00:00<00:00, 48.9MB/s]
+ 63%|######2 | 26.1M/41.5M [00:00<00:00, 45.9MB/s]
+ 77%|#######7 | 32.0M/41.5M [00:00<00:00, 46.4MB/s]
+ 92%|#########2| 38.3M/41.5M [00:00<00:00, 36.8MB/s]
+100%|##########| 41.5M/41.5M [00:01<00:00, 41.1MB/s]
</pre></div>
</div>
</div>
diff --git a/docs/how_to/compile_models/from_pytorch.html b/docs/how_to/compile_models/from_pytorch.html
index 70aa12eb7c..5d97abd407 100644
--- a/docs/how_to/compile_models/from_pytorch.html
+++ b/docs/how_to/compile_models/from_pytorch.html
@@ -431,13 +431,12 @@ be unstable.</p>
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
0%| | 0.00/44.7M [00:00<?, ?B/s]
- 16%|#5 | 7.12M/44.7M [00:00<00:00, 74.7MB/s]
- 32%|###2 | 14.3M/44.7M [00:00<00:00, 59.8MB/s]
- 45%|####5 | 20.2M/44.7M [00:00<00:00, 56.3MB/s]
- 57%|#####7 | 25.6M/44.7M [00:00<00:00, 52.4MB/s]
- 72%|#######1 | 32.0M/44.7M [00:00<00:00, 49.5MB/s]
- 90%|########9 | 40.0M/44.7M [00:00<00:00, 50.1MB/s]
-100%|##########| 44.7M/44.7M [00:00<00:00, 55.8MB/s]
+ 18%|#7 | 7.99M/44.7M [00:00<00:00, 72.6MB/s]
+ 36%|###6 | 16.1M/44.7M [00:00<00:00, 79.5MB/s]
+ 54%|#####3 | 24.0M/44.7M [00:00<00:00, 80.7MB/s]
+ 76%|#######5 | 33.8M/44.7M [00:00<00:00, 89.2MB/s]
+ 95%|#########4| 42.3M/44.7M [00:00<00:00, 87.1MB/s]
+100%|##########| 44.7M/44.7M [00:00<00:00, 87.8MB/s]
</pre></div>
</div>
</div>
diff --git a/docs/how_to/compile_models/from_tensorflow.html b/docs/how_to/compile_models/from_tensorflow.html
index 9b384e1050..d3aeba76d8 100644
--- a/docs/how_to/compile_models/from_tensorflow.html
+++ b/docs/how_to/compile_models/from_tensorflow.html
@@ -645,7 +645,7 @@ banana (score = 0.00022)
desk (score = 0.00019)
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 10.537 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 11.783 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-tensorflow-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7f1d3d1b878694c201c614c807cdebc8/from_tensorflow.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_tensorflow.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/sg_execution_times.html b/docs/how_to/compile_models/sg_execution_times.html
index 6903f50e48..d4b1a37f58 100644
--- a/docs/how_to/compile_models/sg_execution_times.html
+++ b/docs/how_to/compile_models/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-compile-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>05:42.910</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
+<p><strong>05:47.464</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 81%" />
@@ -349,43 +349,43 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="from_darknet.html#sphx-glr-how-to-compile-models-from-darknet-py"><span class="std std-ref">Compile YOLO-V2 and YOLO-V3 in DarkNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_darknet.py</span></code>)</p></td>
-<td><p>01:13.850</p></td>
+<td><p>01:13.889</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_tensorflow.html#sphx-glr-how-to-compile-models-from-tensorflow-py"><span class="std std-ref">Compile Tensorflow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tensorflow.py</span></code>)</p></td>
-<td><p>01:10.537</p></td>
+<td><p>01:11.783</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_paddle.html#sphx-glr-how-to-compile-models-from-paddle-py"><span class="std std-ref">Compile PaddlePaddle Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_paddle.py</span></code>)</p></td>
-<td><p>00:45.743</p></td>
+<td><p>00:47.088</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_oneflow.html#sphx-glr-how-to-compile-models-from-oneflow-py"><span class="std std-ref">Compile OneFlow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_oneflow.py</span></code>)</p></td>
-<td><p>00:31.118</p></td>
+<td><p>00:32.668</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></td>
-<td><p>00:28.196</p></td>
+<td><p>00:27.836</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_coreml.html#sphx-glr-how-to-compile-models-from-coreml-py"><span class="std std-ref">Compile CoreML Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_coreml.py</span></code>)</p></td>
-<td><p>00:26.570</p></td>
+<td><p>00:26.782</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_tflite.html#sphx-glr-how-to-compile-models-from-tflite-py"><span class="std std-ref">Compile TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tflite.py</span></code>)</p></td>
-<td><p>00:24.682</p></td>
+<td><p>00:24.676</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_pytorch.html#sphx-glr-how-to-compile-models-from-pytorch-py"><span class="std std-ref">Compile PyTorch Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_pytorch.py</span></code>)</p></td>
-<td><p>00:22.384</p></td>
+<td><p>00:22.396</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_keras.html#sphx-glr-how-to-compile-models-from-keras-py"><span class="std std-ref">Compile Keras Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_keras.py</span></code>)</p></td>
-<td><p>00:17.432</p></td>
+<td><p>00:17.860</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_onnx.html#sphx-glr-how-to-compile-models-from-onnx-py"><span class="std std-ref">Compile ONNX Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_onnx.py</span></code>)</p></td>
-<td><p>00:02.400</p></td>
+<td><p>00:02.486</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/deploy_models/deploy_model_on_adreno.html b/docs/how_to/deploy_models/deploy_model_on_adreno.html
index 3c683e1a0e..e9db92d948 100644
--- a/docs/how_to/deploy_models/deploy_model_on_adreno.html
+++ b/docs/how_to/deploy_models/deploy_model_on_adreno.html
@@ -919,10 +919,9 @@ Top5 predictions:
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 3348.0071 3346.9349 3355.5216 3344.4685 3.2114
+ 2759.1538 2758.6797 2762.3322 2756.9490 1.7848
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 1.045 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-model-on-adreno-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/2387d8448da213eb625e6b3d916327d4/deploy_model_on_adreno.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_model_on_adreno.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_model_on_android.html b/docs/how_to/deploy_models/deploy_model_on_android.html
index dca8fa92d6..ce8e4794cb 100644
--- a/docs/how_to/deploy_models/deploy_model_on_android.html
+++ b/docs/how_to/deploy_models/deploy_model_on_android.html
@@ -661,7 +661,7 @@ to the remote android device.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 16.0523 16.0353 16.3517 15.9120 0.1204
+ 16.3971 16.2932 17.4048 15.9455 0.4637
</pre></div>
</div>
</div>
diff --git a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
index 13d4230c7b..aefb6f8d50 100644
--- a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
+++ b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
@@ -453,30 +453,23 @@ be unstable.</p>
Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
0%| | 0.00/170M [00:00<?, ?B/s]
- 5%|4 | 7.99M/170M [00:00<00:03, 54.8MB/s]
- 9%|9 | 16.0M/170M [00:00<00:02, 56.4MB/s]
- 13%|#3 | 22.3M/170M [00:00<00:02, 57.4MB/s]
- 16%|#6 | 27.8M/170M [00:00<00:02, 51.8MB/s]
- 19%|#9 | 32.8M/170M [00:00<00:03, 43.8MB/s]
- 24%|##3 | 40.0M/170M [00:00<00:02, 51.8MB/s]
- 27%|##7 | 46.3M/170M [00:00<00:02, 55.6MB/s]
- 31%|### | 51.8M/170M [00:01<00:02, 53.6MB/s]
- 34%|###3 | 57.1M/170M [00:01<00:02, 53.4MB/s]
- 38%|###8 | 65.2M/170M [00:01<00:01, 62.1MB/s]
- 42%|####2 | 72.0M/170M [00:01<00:01, 57.7MB/s]
- 48%|####8 | 82.2M/170M [00:01<00:01, 70.7MB/s]
- 53%|#####2 | 89.2M/170M [00:01<00:01, 67.1MB/s]
- 57%|#####6 | 96.0M/170M [00:01<00:01, 51.0MB/s]
- 61%|######1 | 104M/170M [00:01<00:01, 56.3MB/s]
- 66%|######5 | 112M/170M [00:02<00:01, 59.4MB/s]
- 71%|####### | 120M/170M [00:02<00:00, 64.9MB/s]
- 75%|#######5 | 128M/170M [00:02<00:00, 64.5MB/s]
- 80%|######## | 136M/170M [00:02<00:00, 62.2MB/s]
- 85%|########4 | 144M/170M [00:02<00:00, 64.9MB/s]
- 89%|########9 | 152M/170M [00:02<00:00, 63.2MB/s]
- 94%|#########4| 160M/170M [00:02<00:00, 64.2MB/s]
- 99%|#########8| 168M/170M [00:02<00:00, 66.7MB/s]
-100%|##########| 170M/170M [00:02<00:00, 60.3MB/s]
+ 5%|4 | 7.99M/170M [00:00<00:02, 77.7MB/s]
+ 9%|9 | 16.1M/170M [00:00<00:01, 82.1MB/s]
+ 14%|#4 | 24.4M/170M [00:00<00:01, 84.0MB/s]
+ 19%|#9 | 32.4M/170M [00:00<00:02, 66.5MB/s]
+ 26%|##5 | 43.7M/170M [00:00<00:01, 82.5MB/s]
+ 35%|###4 | 58.6M/170M [00:00<00:01, 103MB/s]
+ 41%|#### | 69.0M/170M [00:00<00:01, 94.8MB/s]
+ 49%|####8 | 82.5M/170M [00:00<00:00, 108MB/s]
+ 55%|#####4 | 93.2M/170M [00:01<00:00, 105MB/s]
+ 61%|###### | 104M/170M [00:01<00:00, 105MB/s]
+ 67%|######6 | 114M/170M [00:01<00:00, 105MB/s]
+ 75%|#######5 | 128M/170M [00:01<00:00, 99.5MB/s]
+ 81%|########1 | 138M/170M [00:01<00:00, 99.3MB/s]
+ 87%|########6 | 147M/170M [00:01<00:00, 85.8MB/s]
+ 92%|#########1| 156M/170M [00:01<00:00, 82.9MB/s]
+ 97%|#########6| 164M/170M [00:01<00:00, 66.8MB/s]
+100%|##########| 170M/170M [00:02<00:00, 88.1MB/s]
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/nn/functional.py:3897: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
for i in range(dim)
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/detection/anchor_utils.py:124: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode=& [...]
@@ -574,7 +567,7 @@ torchvision rcnn models.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Get 9 valid boxes
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 16.530 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 18.669 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-object-detection-pytorch-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7795da4b258c8feff986668b95ef57ad/deploy_object_detection_pytorch.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_object_detection_pytorch.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized.html b/docs/how_to/deploy_models/deploy_prequantized.html
index 0dd0e2cef0..b69a8f5876 100644
--- a/docs/how_to/deploy_models/deploy_prequantized.html
+++ b/docs/how_to/deploy_models/deploy_prequantized.html
@@ -497,8 +497,8 @@ training. Other models require a full post training calibration.</p>
Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
0%| | 0.00/13.6M [00:00<?, ?B/s]
- 59%|#####8 | 7.99M/13.6M [00:00<00:00, 57.7MB/s]
-100%|##########| 13.6M/13.6M [00:00<00:00, 58.6MB/s]
+ 74%|#######4 | 10.1M/13.6M [00:00<00:00, 106MB/s]
+100%|##########| 13.6M/13.6M [00:00<00:00, 89.6MB/s]
</pre></div>
</div>
</div>
@@ -589,7 +589,7 @@ output values are identical out of 1000 outputs from mobilenet v2.</p>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 90.2720 90.1835 92.6822 90.0486 0.3489
+ 90.5629 90.4978 93.3052 90.1424 0.3764
</pre></div>
</div>
<div class="admonition note">
@@ -628,7 +628,7 @@ This includes support for the VNNI 8 bit dot product instruction (CascadeLake or
<div class="section" id="deploy-a-quantized-tflite-model">
<h2>Deploy a quantized TFLite Model<a class="headerlink" href="#deploy-a-quantized-tflite-model" title="Permalink to this headline">¶</a></h2>
<p>TODO</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 5.994 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 6.934 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/fb8217c13f4351224c6cf3aacf1a87fc/deploy_prequantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized_tflite.html b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
index 79f91454ee..4d97695723 100644
--- a/docs/how_to/deploy_models/deploy_prequantized_tflite.html
+++ b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
@@ -582,7 +582,7 @@ TFLite Top-5 labels: [387 102 386 341 349]
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 120.0999 119.9757 126.8035 119.1603 0.9080
+ 120.3317 120.1745 124.9000 119.0818 0.8623
</pre></div>
</div>
<div class="admonition note">
@@ -610,7 +610,7 @@ network for ARM CPU</span></a>.</p></li>
</ul>
</div></blockquote>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 22.706 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 22.579 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-tflite-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/56691c7a27d45da61d112276334640d3/deploy_prequantized_tflite.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized_tflite.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_quantized.html b/docs/how_to/deploy_models/deploy_quantized.html
index 27f5dc10e2..9e51767cb1 100644
--- a/docs/how_to/deploy_models/deploy_quantized.html
+++ b/docs/how_to/deploy_models/deploy_quantized.html
@@ -520,7 +520,7 @@ for calibration. But the accuracy might be impacted.</p>
DeprecationWarning,
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 35.278 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 34.966 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-quantized-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7810ecf51bfc05f7d5e8a400ac3e815d/deploy_quantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_quantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
index 83efc0433c..7de5549a71 100644
--- a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
+++ b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
@@ -462,24 +462,25 @@ to your device.</p>
Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
0%| | 0/132723 [00:00<?, ?KB/s]
- 5%|4 | 6201/132723 [00:00<00:02, 62004.89KB/s]
- 10%|# | 13858/132723 [00:00<00:01, 70545.93KB/s]
- 16%|#5 | 20913/132723 [00:00<00:02, 49584.40KB/s]
- 21%|##1 | 28461/132723 [00:00<00:01, 57721.10KB/s]
- 27%|##7 | 35957/132723 [00:00<00:01, 63061.21KB/s]
- 33%|###2 | 43478/132723 [00:00<00:01, 66786.23KB/s]
- 38%|###8 | 51085/132723 [00:00<00:01, 69612.52KB/s]
- 44%|####4 | 58678/132723 [00:00<00:01, 71526.17KB/s]
- 50%|####9 | 66229/132723 [00:00<00:00, 72727.22KB/s]
- 56%|#####5 | 73767/132723 [00:01<00:00, 73523.38KB/s]
- 61%|######1 | 81449/132723 [00:01<00:00, 74510.35KB/s]
- 67%|######7 | 89153/132723 [00:01<00:00, 75264.21KB/s]
- 73%|#######2 | 96785/132723 [00:01<00:00, 75570.84KB/s]
- 79%|#######8 | 104499/132723 [00:01<00:00, 76033.41KB/s]
- 85%|########4 | 112194/132723 [00:01<00:00, 76306.90KB/s]
- 90%|######### | 119841/132723 [00:01<00:00, 75937.68KB/s]
- 96%|#########6| 127449/132723 [00:01<00:00, 75977.86KB/s]
-100%|##########| 132723/132723 [00:01<00:00, 71249.89KB/s]
+ 4%|4 | 5781/132723 [00:00<00:02, 57801.10KB/s]
+ 10%|# | 13645/132723 [00:00<00:01, 70049.01KB/s]
+ 16%|#5 | 20650/132723 [00:00<00:01, 61793.77KB/s]
+ 21%|##1 | 28503/132723 [00:00<00:01, 67903.95KB/s]
+ 27%|##7 | 36334/132723 [00:00<00:01, 71470.53KB/s]
+ 33%|###2 | 43576/132723 [00:01<00:04, 20581.95KB/s]
+ 37%|###7 | 49140/132723 [00:01<00:03, 23410.10KB/s]
+ 42%|####2 | 55772/132723 [00:01<00:02, 29341.70KB/s]
+ 48%|####7 | 63166/132723 [00:01<00:01, 36695.89KB/s]
+ 52%|#####2 | 69102/132723 [00:01<00:01, 40470.44KB/s]
+ 58%|#####7 | 76351/132723 [00:01<00:01, 42878.79KB/s]
+ 62%|######1 | 81905/132723 [00:02<00:01, 45231.81KB/s]
+ 68%|######7 | 89612/132723 [00:02<00:00, 52676.35KB/s]
+ 74%|#######3 | 97674/132723 [00:02<00:00, 59672.59KB/s]
+ 80%|#######9 | 105559/132723 [00:02<00:00, 64723.86KB/s]
+ 86%|########5 | 113506/132723 [00:02<00:00, 68753.44KB/s]
+ 92%|#########1| 121466/132723 [00:02<00:00, 71799.03KB/s]
+ 98%|#########7| 129408/132723 [00:02<00:00, 73975.47KB/s]
+100%|##########| 132723/132723 [00:02<00:00, 48532.86KB/s]
</pre></div>
</div>
<p>Create TVM runtime and do inference
@@ -518,7 +519,7 @@ Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from h
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 0.344 seconds)</p>
+<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 3.566 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-ssd-gluoncv-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/cccb17d28e5e8b2e94ea8cd5ec59f6ed/deploy_ssd_gluoncv.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_ssd_gluoncv.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/sg_execution_times.html b/docs/how_to/deploy_models/sg_execution_times.html
index 3039a01976..77f327f709 100644
--- a/docs/how_to/deploy_models/sg_execution_times.html
+++ b/docs/how_to/deploy_models/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-deploy-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>13:47.084</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
+<p><strong>13:47.636</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 86%" />
@@ -349,43 +349,43 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_object_detection_pytorch.html#sphx-glr-how-to-deploy-models-deploy-object-detection-pytorch-py"><span class="std std-ref">Compile PyTorch Object Detection Models</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_object_detection_pytorch.py</span></code>)</p></td>
-<td><p>03:16.530</p></td>
+<td><p>03:18.669</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_ssd_gluoncv.html#sphx-glr-how-to-deploy-models-deploy-ssd-gluoncv-py"><span class="std std-ref">Deploy Single Shot Multibox Detector(SSD) model</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_ssd_gluoncv.py</span></code>)</p></td>
-<td><p>03:00.344</p></td>
+<td><p>03:03.566</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized_tflite.html#sphx-glr-how-to-deploy-models-deploy-prequantized-tflite-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite)</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized_tflite.py</span></code>)</p></td>
-<td><p>02:22.706</p></td>
+<td><p>02:22.579</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_quantized.html#sphx-glr-how-to-deploy-models-deploy-quantized-py"><span class="std std-ref">Deploy a Quantized Model on Cuda</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_quantized.py</span></code>)</p></td>
-<td><p>01:35.278</p></td>
+<td><p>01:34.966</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized.html#sphx-glr-how-to-deploy-models-deploy-prequantized-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized.py</span></code>)</p></td>
-<td><p>01:05.994</p></td>
+<td><p>01:06.934</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_adreno.html#sphx-glr-how-to-deploy-models-deploy-model-on-adreno-py"><span class="std std-ref">Deploy the Pretrained Model on Adreno</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_adreno.py</span></code>)</p></td>
-<td><p>01:01.045</p></td>
+<td><p>00:54.349</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_model_on_android.html#sphx-glr-how-to-deploy-models-deploy-model-on-android-py"><span class="std std-ref">Deploy the Pretrained Model on Android</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_android.py</span></code>)</p></td>
-<td><p>00:35.542</p></td>
+<td><p>00:36.467</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_nano.html#sphx-glr-how-to-deploy-models-deploy-model-on-nano-py"><span class="std std-ref">Deploy the Pretrained Model on Jetson Nano</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_nano.py</span></code>)</p></td>
-<td><p>00:25.075</p></td>
+<td><p>00:25.309</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_model_on_rasp.html#sphx-glr-how-to-deploy-models-deploy-model-on-rasp-py"><span class="std std-ref">Deploy the Pretrained Model on Raspberry Pi</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_rasp.py</span></code>)</p></td>
-<td><p>00:24.564</p></td>
+<td><p>00:24.791</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_sparse.html#sphx-glr-how-to-deploy-models-deploy-sparse-py"><span class="std std-ref">Deploy a Hugging Face Pruned Model on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_sparse.py</span></code>)</p></td>
-<td><p>00:00.006</p></td>
+<td><p>00:00.007</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/extend_tvm/bring_your_own_datatypes.html b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
index 3e078f4ace..1962883951 100644
--- a/docs/how_to/extend_tvm/bring_your_own_datatypes.html
+++ b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
@@ -621,7 +621,7 @@ In this alpha state of the Bring Your Own Datatypes framework, we have not imple
<span class="n">module</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#dict" title="builtins.dict" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">params</span></a> <span class="o">=</span> <span class="n">get_mobilenet</span><span class="p">()</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip89771a59-08c8-4305-8923-fd45f2a3d2ef from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip470f7e27-c603-4af7-aee9-57c5b98a0281 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
</pre></div>
</div>
<p>It’s easy to execute MobileNet with native TVM:</p>
diff --git a/docs/how_to/extend_tvm/sg_execution_times.html b/docs/how_to/extend_tvm/sg_execution_times.html
index 4838a6497b..828f778dd4 100644
--- a/docs/how_to/extend_tvm/sg_execution_times.html
+++ b/docs/how_to/extend_tvm/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-extend-tvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:47.078</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
+<p><strong>00:48.070</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 84%" />
@@ -349,11 +349,11 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="bring_your_own_datatypes.html#sphx-glr-how-to-extend-tvm-bring-your-own-datatypes-py"><span class="std std-ref">Bring Your Own Datatypes to TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">bring_your_own_datatypes.py</span></code>)</p></td>
-<td><p>00:43.618</p></td>
+<td><p>00:44.559</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="use_pass_instrument.html#sphx-glr-how-to-extend-tvm-use-pass-instrument-py"><span class="std std-ref">How to Use TVM Pass Instrument</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_instrument.py</span></code>)</p></td>
-<td><p>00:02.416</p></td>
+<td><p>00:02.467</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="use_pass_infra.html#sphx-glr-how-to-extend-tvm-use-pass-infra-py"><span class="std std-ref">How to Use TVM Pass Infra</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_infra.py</span></code>)</p></td>
diff --git a/docs/how_to/extend_tvm/use_pass_instrument.html b/docs/how_to/extend_tvm/use_pass_instrument.html
index c3dcd4fb87..7c5dfcb596 100644
--- a/docs/how_to/extend_tvm/use_pass_instrument.html
+++ b/docs/how_to/extend_tvm/use_pass_instrument.html
@@ -525,10 +525,10 @@ profile the execution time of each passes.</p>
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 7197us [7197us] (46.55%; 46.55%)
-FoldScaleAxis: 8264us [7us] (53.45%; 53.45%)
- FoldConstant: 8258us [1692us] (53.41%; 99.92%)
- InferType: 6566us [6566us] (42.47%; 79.51%)
+InferType: 7278us [7278us] (46.51%; 46.51%)
+FoldScaleAxis: 8371us [7us] (53.49%; 53.49%)
+ FoldConstant: 8364us [1711us] (53.44%; 99.91%)
+ InferType: 6653us [6653us] (42.51%; 79.54%)
</pre></div>
</div>
</div>
@@ -550,10 +550,10 @@ Refer to following sections and <a class="reference internal" href="../../refere
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 6591us [6591us] (44.98%; 44.98%)
-FoldScaleAxis: 8062us [5us] (55.02%; 55.02%)
- FoldConstant: 8057us [1661us] (54.99%; 99.94%)
- InferType: 6396us [6396us] (43.65%; 79.38%)
+InferType: 6722us [6722us] (44.87%; 44.87%)
+FoldScaleAxis: 8259us [5us] (55.13%; 55.13%)
+ FoldConstant: 8253us [1693us] (55.09%; 99.94%)
+ InferType: 6561us [6561us] (43.80%; 79.49%)
</pre></div>
</div>
<p>Register empty list to clear existing instruments.</p>
diff --git a/docs/how_to/optimize_operators/opt_conv_cuda.html b/docs/how_to/optimize_operators/opt_conv_cuda.html
index 03a0a3d95c..2f3108bf57 100644
--- a/docs/how_to/optimize_operators/opt_conv_cuda.html
+++ b/docs/how_to/optimize_operators/opt_conv_cuda.html
@@ -577,7 +577,7 @@ latency of convolution.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Convolution: </span><span class="si">%f</span><span class="s2"> ms"</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">b</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">*</span> <span cl [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 54.383968 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 43.233119 ms
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-optimize-operators-opt-conv-cuda-py">
diff --git a/docs/how_to/optimize_operators/opt_conv_tensorcore.html b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
index 08bef3175f..bb011ffb69 100644
--- a/docs/how_to/optimize_operators/opt_conv_tensorcore.html
+++ b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
@@ -914,7 +914,7 @@ be able to run on our build server</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"conv2d with tensor core: </span><span class="si">%f</span><span class="s2"> ms"</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">* [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 12.437085 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 13.369456 ms
</pre></div>
</div>
</div>
diff --git a/docs/how_to/optimize_operators/opt_gemm.html b/docs/how_to/optimize_operators/opt_gemm.html
index 5c89f36e97..826dac2716 100644
--- a/docs/how_to/optimize_operators/opt_gemm.html
+++ b/docs/how_to/optimize_operators/opt_gemm.html
@@ -474,8 +474,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
<span class="nb">print</span><span class="p">(</span><span class="s2">"Baseline: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018901
-Baseline: 3.340735
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018572
+Baseline: 3.238095
</pre></div>
</div>
<p>In TVM, we can always inspect lower level IR to debug or optimize our schedule.
@@ -534,7 +534,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt1: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.302373
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.298859
</pre></div>
</div>
<p>Here is the generated IR after blocking.</p>
@@ -600,7 +600,7 @@ vastly.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt2: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.343603
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.344994
</pre></div>
</div>
<p>Here is the generated IR after vectorization.</p>
@@ -660,7 +660,7 @@ the access pattern for A matrix is more cache friendly.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt3: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.117663
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.116641
</pre></div>
</div>
<p>Here is the generated IR after loop permutation.</p>
@@ -742,7 +742,7 @@ flattening.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt4: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.109533
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.109901
</pre></div>
</div>
<p>Here is the generated IR after array packing.</p>
@@ -827,7 +827,7 @@ write to C when all the block results are ready.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt5: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.110918
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.110611
</pre></div>
</div>
<p>Here is the generated IR after blocking.</p>
@@ -916,7 +916,7 @@ write to C when all the block results are ready.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt6: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">opt6_time</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.146906
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.147471
</pre></div>
</div>
<p>Here is the generated IR after parallelization.</p>
diff --git a/docs/how_to/optimize_operators/sg_execution_times.html b/docs/how_to/optimize_operators/sg_execution_times.html
index 452455db8d..124eeb11dd 100644
--- a/docs/how_to/optimize_operators/sg_execution_times.html
+++ b/docs/how_to/optimize_operators/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-optimize-operators-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:34.828</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
+<p><strong>00:34.659</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -349,15 +349,15 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="opt_gemm.html#sphx-glr-how-to-optimize-operators-opt-gemm-py"><span class="std std-ref">How to optimize GEMM on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_gemm.py</span></code>)</p></td>
-<td><p>00:32.278</p></td>
+<td><p>00:31.940</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="opt_conv_tensorcore.html#sphx-glr-how-to-optimize-operators-opt-conv-tensorcore-py"><span class="std std-ref">How to optimize convolution using TensorCores</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_tensorcore.py</span></code>)</p></td>
-<td><p>00:01.482</p></td>
+<td><p>00:01.560</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="opt_conv_cuda.html#sphx-glr-how-to-optimize-operators-opt-conv-cuda-py"><span class="std std-ref">How to optimize convolution on GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_cuda.py</span></code>)</p></td>
-<td><p>00:01.068</p></td>
+<td><p>00:01.159</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
index 70130c8242..a9f843487a 100644
--- a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
+++ b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-tune-with-autoscheduler-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>08:51.166</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
+<p><strong>08:56.904</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 85%" />
@@ -349,27 +349,27 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_layer_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py"><span class="std std-ref">Auto-scheduling a Convolution Layer for GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_layer_cuda.py</span></code>)</p></td>
-<td><p>05:29.345</p></td>
+<td><p>05:32.158</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_network_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-x86-py"><span class="std std-ref">Auto-scheduling a Neural Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_x86.py</span></code>)</p></td>
-<td><p>01:31.730</p></td>
+<td><p>01:32.047</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_network_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-cuda-py"><span class="std std-ref">Auto-scheduling a Neural Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_cuda.py</span></code>)</p></td>
-<td><p>01:00.346</p></td>
+<td><p>01:01.196</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_sparse_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-sparse-x86-py"><span class="std std-ref">Auto-scheduling Sparse Matrix Multiplication on CPU with Custom Sketch Rule</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_sparse_x86.py</span></code>)</p></td>
-<td><p>00:26.583</p></td>
+<td><p>00:28.031</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_network_arm.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-arm-py"><span class="std std-ref">Auto-scheduling a Neural Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_arm.py</span></code>)</p></td>
-<td><p>00:11.939</p></td>
+<td><p>00:12.104</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_network_mali.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-mali-py"><span class="std std-ref">Auto-scheduling a Neural Network for mali GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_mali.py</span></code>)</p></td>
-<td><p>00:11.223</p></td>
+<td><p>00:11.369</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
index 281cd9ad0e..60bb02d218 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
@@ -503,11 +503,11 @@ cooperative fetching, unrolling and operator fusion.</p>
bias: Buffer(bias_2: Pointer(float32), float32, [1, 512, 1, 1], []),
compute: Buffer(compute_2: Pointer(float32), float32, [1, 512, 7, 7], [])}
buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute} {
- attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 28;
+ attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 16;
allocate(conv2d_nchw: Pointer(local float32), float32, [14]), storage_scope = local;
- allocate(pad_temp.shared: Pointer(shared float32), float32, [72]), storage_scope = shared;
- allocate(kernel.shared: Pointer(shared float32), float32, [3072]), storage_scope = shared;
- attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
+ allocate(pad_temp.shared: Pointer(shared float32), float32, [1296]), storage_scope = shared;
+ allocate(kernel.shared: Pointer(shared float32), float32, [4608]), storage_scope = shared;
+ attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112 {
conv2d_nchw_1: Buffer(conv2d_nchw, float32, [14], [], scope="local", align=32)[0] = 0f32
conv2d_nchw_1[1] = 0f32
conv2d_nchw_1[2] = 0f32
@@ -522,463 +522,381 @@ cooperative fetching, unrolling and operator fusion.</p>
conv2d_nchw_1[11] = 0f32
conv2d_nchw_1[12] = 0f32
conv2d_nchw_1[13] = 0f32
- for (rc.outer.outer: int32, 0, 64) {
- for (ry.outer.outer: int32, 0, 3) {
- let cse_var_2: int32 = (rc.outer.outer*72)
- let cse_var_1: int32 = (ry.outer.outer*3)
- {
- attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1: Buffer(pad_temp.shared, float32, [72], [], scope="shared")[(threadIdx.x_1*4)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1*4), 9))) && (floormod((threadIdx.x_1*4), 9) < 8)), data_3: Buffer(data_2, float32, [25088], [])[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1*4), 9)*49)) + (ry.outer.out [...]
- }
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1[((threadIdx.x_1*4) + 1)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 1), 9))) && (floormod(((threadIdx.x_1*4) + 1), 9) < 8)), data_3[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 1), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 1), 9)) - 8)], [...]
- }
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1[((threadIdx.x_1*4) + 2)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 2), 9))) && (floormod(((threadIdx.x_1*4) + 2), 9) < 8)), data_3[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 2), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 2), 9)) - 8)], [...]
- }
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1[((threadIdx.x_1*4) + 3)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 3), 9))) && (floormod(((threadIdx.x_1*4) + 3), 9) < 8)), data_3[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 3), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 3), 9)) - 8)], [...]
- }
- }
- attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1: Buffer(kernel.shared, float32, [3072], [], scope="shared")[threadIdx.x_2] = kernel_3: Buffer(kernel_2, float32, [2359296], [])[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 64)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 64), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 128)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 128), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 192)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 36864)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 256)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 256), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 320)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 320), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 384)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 73728)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 448)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 448), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 512)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 512), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 576)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 110592)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 640)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 640), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 704)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 704), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 768)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 147456)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 832)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 832), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 896)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 896), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 960)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 184320)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1024)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1024), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1088)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1088), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1152)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 221184)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1216)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1216), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1280)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1280), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 258048)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1408)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1408), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1472)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1472), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1536)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 294912)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1600)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1600), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1664)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1664), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1728)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 331776)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1792)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1792), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1856)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1856), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1920)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 368640)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1984)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1984), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2048)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2048), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2112)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 405504)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2176)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2176), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2240)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2240), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2304)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 442368)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2368)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2368), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2432)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2432), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2496)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 479232)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2560)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2560), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2624)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2624), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2688)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 516096)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2752)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2752), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2816)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2816), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2880)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 552960)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2944)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2944), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 3008)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 3008), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[0]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[1]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[2]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[3]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[4]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[5]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[6]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[0]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+ for (rc.outer.outer: int32, 0, 32) {
+ let cse_var_2: int32 = (rc.outer.outer*784)
+ let cse_var_1: int32 = (rc.outer.outer*144)
+ {
+ attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1: Buffer(pad_temp.shared, float32, [1296], [], scope="shared")[threadIdx.x_1] = @tir.if_then_else(((((9 <= floormod(threadIdx.x_1, 81)) && (floormod(threadIdx.x_1, 81) < 72)) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data_3: Buffer(data_2, float32, [25088], [])[((((cse_var_2 + (floordiv(threadIdx.x_1, 81)*49)) + (floordiv(floormod(threadIdx.x_1, 81), 9)*7)) + floormod(threadIdx.x_1, 9 [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 112)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 31), 81)) && (floormod((threadIdx.x_1 + 31), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 4), 9))) && (floormod((threadIdx.x_1 + 4), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 112), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 31), 81), 9)*7)) + floormod((threadIdx.x_1 + 4), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 224)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 62), 81)) && (floormod((threadIdx.x_1 + 62), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 8), 9))) && (floormod((threadIdx.x_1 + 8), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 224), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 62), 81), 9)*7)) + floormod((threadIdx.x_1 + 8), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 336)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 12), 81)) && (floormod((threadIdx.x_1 + 12), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 3), 9))) && (floormod((threadIdx.x_1 + 3), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 336), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 12), 81), 9)*7)) + floormod((threadIdx.x_1 + 3), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 448)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 43), 81)) && (floormod((threadIdx.x_1 + 43), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 7), 9))) && (floormod((threadIdx.x_1 + 7), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 448), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 43), 81), 9)*7)) + floormod((threadIdx.x_1 + 7), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 560)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 74), 81)) && (floormod((threadIdx.x_1 + 74), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 2), 9))) && (floormod((threadIdx.x_1 + 2), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 560), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 74), 81), 9)*7)) + floormod((threadIdx.x_1 + 2), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 672)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 24), 81)) && (floormod((threadIdx.x_1 + 24), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 6), 9))) && (floormod((threadIdx.x_1 + 6), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 672), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 24), 81), 9)*7)) + floormod((threadIdx.x_1 + 6), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 784)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 55), 81)) && (floormod((threadIdx.x_1 + 55), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 1), 9))) && (floormod((threadIdx.x_1 + 1), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 784), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 55), 81), 9)*7)) + floormod((threadIdx.x_1 + 1), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 896)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 5), 81)) && (floormod((threadIdx.x_1 + 5), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 5), 9))) && (floormod((threadIdx.x_1 + 5), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 896), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 5), 81), 9)*7)) + floormod((threadIdx.x_1 + 5), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 1008)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 9) + 4), 9)) && (floormod((threadIdx.x_1 + 36), 81) < 72)) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1008), 81)*49)) + (floormod((floordiv(threadIdx.x_1, 9) + 4), 9)*7)) + floormod(threadIdx.x_1, 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ pad_temp.shared_1[(threadIdx.x_1 + 1120)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 67), 81)) && (floormod((threadIdx.x_1 + 67), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 4), 9))) && (floormod((threadIdx.x_1 + 4), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1120), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 67), 81), 9)*7)) + floormod((threadIdx.x_1 + 4), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ if @tir.likely((threadIdx.x_1 < 64), dtype=bool) {
+ pad_temp.shared_1[(threadIdx.x_1 + 1232)] = @tir.if_then_else((((threadIdx.x_1 < 55) && (1 <= floormod((threadIdx.x_1 + 8), 9))) && (floormod((threadIdx.x_1 + 8), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1232), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 17), 81), 9)*7)) + floormod((threadIdx.x_1 + 8), 9)) - 8)], 0f32, dtype=float32)
+ }
+ attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1: Buffer(kernel.shared, float32, [4608], [], scope="shared")[threadIdx.x_2] = kernel_3: Buffer(kernel_2, float32, [2359296], [])[(((blockIdx.x*147456) + cse_var_1) + threadIdx.x_2)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 112)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 112), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 112), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 224)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 224), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 336)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 336), 144)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 16), 48)*3)) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 448)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 448), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 560)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 560), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 128), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 672)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 672), 144)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 32), 48)*3)) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 784)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 784), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 896)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 896), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 1008)] = kernel_3[((((blockIdx.x*147456) + cse_var_1) + threadIdx.x_2) + 32256)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 1120)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1120), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 112), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 1232)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1232), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1344), 144)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 16), 48)*3)) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 1456)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1456), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 1568)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1568), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 128), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 1680)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1680), 144)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 32), 48)*3)) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 1792)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1792), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 1904)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1904), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 2016)] = kernel_3[((((blockIdx.x*147456) + cse_var_1) + threadIdx.x_2) + 64512)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 2128)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2128), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 112), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 2240)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2240), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 2352)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2352), 144)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 16), 48)*3)) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 2464)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2464), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 2576)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2576), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 128), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 2688)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2688), 144)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 32), 48)*3)) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 2800)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2800), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 2912)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2912), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 3024)] = kernel_3[((((blockIdx.x*147456) + cse_var_1) + threadIdx.x_2) + 96768)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 3136)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3136), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 112), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 3248)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3248), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 3360)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3360), 144)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 16), 48)*3)) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 3472)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3472), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 3584)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3584), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 128), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 3696)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3696), 144)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 32), 48)*3)) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 3808)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3808), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 3920)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3920), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 4032)] = kernel_3[((((blockIdx.x*147456) + cse_var_1) + threadIdx.x_2) + 129024)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 4144)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 4144), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 112), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 4256)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 4256), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 4368)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 4368), 144)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 16), 48)*3)) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ kernel.shared_1[(threadIdx.x_2 + 4480)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 4480), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 144), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+ if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
+ kernel.shared_1[(threadIdx.x_2 + 4592)] = kernel_3[(((((blockIdx.x*147456) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 128), 144), 3)*3)) + floormod((threadIdx.x_2 + 2), 3)) + 142848)]
+ }
+ for (rc.outer.inner: int32, 0, 8) {
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((rc.outer.inner*162) + floormod(threadIdx.x, 7))]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 9)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 18)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 27)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 36)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 45)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 54)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[((rc.outer.inner*162) + floormod(threadIdx.x, 7))]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 9)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 18)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 27)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 36)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 45)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 54)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 1)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 10)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 37)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 46)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 55)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 1)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 10)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 37)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 46)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 55)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 29)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 38)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 47)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 56)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 29)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 38)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 47)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 56)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 81)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 90)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 99)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 108)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 117)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 135)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 81)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 90)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 99)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 108)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 117)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 135)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 82)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 109)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 118)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 127)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 136)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 82)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 109)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 118)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 127)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 136)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 83)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 110)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 119)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 128)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 137)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 83)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 110)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 119)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 128)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 137)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 9)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 18)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 27)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 36)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 45)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 54)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 9)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 18)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 27)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 36)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 45)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 54)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 10)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 37)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 46)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 55)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 64)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 10)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 37)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 46)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 55)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 64)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 29)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 38)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 47)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 56)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 65)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 29)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 38)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 47)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 56)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 65)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 90)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 99)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 108)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 117)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 135)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 144)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 90)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 99)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 108)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 117)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 135)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 144)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 109)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 118)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 127)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 136)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 145)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 109)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 118)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 127)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 136)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 145)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 110)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 119)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 128)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 137)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 146)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 110)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 119)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 128)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 137)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 146)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 18)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 27)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 36)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 45)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 54)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 72)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 18)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 27)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 36)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 45)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 54)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 72)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 37)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 46)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 55)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 64)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 73)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 37)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 46)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 55)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 64)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 73)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 29)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 38)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 47)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 56)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 65)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 74)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 29)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 38)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 47)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 56)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 65)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 74)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 99)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 108)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 117)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 135)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 144)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 153)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 99)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 108)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 117)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 135)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 144)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 153)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 109)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 118)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 127)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 136)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 145)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 154)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 109)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 118)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 127)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 136)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 145)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 154)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 110)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 119)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 128)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 137)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 146)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 155)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 110)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 119)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 128)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 137)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 146)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + floormod(threadIdx.x, 7)) + 155)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
}
}
}
for (i1.inner: int32, 0, 2) {
- for (i3.inner: int32, 0, 7) {
- compute_3: Buffer(compute_2, float32, [25088], [])[(((((floordiv(blockIdx.x, 7)*6272) + (threadIdx.x*98)) + (i1.inner*49)) + (floormod(blockIdx.x, 7)*7)) + i3.inner)] = max((conv2d_nchw_1[((i1.inner*7) + i3.inner)] + bias_3: Buffer(bias_2, float32, [512], [])[(((floordiv(blockIdx.x, 7)*128) + (threadIdx.x*2)) + i1.inner)]), 0f32)
+ for (i2.inner: int32, 0, 7) {
+ compute_3: Buffer(compute_2, float32, [25088], [])[(((((blockIdx.x*1568) + (floordiv(threadIdx.x, 7)*98)) + (i1.inner*49)) + (i2.inner*7)) + floormod(threadIdx.x, 7))] = max((conv2d_nchw_1[((i1.inner*7) + i2.inner)] + bias_3: Buffer(bias_2, float32, [512], [])[(((blockIdx.x*32) + (floordiv(threadIdx.x, 7)*2)) + i1.inner)]), 0f32)
}
}
}
@@ -1016,7 +934,7 @@ cooperative fetching, unrolling and operator fusion.</p>
<span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.359 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.219 ms
</pre></div>
</div>
</div>
@@ -1045,36 +963,36 @@ conv2d_nchw_nn_o_i, conv2d_nchw_nn_i = s[conv2d_nchw].split(conv2d_nchw_nn, fact
conv2d_nchw_nn_o_o_i, conv2d_nchw_nn_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_i, factor=1)
conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1)
conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
-conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
-conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2)
-conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=64)
+conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=2)
+conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
+conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=16)
conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
-conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
+conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=7)
conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
-conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
-conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
+conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
+conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
-conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=4)
+conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=8)
conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
-conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
-conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
-conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=3)
+conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=3)
+conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=3)
+conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=1)
s[conv2d_nchw].reorder(conv2d_nchw_nn_o_o_o_o, conv2d_nchw_ff_o_o_o_o, conv2d_nchw_yy_o_o_o_o, conv2d_nchw_xx_o_o_o_o, conv2d_nchw_nn_o_o_o_i, conv2d_nchw_ff_o_o_o_i, conv2d_nchw_yy_o_o_o_i, conv2d_nchw_xx_o_o_o_i, conv2d_nchw_nn_o_o_i, conv2d_nchw_ff_o_o_i, conv2d_nchw_yy_o_o_i, conv2d_nchw_xx_o_o_i, conv2d_nchw_rc_o_o, conv2d_nchw_ry_o_o, conv2d_nchw_rx_o_o, conv2d_nchw_rc_o_i, conv2d_nchw_ry_o_i, conv2d_nchw_rx_o_i, conv2d_nchw_nn_o_i, conv2d_nchw_ff_o_i, conv2d_nchw_yy_o_i, conv2d_nc [...]
compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
-compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=64)
+compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=16)
compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
-compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
+compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=7)
compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
-compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
-compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
+compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
+compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -1094,12 +1012,12 @@ s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=112)
s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=112)
s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 512)
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
@@ -1119,10 +1037,10 @@ CUDA source code:
#define int64_t long long
#define uint64_t unsigned long long
#endif
-extern "C" __global__ void __launch_bounds__(64) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+extern "C" __global__ void __launch_bounds__(112) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
float conv2d_nchw[14];
- __shared__ float pad_temp_shared[72];
- __shared__ float kernel_shared[3072];
+ __shared__ float pad_temp_shared[1296];
+ __shared__ float kernel_shared[4608];
conv2d_nchw[0] = 0.000000e+00f;
conv2d_nchw[1] = 0.000000e+00f;
conv2d_nchw[2] = 0.000000e+00f;
@@ -1137,411 +1055,325 @@ extern "C" __global__ void __launch_bounds__(64) default_function_kern
conv2d_nchw[11] = 0.000000e+00f;
conv2d_nchw[12] = 0.000000e+00f;
conv2d_nchw[13] = 0.000000e+00f;
- for (int rc_outer_outer = 0; rc_outer_outer < 64; ++rc_outer_outer) {
- for (int ry_outer_outer = 0; ry_outer_outer < 3; ++ry_outer_outer) {
- __syncthreads();
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[(((int)threadIdx.x) * 4)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) * 4) % 9))) && (((((int)threadIdx.x) * 4) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) * 4) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) * 4) % 9)) - 8)] : 0.000000e+00f);
- }
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 1)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 1) % 9))) && ((((((int)threadIdx.x) * 4) + 1) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 1) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 1) % 9)) - 8)] : 0.000000e+00f);
- }
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 2)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 2) % 9))) && ((((((int)threadIdx.x) * 4) + 2) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 2) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 2) % 9)) - 8)] : 0.000000e+00f);
- }
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 3)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 3) % 9))) && ((((((int)threadIdx.x) * 4) + 3) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 3) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 3) % 9)) - 8)] : 0.000000e+00f);
- }
- kernel_shared[((int)threadIdx.x)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
- kernel_shared[(((int)threadIdx.x) + 64)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 64) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 128)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 128) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 192)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 36864)];
- kernel_shared[(((int)threadIdx.x) + 256)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 256) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 320)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 320) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 384)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 73728)];
- kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 448) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 512)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 512) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 576)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 110592)];
- kernel_shared[(((int)threadIdx.x) + 640)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 640) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 704)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 704) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 768)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 147456)];
- kernel_shared[(((int)threadIdx.x) + 832)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 832) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 896) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 960)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 184320)];
- kernel_shared[(((int)threadIdx.x) + 1024)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1024) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1088)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1088) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1152)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 221184)];
- kernel_shared[(((int)threadIdx.x) + 1216)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1216) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1280)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1280) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 258048)];
- kernel_shared[(((int)threadIdx.x) + 1408)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1408) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1472)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1472) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1536)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 294912)];
- kernel_shared[(((int)threadIdx.x) + 1600)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1600) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1664)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1664) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1728)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 331776)];
- kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1792) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1856)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1856) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1920)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 368640)];
- kernel_shared[(((int)threadIdx.x) + 1984)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1984) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2048)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2048) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2112)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 405504)];
- kernel_shared[(((int)threadIdx.x) + 2176)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2176) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2240) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2304)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 442368)];
- kernel_shared[(((int)threadIdx.x) + 2368)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2368) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2432)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2432) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2496)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 479232)];
- kernel_shared[(((int)threadIdx.x) + 2560)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2560) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2624)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2624) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 516096)];
- kernel_shared[(((int)threadIdx.x) + 2752)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2752) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2816)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2816) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2880)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 552960)];
- kernel_shared[(((int)threadIdx.x) + 2944)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2944) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 3008)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3008) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- __syncthreads();
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[0] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[1] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[2] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[3] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[4] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[5] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[6] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[0] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ for (int rc_outer_outer = 0; rc_outer_outer < 32; ++rc_outer_outer) {
+ __syncthreads();
+ pad_temp_shared[((int)threadIdx.x)] = (((((9 <= (((int)threadIdx.x) % 81)) && ((((int)threadIdx.x) % 81) < 72)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((int)threadIdx.x) / 81) * 49)) + (((((int)threadIdx.x) % 81) / 9) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 112)] = (((((9 <= ((((int)threadIdx.x) + 31) % 81)) && (((((int)threadIdx.x) + 31) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 112) / 81) * 49)) + ((((((int)threadIdx.x) + 31) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 224)] = (((((9 <= ((((int)threadIdx.x) + 62) % 81)) && (((((int)threadIdx.x) + 62) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 224) / 81) * 49)) + ((((((int)threadIdx.x) + 62) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 336)] = (((((9 <= ((((int)threadIdx.x) + 12) % 81)) && (((((int)threadIdx.x) + 12) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 3) % 9))) && (((((int)threadIdx.x) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 336) / 81) * 49)) + ((((((int)threadIdx.x) + 12) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 3) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 448)] = (((((9 <= ((((int)threadIdx.x) + 43) % 81)) && (((((int)threadIdx.x) + 43) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 448) / 81) * 49)) + ((((((int)threadIdx.x) + 43) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 560)] = (((((9 <= ((((int)threadIdx.x) + 74) % 81)) && (((((int)threadIdx.x) + 74) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 2) % 9))) && (((((int)threadIdx.x) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 560) / 81) * 49)) + ((((((int)threadIdx.x) + 74) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 672)] = (((((9 <= ((((int)threadIdx.x) + 24) % 81)) && (((((int)threadIdx.x) + 24) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 672) / 81) * 49)) + ((((((int)threadIdx.x) + 24) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 784)] = (((((9 <= ((((int)threadIdx.x) + 55) % 81)) && (((((int)threadIdx.x) + 55) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 1) % 9))) && (((((int)threadIdx.x) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 784) / 81) * 49)) + ((((((int)threadIdx.x) + 55) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 896)] = (((((9 <= ((((int)threadIdx.x) + 5) % 81)) && (((((int)threadIdx.x) + 5) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 5) % 9))) && (((((int)threadIdx.x) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 896) / 81) * 49)) + ((((((int)threadIdx.x) + 5) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1008)] = (((((1 <= (((((int)threadIdx.x) / 9) + 4) % 9)) && (((((int)threadIdx.x) + 36) % 81) < 72)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1008) / 81) * 49)) + ((((((int)threadIdx.x) / 9) + 4) % 9) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1120)] = (((((9 <= ((((int)threadIdx.x) + 67) % 81)) && (((((int)threadIdx.x) + 67) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1120) / 81) * 49)) + ((((((int)threadIdx.x) + 67) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
+ if (((int)threadIdx.x) < 64) {
+ pad_temp_shared[(((int)threadIdx.x) + 1232)] = ((((((int)threadIdx.x) < 55) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1232) / 81) * 49)) + (((((int)threadIdx.x) + 17) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
+ }
+ kernel_shared[((int)threadIdx.x)] = kernel[(((((int)blockIdx.x) * 147456) + (rc_outer_outer * 144)) + ((int)threadIdx.x))];
+ kernel_shared[(((int)threadIdx.x) + 112)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 112) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 112) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 224)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 224) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 80) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 336)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 336) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 16) % 48) * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 448) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 560)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 560) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 128) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 672)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 672) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 32) % 48) * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 784)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 784) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 64) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 896) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1008)] = kernel[((((((int)blockIdx.x) * 147456) + (rc_outer_outer * 144)) + ((int)threadIdx.x)) + 32256)];
+ kernel_shared[(((int)threadIdx.x) + 1120)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1120) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 112) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1232)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1232) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 80) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1344) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 16) % 48) * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1456)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1456) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1568)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1568) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 128) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1680)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1680) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 32) % 48) * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1792) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 64) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1904)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1904) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2016)] = kernel[((((((int)blockIdx.x) * 147456) + (rc_outer_outer * 144)) + ((int)threadIdx.x)) + 64512)];
+ kernel_shared[(((int)threadIdx.x) + 2128)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2128) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 112) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2240) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 80) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2352)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2352) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 16) % 48) * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2464)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2464) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2576)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2576) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 128) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2688) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 32) % 48) * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2800)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2800) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 64) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2912)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2912) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 3024)] = kernel[((((((int)blockIdx.x) * 147456) + (rc_outer_outer * 144)) + ((int)threadIdx.x)) + 96768)];
+ kernel_shared[(((int)threadIdx.x) + 3136)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3136) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 112) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 3248)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3248) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 80) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 3360)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3360) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 16) % 48) * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 3472)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3472) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 3584)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3584) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 128) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 3696)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3696) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 32) % 48) * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 3808)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3808) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 64) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 3920)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3920) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 4032)] = kernel[((((((int)blockIdx.x) * 147456) + (rc_outer_outer * 144)) + ((int)threadIdx.x)) + 129024)];
+ kernel_shared[(((int)threadIdx.x) + 4144)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 4144) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 112) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 4256)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 4256) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 80) % 144) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 4368)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 4368) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 16) % 48) * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 4480)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 4480) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ if (((int)threadIdx.x) < 16) {
+ kernel_shared[(((int)threadIdx.x) + 4592)] = kernel[(((((((int)blockIdx.x) * 147456) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 128) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3)) + 142848)];
+ }
+ __syncthreads();
+ for (int rc_outer_inner = 0; rc_outer_inner < 8; ++rc_outer_inner) {
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 162) + (((int)threadIdx.x) % 7))] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 9)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 18)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 27)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 36)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 45)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 54)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((rc_outer_inner * 162) + (((int)threadIdx.x) % 7))] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 9)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 27)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 36)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 45)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 54)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 37)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 46)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 55)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 37)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 46)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 55)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 29)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 38)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 47)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 56)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 29)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 38)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 47)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 56)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 81)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 90)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 99)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 108)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 117)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 135)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 81)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 90)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 99)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 108)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 117)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 135)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 82)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 109)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 118)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 127)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 136)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 82)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 109)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 118)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 127)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 136)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 110)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 119)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 128)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 137)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 110)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 119)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 128)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 137)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 9)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 27)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 36)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 45)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 54)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 9)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 27)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 36)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 45)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 54)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 37)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 46)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 55)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 37)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 46)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 55)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 29)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 38)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 47)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 56)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 29)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 38)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 47)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 56)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 90)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 99)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 108)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 117)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 135)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 144)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 90)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 99)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 108)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 117)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 135)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 144)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 109)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 118)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 127)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 136)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 145)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 109)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 118)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 127)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 136)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 145)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 110)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 119)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 128)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 137)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 146)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 110)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 119)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 128)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 137)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 146)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 27)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 36)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 45)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 54)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 72)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 27)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 36)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 45)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 54)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 72)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 37)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 46)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 55)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 73)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 37)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 46)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 55)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 73)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 29)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 38)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 47)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 56)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 74)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 29)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 38)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 47)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 56)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 74)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 99)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 108)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 117)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 135)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 144)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 153)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 99)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 108)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 117)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 135)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 144)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 153)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 109)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 118)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 127)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 136)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 145)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 154)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 109)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 118)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 127)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 136)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 145)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 154)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 110)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 119)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 128)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 137)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 146)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 155)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 110)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 119)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 128)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 137)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 146)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + (((int)threadIdx.x) % 7)) + 155)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
}
}
for (int i1_inner = 0; i1_inner < 2; ++i1_inner) {
- for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
- compute[((((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 98)) + (i1_inner * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[((i1_inner * 7) + i3_inner)] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) * 2)) + i1_inner)]), 0.000000e+00f);
+ for (int i2_inner = 0; i2_inner < 7; ++i2_inner) {
+ compute[(((((((int)blockIdx.x) * 1568) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + (i2_inner * 7)) + (((int)threadIdx.x) % 7))] = max((conv2d_nchw[((i1_inner * 7) + i2_inner)] + bias[(((((int)blockIdx.x) * 32) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
}
}
}
@@ -1579,7 +1411,7 @@ In the example below we resume the status and do more 5 trials.</p>
Get devices for measurement successfully!
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 5 minutes 29.345 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 5 minutes 32.158 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/e3e540f3b477c0c52d8eb73e674e8ffd/tune_conv2d_layer_cuda.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_conv2d_layer_cuda.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
index 277d062b2a..058a067d5f 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
@@ -915,7 +915,7 @@ so we can read the log file and load the best schedules.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 7.8755 7.8683 7.8899 7.8681 0.0102
+ 7.8762 7.8741 7.8957 7.8589 0.0151
</pre></div>
</div>
</div>
@@ -937,7 +937,7 @@ to learn how to use the RPC Tracker and RPC Server.
To use the RPC Tracker in auto-scheduler, replace the runner in <code class="code docutils literal notranslate"><span class="pre">TuningOptions</span></code>
with <a class="reference internal" href="../../reference/api/python/auto_scheduler.html#tvm.auto_scheduler.RPCRunner" title="tvm.auto_scheduler.RPCRunner"><code class="xref any py py-class docutils literal notranslate"><span class="pre">auto_scheduler.RPCRunner</span></code></a>.</p></li>
</ol>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 0.346 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 1.196 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-network-cuda-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/eafe360d52540634c9eea0fa89e804bd/tune_network_cuda.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_network_cuda.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
index ece360e57d..3be23d2d3b 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
@@ -934,7 +934,7 @@ so we can read the log file and load the best schedules.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 762.2645 761.4779 763.9916 761.3242 1.2228
+ 755.9799 756.7106 757.0075 754.2215 1.2493
</pre></div>
</div>
</div>
@@ -956,7 +956,7 @@ to learn how to use the RPC Tracker and RPC Server.
To use the RPC Tracker in auto-scheduler, replace the runner in <code class="code docutils literal notranslate"><span class="pre">TuningOptions</span></code>
with <a class="reference internal" href="../../reference/api/python/auto_scheduler.html#tvm.auto_scheduler.RPCRunner" title="tvm.auto_scheduler.RPCRunner"><code class="xref any py py-class docutils literal notranslate"><span class="pre">auto_scheduler.RPCRunner</span></code></a>.</p></li>
</ol>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 31.730 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 32.047 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-network-x86-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/e416b94ca1090b0897c0f6e0df95b911/tune_network_x86.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_network_x86.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
index 6bc8d85d51..0a8c2b12c0 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
@@ -632,27 +632,27 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [128, 512], []),
compute: Buffer(compute_2: Pointer(float32), float32, [128, 512], [])}
buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute} {
- for (i0.outer: int32, 0, 32) "parallel" {
- allocate(compute_3: Pointer(global float32), float32, [64]), storage_scope = global;
- for (i1.outer: int32, 0, 32) {
- for (i.inner.init: int32, 0, 4) {
- for (j.init: int32, 0, 16) {
- compute_4: Buffer(compute_3, float32, [64], [])[((i.inner.init*16) + j.init)] = 0f32
+ for (i0.outer.i1.outer.fused: int32, 0, 128) "parallel" {
+ allocate(compute_3: Pointer(global float32), float32, [512]), storage_scope = global {
+ for (nb_j.inner: int32, 0, 2) {
+ for (i.inner.init: int32, 0, 16) {
+ for (j.init: int32, 0, 16) {
+ compute_4: Buffer(compute_3, float32, [512], [])[(((i.inner.init*32) + (nb_j.inner*16)) + j.init)] = 0f32
+ }
}
- }
- for (elem_idx: int32, 0, (placeholder_15: Buffer(placeholder_13, int32, [33], [])[(i1.outer + 1)] - placeholder_15[i1.outer])) {
- for (i.inner: int32, 0, 4) {
- for (j: int32, 0, 16) {
- if @tir.likely((elem_idx < (placeholder_15[(i1.outer + 1)] - placeholder_15[i1.outer])), dtype=bool) {
- let cse_var_1: int32 = ((i.inner*16) + j)
- compute_4[cse_var_1] = (compute_4[cse_var_1] + (placeholder_16: Buffer(placeholder_11, float32, [78656], [])[(((placeholder_15[i1.outer]*16) + (elem_idx*16)) + j)]*max(placeholder_17: Buffer(placeholder_10, float32, [32768], [])[(((i0.outer*1024) + (i.inner*256)) + placeholder_18: Buffer(placeholder_12, int32, [4916], [])[(placeholder_15[i1.outer] + elem_idx)])], 0f32)))
+ for (elem_idx: int32, 0, let cse_var_1: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner) in (placeholder_15: Buffer(placeholder_13, int32, [33], [])[(cse_var_1 + 1)] - placeholder_15[cse_var_1])) {
+ for (i.inner: int32, 0, 16) {
+ for (j: int32, 0, 16) {
+ let cse_var_3: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)
+ let cse_var_2: int32 = (((i.inner*32) + (nb_j.inner*16)) + j)
+ compute_4[cse_var_2] = (compute_4[cse_var_2] + (placeholder_16: Buffer(placeholder_11, float32, [78656], [])[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + j)]*max(placeholder_17: Buffer(placeholder_10, float32, [32768], [])[(((floordiv(i0.outer.i1.outer.fused, 16)*4096) + (i.inner*256)) + placeholder_18: Buffer(placeholder_12, int32, [4916], [])[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
}
}
}
}
- for (i0.inner: int32, 0, 4) {
- let cse_var_2: int32 = (((i0.outer*2048) + (i0.inner*512)) + (i1.outer*16))
- compute_5: Buffer(compute_2, float32, [65536], [])[ramp(cse_var_2, 1, 16)] = max((compute_4[ramp((i0.inner*16), 1, 16)] + placeholder_19: Buffer(placeholder_14, float32, [65536], [])[ramp(cse_var_2, 1, 16)]), broadcast(0f32, 16))
+ for (i0.inner: int32, 0, 16) {
+ let cse_var_4: int32 = (((floordiv(i0.outer.i1.outer.fused, 16)*8192) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32))
+ compute_5: Buffer(compute_2, float32, [65536], [])[ramp(cse_var_4, 1, 32)] = max((compute_4[ramp((i0.inner*32), 1, 32)] + placeholder_19: Buffer(placeholder_14, float32, [65536], [])[ramp(cse_var_4, 1, 32)]), broadcast(0f32, 32))
}
}
}
@@ -690,7 +690,7 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
<span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.285 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.500 ms
</pre></div>
</div>
<div class="admonition note">
diff --git a/docs/how_to/tune_with_autotvm/sg_execution_times.html b/docs/how_to/tune_with_autotvm/sg_execution_times.html
index 9fc31efdbb..9e5171305e 100644
--- a/docs/how_to/tune_with_autotvm/sg_execution_times.html
+++ b/docs/how_to/tune_with_autotvm/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-tune-with-autotvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:35.661</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
+<p><strong>00:36.157</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 84%" />
@@ -349,7 +349,7 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-conv2d-cuda-py"><span class="std std-ref">Tuning High Performance Convolution on NVIDIA GPUs</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_cuda.py</span></code>)</p></td>
-<td><p>00:35.626</p></td>
+<td><p>00:36.122</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_relay_x86.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-x86-py"><span class="std std-ref">Auto-tuning a Convolutional Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_x86.py</span></code>)</p></td>
@@ -360,11 +360,11 @@
<td><p>00:00.005</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-even"><td><p><a class="reference internal" href="tune_relay_arm.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-arm-py"><span class="std std-ref">Auto-tuning a Convolutional Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_arm.py</span></code>)</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="tune_relay_mobile_gpu.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-mobile-gpu-py"><span class="std std-ref">Auto-tuning a Convolutional Network for Mobile GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_mobile_gpu.py</span></code>)</p></td>
<td><p>00:00.005</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="tune_relay_mobile_gpu.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-mobile-gpu-py"><span class="std std-ref">Auto-tuning a Convolutional Network for Mobile GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_mobile_gpu.py</span></code>)</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="tune_relay_arm.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-arm-py"><span class="std std-ref">Auto-tuning a Convolutional Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_arm.py</span></code>)</p></td>
<td><p>00:00.005</p></td>
<td><p>0.0 MB</p></td>
</tr>
diff --git a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
index 056dfca75e..806d099ec9 100644
--- a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
+++ b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
@@ -568,9 +568,9 @@ waiting for device...
device available
Get devices for measurement successfully!
No: 1 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -603,11 +603,11 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -631,7 +631,7 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -659,11 +659,11 @@ Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -687,15 +687,13 @@ Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 2, 8]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 256]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,381188
-No: 2 GFLOPS: 21.21/21.21 result: MeasureResult(costs=(0.0109166595,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.188716411590576, timestamp=1669924312.333331) [('tile_f', [-1, 4, 2, 8]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1009945
-No: 3 GFLOPS: 1.81/21.21 result: MeasureResult(costs=(0.12811106849999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.0552613735198975, timestamp=1669924315.252629) [('tile_f', [-1, 1, 1, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 64, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,8771180
-No: 4 GFLOPS: 0.00/21.21 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 2, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5725173
+No: 2 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -728,11 +726,11 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -756,7 +754,7 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -784,11 +782,11 @@ Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -812,13 +810,13 @@ Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 32, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 256]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3862850
-No: 5 GFLOPS: 0.00/21.21 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 128, 1, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 32]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1502662
+No: 3 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -851,11 +849,11 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -879,7 +877,7 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -907,11 +905,11 @@ Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -935,14 +933,13 @@ Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 256, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 32, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7764339
-No: 6 GFLOPS: 54.84/54.84 result: MeasureResult(costs=(0.0042212298,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4288978576660156, timestamp=1669924317.857191) [('tile_f', [-1, 8, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5739593
-No: 7 GFLOPS: 0.00/54.84 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 1, 256]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3398117
+No: 4 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -975,11 +972,11 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1003,7 +1000,7 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -1031,11 +1028,11 @@ Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1059,13 +1056,13 @@ Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 4, 32]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9255595
-No: 8 GFLOPS: 0.00/54.84 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 64, 1, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 64]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5775362
+No: 5 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -1098,11 +1095,11 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1126,7 +1123,7 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -1154,11 +1151,11 @@ Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1182,17 +1179,13 @@ Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 2, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 32, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5283447
-No: 9 GFLOPS: 7.45/54.84 result: MeasureResult(costs=(0.03108157525,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.272521734237671, timestamp=1669924323.2844048) [('tile_f', [-1, 4, 4, 2]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,272214
-No: 10 GFLOPS: 248.13/248.13 result: MeasureResult(costs=(0.0009329768670520231,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.2568655014038086, timestamp=1669924324.2515388) [('tile_f', [-1, 1, 4, 2]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2338892
-No: 11 GFLOPS: 24.74/248.13 result: MeasureResult(costs=(0.009356567363636364,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.166534662246704, timestamp=1669924324.9288423) [('tile_f', [-1, 4, 2, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,14190
-No: 12 GFLOPS: 25.82/248.13 result: MeasureResult(costs=(0.008965818333333334,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.8921427726745605, timestamp=1669924325.6391897) [('tile_f', [-1, 2, 1, 2]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2009316
-No: 13 GFLOPS: 0.00/248.13 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 1, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 2, 128]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,950458
+No: 6 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -1225,11 +1218,11 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1253,7 +1246,7 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -1281,11 +1274,11 @@ Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1309,13 +1302,13 @@ Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 8, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 16, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4175501
-No: 14 GFLOPS: 0.00/248.13 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 16, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 512, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,1774279
+No: 7 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -1348,11 +1341,11 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1376,7 +1369,7 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -1404,11 +1397,11 @@ Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1432,13 +1425,13 @@ Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 16, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 16]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8452306
-No: 15 GFLOPS: 0.00/248.13 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 4, 64]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 64]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4421107
+No: 8 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -1471,11 +1464,11 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1499,7 +1492,7 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -1527,11 +1520,11 @@ Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1555,13 +1548,15 @@ Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 128, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 64]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3073315
-No: 16 GFLOPS: 0.00/248.13 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 64, 2, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6054196
+No: 9 GFLOPS: 62.21/62.21 result: MeasureResult(costs=(0.003721080674418605,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.198905944824219, timestamp=1669925187.6235414) [('tile_f', [-1, 1, 2, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9397530
+No: 10 GFLOPS: 24.14/62.21 result: MeasureResult(costs=(0.009589893636363636,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.2840280532836914, timestamp=1669925188.342286) [('tile_f', [-1, 2, 8, 16]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,5071620
+No: 11 GFLOPS: 0.00/62.21 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -1594,11 +1589,11 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1622,7 +1617,7 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -1650,11 +1645,11 @@ Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1678,13 +1673,13 @@ Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 2, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 64, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4476532
-No: 17 GFLOPS: 0.00/248.13 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 2, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 16]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3813106
+No: 12 GFLOPS: 0.00/62.21 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -1717,11 +1712,11 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1745,7 +1740,7 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -1773,11 +1768,11 @@ Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1801,13 +1796,13 @@ Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 1, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9056140
-No: 18 GFLOPS: 0.00/248.13 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 1, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 512, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10292979
+No: 13 GFLOPS: 0.00/62.21 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -1840,11 +1835,11 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1868,7 +1863,7 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -1896,11 +1891,11 @@ Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1924,13 +1919,13 @@ Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 2, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 256, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9937729
-No: 19 GFLOPS: 0.00/248.13 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 256, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,8934870
+No: 14 GFLOPS: 0.00/62.21 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
func = build(s, args, target_host=task.target_host, runtime=runtime)
File "/workspace/python/tvm/driver/build_module.py", line 227, in build
input_mod = lower(inputs, args, name=name, binds=binds)
@@ -1963,11 +1958,11 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -1991,7 +1986,7 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
@@ -2019,11 +2014,11 @@ Traceback (most recent call last):
14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
at ../include/tvm/runtime/packed_func.h:1646
13: operator()
- at ../src/driver/driver_api.cc:389
+ at ../src/driver/driver_api.cc:388
12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:375
+ at ../src/driver/driver_api.cc:374
11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:270
+ at ../src/driver/driver_api.cc:269
10: tvm::transform::Pass::operator()(tvm::IRModule) const
at ../src/ir/transform.cc:258
9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
@@ -2047,10 +2042,381 @@ Traceback (most recent call last):
0: operator()
at ../src/runtime/c_runtime_api.cc:534
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 256, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 128, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9512853
-No: 20 GFLOPS: 1.64/248.13 result: MeasureResult(costs=(0.141428955,), error_no=MeasureErrorNo.NO_ERROR, all_cost=5.23434591293335, timestamp=1669924331.2717454) [('tile_f', [-1, 8, 4, 16]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2363418
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 2, 8]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 128]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,375028
+No: 15 GFLOPS: 291.58/291.58 result: MeasureResult(costs=(0.0007939590640394088,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.30177903175354, timestamp=1669925189.9380887) [('tile_f', [-1, 1, 2, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6250210
+No: 16 GFLOPS: 0.00/291.58 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+ func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+ func = build(s, args, target_host=task.target_host, runtime=runtime)
+ File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+ input_mod = lower(inputs, args, name=name, binds=binds)
+ File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+ return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+ File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+tvm._ffi.base.TVMError: Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1731
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1671
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1646
+ 13: operator()
+ at ../src/driver/driver_api.cc:388
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:374
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:269
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:453
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1750
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1694
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1618
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1731
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1671
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1646
+ 13: operator()
+ at ../src/driver/driver_api.cc:388
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:374
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:269
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:453
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1750
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1694
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1618
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 1, 8]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 256, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6648761
+No: 17 GFLOPS: 76.88/291.58 result: MeasureResult(costs=(0.0030112359999999996,), error_no=MeasureErrorNo.NO_ERROR, all_cost=8.845435857772827, timestamp=1669925198.9765935) [('tile_f', [-1, 1, 1, 2]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9312435
+No: 18 GFLOPS: 0.00/291.58 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+ func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+ func = build(s, args, target_host=task.target_host, runtime=runtime)
+ File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+ input_mod = lower(inputs, args, name=name, binds=binds)
+ File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+ return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+ File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+tvm._ffi.base.TVMError: Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1731
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1671
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1646
+ 13: operator()
+ at ../src/driver/driver_api.cc:388
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:374
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:269
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:453
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1750
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1694
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1618
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1731
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1671
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1646
+ 13: operator()
+ at ../src/driver/driver_api.cc:388
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:374
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:269
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:453
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1750
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1694
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1618
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 64, 2, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 16, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1210896
+No: 19 GFLOPS: 1.35/291.58 result: MeasureResult(costs=(0.17110185225000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.638166666030884, timestamp=1669925201.5347054) [('tile_f', [-1, 1, 2, 256]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3718218
+No: 20 GFLOPS: 0.00/291.58 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+ func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+ func = build(s, args, target_host=task.target_host, runtime=runtime)
+ File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+ input_mod = lower(inputs, args, name=name, binds=binds)
+ File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+ return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+ File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+tvm._ffi.base.TVMError: Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1731
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1671
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1646
+ 13: operator()
+ at ../src/driver/driver_api.cc:388
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:374
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:269
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:453
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1750
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1694
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1618
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1731
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1671
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1646
+ 13: operator()
+ at ../src/driver/driver_api.cc:388
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:374
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:269
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:453
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1750
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1694
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1618
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 128, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5860631
</pre></div>
</div>
<p>Finally we can inspect the best config from log file, check correctness,
@@ -2089,9 +2455,9 @@ and measure running time.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Finish loading 20 records
Best config:
-[('tile_f', [-1, 1, 4, 2]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2338892
+[('tile_f', [-1, 1, 2, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6250210
Finish loading 20 records
-Time cost of this operator: 0.001360
+Time cost of this operator: 0.001184
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autotvm-tune-conv2d-cuda-py">
diff --git a/docs/how_to/tune_with_autotvm/tune_relay_cuda.html b/docs/how_to/tune_with_autotvm/tune_relay_cuda.html
index d02178a85e..cf7f1857e5 100644
--- a/docs/how_to/tune_with_autotvm/tune_relay_cuda.html
+++ b/docs/how_to/tune_with_autotvm/tune_relay_cuda.html
@@ -492,7 +492,7 @@ We can also load models from MXNet, ONNX and TensorFlow.</p>
<span class="p">}</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/workspace/python/tvm/target/target.py:393: UserWarning: Try specifying cuda arch by adding 'arch=sm_xx' to your target.
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/workspace/python/tvm/target/target.py:397: UserWarning: Try specifying cuda arch by adding 'arch=sm_xx' to your target.
warnings.warn("Try specifying cuda arch by adding 'arch=sm_xx' to your target.")
</pre></div>
</div>
diff --git a/docs/how_to/work_with_microtvm/micro_autotune.html b/docs/how_to/work_with_microtvm/micro_autotune.html
index 009d7b7c29..5b8f078c8e 100644
--- a/docs/how_to/work_with_microtvm/micro_autotune.html
+++ b/docs/how_to/work_with_microtvm/micro_autotune.html
@@ -598,10 +598,10 @@ the tuned operator.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>########## Build without Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 311.3 98.719 (1, 2, 10, 10, 3) 2 1 [311.3]
-tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.078 0.976 (1, 6, 10, 10) 1 1 [3.078]
-tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.963 0.305 (1, 1, 10, 10, 3) 1 1 [0.963]
-Total_time - 315.341 - - - - -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 310.6 98.708 (1, 2, 10, 10, 3) 2 1 [310.6]
+tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.077 0.978 (1, 6, 10, 10) 1 1 [3.077]
+tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.988 0.314 (1, 1, 10, 10, 3) 1 1 [0.988]
+Total_time - 314.665 - - - - -
</pre></div>
</div>
</div>
@@ -653,10 +653,10 @@ Total_time -
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>########## Build with Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 100.2 97.338 (1, 6, 10, 10, 1) 2 1 [100.2]
-tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.768 1.718 (1, 6, 10, 10) 1 1 [1.768]
-tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.972 0.944 (1, 1, 10, 10, 3) 1 1 [0.972]
-Total_time - 102.94 - - - - -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 102.9 97.476 (1, 6, 10, 10, 1) 2 1 [102.9]
+tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.806 1.711 (1, 6, 10, 10) 1 1 [1.806]
+tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.859 0.813 (1, 3, 10, 10, 1) 1 1 [0.859]
+Total_time - 105.564 - - - - -
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-autotune-py">
diff --git a/docs/how_to/work_with_microtvm/micro_pytorch.html b/docs/how_to/work_with_microtvm/micro_pytorch.html
index 02b2323c22..94fce2fc7c 100644
--- a/docs/how_to/work_with_microtvm/micro_pytorch.html
+++ b/docs/how_to/work_with_microtvm/micro_pytorch.html
@@ -440,8 +440,7 @@ download a cat image and preprocess it to use as the model input.</p>
Downloading: "https://download.pytorch.org/models/quantized/mobilenet_v2_qnnpack_37f702c5.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2_qnnpack_37f702c5.pth
0%| | 0.00/3.42M [00:00<?, ?B/s]
- 84%|########4 | 2.88M/3.42M [00:00<00:00, 30.2MB/s]
-100%|##########| 3.42M/3.42M [00:00<00:00, 34.4MB/s]
+100%|##########| 3.42M/3.42M [00:00<00:00, 107MB/s]
/workspace/python/tvm/relay/frontend/pytorch_utils.py:47: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
return LooseVersion(torch_ver) > ver
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/setuptools/_distutils/version.py:346: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
@@ -565,7 +564,7 @@ via the host <cite>main.cc`</cite> or if a Zephyr emulated board is selected as
Torch top-1 id: 282, class name: tiger cat
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 3.182 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 3.611 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-pytorch-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/12b9ecc04c41abaa12022061771821d1/micro_pytorch.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">micro_pytorch.py</span></code></a></p>
diff --git a/docs/how_to/work_with_microtvm/micro_train.html b/docs/how_to/work_with_microtvm/micro_train.html
index 66ade91df7..852ec3468c 100644
--- a/docs/how_to/work_with_microtvm/micro_train.html
+++ b/docs/how_to/work_with_microtvm/micro_train.html
@@ -530,7 +530,7 @@ take about <strong>2 minutes</strong> to download the Stanford Cars, while COCO
<a href="https://docs.python.org/3/library/shutil.html#shutil.move" title="shutil.move" class="sphx-glr-backref-module-shutil sphx-glr-backref-type-py-function"><span class="n">shutil</span><span class="o">.</span><span class="n">move</span></a><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><a href="https://docs.python.org/3/library/stdtypes.html#str" title="builtins.str" class="sphx-glr-backref-module-builtins sphx-glr-backref-typ [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>'/tmp/tmpryi74w6s/images/random'
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>'/tmp/tmpa_1psqlr/images/random'
</pre></div>
</div>
</div>
@@ -590,8 +590,8 @@ objects to other stuff? We can display some examples from our datasets using <co
<span class="n">plt</span><span class="o">.</span><span class="n">axis</span><span class="p">(</span><span class="s2">"off"</span><span class="p">)</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmpryi74w6s/images/target contains 8144 images
-/tmp/tmpryi74w6s/images/random contains 5000 images
+<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmpa_1psqlr/images/target contains 8144 images
+/tmp/tmpa_1psqlr/images/random contains 5000 images
</pre></div>
</div>
</div>
@@ -703,13 +703,13 @@ the time on our validation set).</p>
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Epoch 1/3
-328/328 - 47s - loss: 0.2985 - accuracy: 0.9078 - val_loss: 0.2031 - val_accuracy: 0.9358 - 47s/epoch - 143ms/step
+328/328 - 47s - loss: 0.2388 - accuracy: 0.9216 - val_loss: 0.1168 - val_accuracy: 0.9607 - 47s/epoch - 143ms/step
Epoch 2/3
-328/328 - 43s - loss: 0.1057 - accuracy: 0.9601 - val_loss: 0.1237 - val_accuracy: 0.9603 - 43s/epoch - 132ms/step
+328/328 - 44s - loss: 0.1131 - accuracy: 0.9590 - val_loss: 0.1981 - val_accuracy: 0.9275 - 44s/epoch - 133ms/step
Epoch 3/3
-328/328 - 43s - loss: 0.0696 - accuracy: 0.9750 - val_loss: 0.0888 - val_accuracy: 0.9717 - 43s/epoch - 131ms/step
+328/328 - 43s - loss: 0.0740 - accuracy: 0.9729 - val_loss: 0.0972 - val_accuracy: 0.9668 - 43s/epoch - 132ms/step
-<keras.callbacks.History object at 0x7fbb09d14410>
+<keras.callbacks.History object at 0x7fe1a804d650>
</pre></div>
</div>
</div>
@@ -971,7 +971,7 @@ as intended.</p>
<p>From here, we could modify the model to read live images from the camera - we have another
Arduino tutorial for how to do that <a class="reference external" href="https://github.com/guberti/tvm-arduino-demos/tree/master/examples/person_detection">on GitHub</a>. Alternatively, we could also
<a class="reference external" href="https://tvm.apache.org/docs/how_to/work_with_microtvm/micro_autotune.html">use TVM’s autotuning capabilities</a> to dramatically improve the model’s performance.</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 4 minutes 41.016 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 4 minutes 40.874 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-train-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/b52cec46baf4f78d6bcd94cbe269c8a6/micro_train.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">micro_train.py</span></code></a></p>
diff --git a/docs/how_to/work_with_microtvm/sg_execution_times.html b/docs/how_to/work_with_microtvm/sg_execution_times.html
index 78ab187701..4efeeccd48 100644
--- a/docs/how_to/work_with_microtvm/sg_execution_times.html
+++ b/docs/how_to/work_with_microtvm/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-work-with-microtvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>06:45.843</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
+<p><strong>06:47.466</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -349,23 +349,23 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="micro_train.html#sphx-glr-how-to-work-with-microtvm-micro-train-py"><span class="std std-ref">Training Vision Models for microTVM on Arduino</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_train.py</span></code>)</p></td>
-<td><p>04:41.016</p></td>
+<td><p>04:40.874</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="micro_pytorch.html#sphx-glr-how-to-work-with-microtvm-micro-pytorch-py"><span class="std std-ref">microTVM PyTorch Tutorial</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_pytorch.py</span></code>)</p></td>
-<td><p>01:03.182</p></td>
+<td><p>01:03.611</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="micro_autotune.html#sphx-glr-how-to-work-with-microtvm-micro-autotune-py"><span class="std std-ref">Autotuning with microTVM</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_autotune.py</span></code>)</p></td>
-<td><p>00:49.591</p></td>
+<td><p>00:50.653</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="micro_aot.html#sphx-glr-how-to-work-with-microtvm-micro-aot-py"><span class="std std-ref">microTVM Host-Driven AoT</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_aot.py</span></code>)</p></td>
-<td><p>00:08.293</p></td>
+<td><p>00:08.536</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="micro_tflite.html#sphx-glr-how-to-work-with-microtvm-micro-tflite-py"><span class="std std-ref">microTVM with TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_tflite.py</span></code>)</p></td>
-<td><p>00:03.758</p></td>
+<td><p>00:03.791</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="micro_reference_vm.html#sphx-glr-how-to-work-with-microtvm-micro-reference-vm-py"><span class="std std-ref">microTVM Reference Virtual Machines</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_reference_vm.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_relay/sg_execution_times.html b/docs/how_to/work_with_relay/sg_execution_times.html
index a6ec68a473..037ed8586b 100644
--- a/docs/how_to/work_with_relay/sg_execution_times.html
+++ b/docs/how_to/work_with_relay/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-work-with-relay-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:44.000</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
+<p><strong>00:39.619</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 84%" />
@@ -349,15 +349,15 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="using_pipeline_executor.html#sphx-glr-how-to-work-with-relay-using-pipeline-executor-py"><span class="std std-ref">Using Pipeline Executor in Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_pipeline_executor.py</span></code>)</p></td>
-<td><p>00:32.381</p></td>
+<td><p>00:33.066</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="using_external_lib.html#sphx-glr-how-to-work-with-relay-using-external-lib-py"><span class="std std-ref">Using External Libraries in Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_external_lib.py</span></code>)</p></td>
-<td><p>00:10.108</p></td>
+<td><p>00:04.808</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="build_gcn.html#sphx-glr-how-to-work-with-relay-build-gcn-py"><span class="std std-ref">Building a Graph Convolutional Network</span></a> (<code class="docutils literal notranslate"><span class="pre">build_gcn.py</span></code>)</p></td>
-<td><p>00:01.504</p></td>
+<td><p>00:01.739</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="using_relay_viz.html#sphx-glr-how-to-work-with-relay-using-relay-viz-py"><span class="std std-ref">Use Relay Visualizer to Visualize Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_relay_viz.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_schedules/intrin_math.html b/docs/how_to/work_with_schedules/intrin_math.html
index cbb922d5d0..317a9cb6ec 100644
--- a/docs/how_to/work_with_schedules/intrin_math.html
+++ b/docs/how_to/work_with_schedules/intrin_math.html
@@ -535,7 +535,7 @@ The following example customizes CUDA lowering rule for <code class="code docuti
<a href="../../reference/api/python/ir.html#tvm.ir.register_intrin_lowering" title="tvm.ir.register_intrin_lowering" class="sphx-glr-backref-module-tvm-ir sphx-glr-backref-type-py-function"><span class="n">register_intrin_lowering</span></a><span class="p">(</span><span class="s2">"tir.exp"</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">"cuda"</span><span class="p">,</span> <span class="n">f</span><span class="o">= [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span><function my_cuda_math_rule at 0x7fbaf205d950>
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span><function my_cuda_math_rule at 0x7fe221957710>
</pre></div>
</div>
<p>Register the rule to TVM with override option to override existing rule.
diff --git a/docs/how_to/work_with_schedules/sg_execution_times.html b/docs/how_to/work_with_schedules/sg_execution_times.html
index 511fe17c68..46153b7f00 100644
--- a/docs/how_to/work_with_schedules/sg_execution_times.html
+++ b/docs/how_to/work_with_schedules/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-work-with-schedules-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:07.619</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
+<p><strong>00:04.891</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -349,23 +349,23 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="intrin_math.html#sphx-glr-how-to-work-with-schedules-intrin-math-py"><span class="std std-ref">Intrinsics and Math Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">intrin_math.py</span></code>)</p></td>
-<td><p>00:05.239</p></td>
+<td><p>00:02.341</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tensorize.html#sphx-glr-how-to-work-with-schedules-tensorize-py"><span class="std std-ref">Use Tensorize to Leverage Hardware Intrinsics</span></a> (<code class="docutils literal notranslate"><span class="pre">tensorize.py</span></code>)</p></td>
-<td><p>00:01.064</p></td>
+<td><p>00:01.189</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="reduction.html#sphx-glr-how-to-work-with-schedules-reduction-py"><span class="std std-ref">Reduction</span></a> (<code class="docutils literal notranslate"><span class="pre">reduction.py</span></code>)</p></td>
-<td><p>00:00.562</p></td>
+<td><p>00:00.582</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="scan.html#sphx-glr-how-to-work-with-schedules-scan-py"><span class="std std-ref">Scan and Recurrent Kernel</span></a> (<code class="docutils literal notranslate"><span class="pre">scan.py</span></code>)</p></td>
-<td><p>00:00.542</p></td>
+<td><p>00:00.567</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="extern_op.html#sphx-glr-how-to-work-with-schedules-extern-op-py"><span class="std std-ref">External Tensor Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">extern_op.py</span></code>)</p></td>
-<td><p>00:00.115</p></td>
+<td><p>00:00.114</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="schedule_primitives.html#sphx-glr-how-to-work-with-schedules-schedule-primitives-py"><span class="std std-ref">Schedule Primitives in TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">schedule_primitives.py</span></code>)</p></td>
@@ -373,11 +373,11 @@
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tedd.html#sphx-glr-how-to-work-with-schedules-tedd-py"><span class="std std-ref">Use Tensor Expression Debug Display (TEDD) for Visualization</span></a> (<code class="docutils literal notranslate"><span class="pre">tedd.py</span></code>)</p></td>
-<td><p>00:00.029</p></td>
+<td><p>00:00.030</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tuple_inputs.html#sphx-glr-how-to-work-with-schedules-tuple-inputs-py"><span class="std std-ref">Compute and Reduce with Tuple Inputs</span></a> (<code class="docutils literal notranslate"><span class="pre">tuple_inputs.py</span></code>)</p></td>
-<td><p>00:00.019</p></td>
+<td><p>00:00.020</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/work_with_schedules/tensorize.html b/docs/how_to/work_with_schedules/tensorize.html
index e2e37c25a5..5c8a8fc178 100644
--- a/docs/how_to/work_with_schedules/tensorize.html
+++ b/docs/how_to/work_with_schedules/tensorize.html
@@ -586,7 +586,7 @@ The importing needs to happen before the tensorized GEMV being executed.</p>
B: Buffer(B_2: Pointer(float32), float32, [512, 64], []),
C: Buffer(C_2: Pointer(float32), float32, [1024, 512], [])}
buffer_map = {A_1: A, B_1: B, C_1: C} {
- attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpf7dqtn78/input0.cc'\nsource_filename = \"/tmp/tmpf7dqtn78/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = allo [...]
+ attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpbt8ked8y/input0.cc'\nsource_filename = \"/tmp/tmpbt8ked8y/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = allo [...]
for (i, 0, 1024) {
for (j.outer: int32, 0, 32) {
@tir.call_extern("gemv_update", @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_2, ((i*512) + (j.outer*16)), 16, 2, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_2, (i*64), 64, 1, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_2, (j.outer*1024), 1024, 1, dtype=handle), 16, 64, 64, dtype=int32)
diff --git a/docs/install/nnpack.html b/docs/install/nnpack.html
index 1ef28de467..23d2181e9d 100644
--- a/docs/install/nnpack.html
+++ b/docs/install/nnpack.html
@@ -229,7 +229,17 @@
<p class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
<ul class="current">
<li class="toctree-l1 current"><a class="reference internal" href="index.html">Installing TVM</a><ul class="current">
-<li class="toctree-l2"><a class="reference internal" href="from_source.html">Install from Source</a></li>
+<li class="toctree-l2 current"><a class="reference internal" href="from_source.html">Install from Source</a><ul class="current">
+<li class="toctree-l3"><a class="reference internal" href="from_source.html#developers-get-source-from-github">Developers: Get Source from Github</a></li>
+<li class="toctree-l3"><a class="reference internal" href="from_source.html#build-the-shared-library">Build the Shared Library</a></li>
+<li class="toctree-l3"><a class="reference internal" href="from_source.html#python-package-installation">Python Package Installation</a></li>
+<li class="toctree-l3 current"><a class="reference internal" href="from_source.html#install-contrib-libraries">Install Contrib Libraries</a><ul class="current">
+<li class="toctree-l4 current"><a class="current reference internal" href="#">NNPACK Contrib Installation</a></li>
+</ul>
+</li>
+<li class="toctree-l3"><a class="reference internal" href="from_source.html#enable-c-tests">Enable C++ Tests</a></li>
+</ul>
+</li>
<li class="toctree-l2"><a class="reference internal" href="docker.html">Docker Images</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">NNPACK Contrib Installation</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#conditions">Conditions</a></li>
diff --git a/docs/objects.inv b/docs/objects.inv
index d9c7eed1b0..57d123a748 100644
Binary files a/docs/objects.inv and b/docs/objects.inv differ
diff --git a/docs/reference/api/doxygen/namespacemembers_c.html b/docs/reference/api/doxygen/namespacemembers_c.html
index 9d5612a458..cd8a7af130 100644
--- a/docs/reference/api/doxygen/namespacemembers_c.html
+++ b/docs/reference/api/doxygen/namespacemembers_c.html
@@ -61,6 +61,9 @@ $(function() {
<div class="textblock">Here is a list of all namespace members with links to the namespace documentation for each member:</div>
<h3><a id="index_c"></a>- c -</h3><ul>
+<li>CalculateAllocatedBytes()
+: <a class="el" href="namespacetvm_1_1tir.html#aefb6b3b92f913b5f057ffd14db3a1870">tvm::tir</a>
+</li>
<li>CalculateConstantBytes()
: <a class="el" href="namespacetvm_1_1tir.html#a7314714e14035c9b1096737920c689ee">tvm::tir</a>
</li>
@@ -68,7 +71,7 @@ $(function() {
: <a class="el" href="namespacetvm_1_1tir.html#accfaeb3b1ce8601eab8d9b575d66f025">tvm::tir</a>
</li>
<li>CalculateExtentsSize()
-: <a class="el" href="namespacetvm_1_1tir_1_1usmp.html#a1529c901d8116a3ff0331a38b8e0e076">tvm::tir::usmp</a>
+: <a class="el" href="namespacetvm_1_1tir_1_1usmp.html#ad2424e3662cdcad9a18b496ba42ca10d">tvm::tir::usmp</a>
</li>
<li>CalculateModuleWorkspaceSize()
: <a class="el" href="namespacetvm_1_1tir_1_1usmp.html#a40a26630428319adf281826355d3e56f">tvm::tir::usmp</a>
@@ -172,7 +175,7 @@ $(function() {
: <a class="el" href="namespacetvm_1_1arith.html#a1d555a0cc363f669a49fad7e5f7d69d0">tvm::arith</a>
</li>
<li>compute()
-: <a class="el" href="namespacetvm_1_1te.html#a18b58076bec4041d2dc7614def49af27">tvm::te</a>
+: <a class="el" href="namespacetvm_1_1te.html#a86ab2c98eaf03c2d8286ccf19ab54f81">tvm::te</a>
</li>
<li>compute_scope
: <a class="el" href="namespacetvm_1_1tir_1_1attr.html#a00a6b89838348f152d844cead81b5016">tvm::tir::attr</a>
diff --git a/docs/reference/api/doxygen/namespacemembers_func_c.html b/docs/reference/api/doxygen/namespacemembers_func_c.html
index 45a1d9f8da..75a4516f42 100644
--- a/docs/reference/api/doxygen/namespacemembers_func_c.html
+++ b/docs/reference/api/doxygen/namespacemembers_func_c.html
@@ -61,6 +61,9 @@ $(function() {
 
<h3><a id="index_c"></a>- c -</h3><ul>
+<li>CalculateAllocatedBytes()
+: <a class="el" href="namespacetvm_1_1tir.html#aefb6b3b92f913b5f057ffd14db3a1870">tvm::tir</a>
+</li>
<li>CalculateConstantBytes()
: <a class="el" href="namespacetvm_1_1tir.html#a7314714e14035c9b1096737920c689ee">tvm::tir</a>
</li>
@@ -68,7 +71,7 @@ $(function() {
: <a class="el" href="namespacetvm_1_1tir.html#accfaeb3b1ce8601eab8d9b575d66f025">tvm::tir</a>
</li>
<li>CalculateExtentsSize()
-: <a class="el" href="namespacetvm_1_1tir_1_1usmp.html#a1529c901d8116a3ff0331a38b8e0e076">tvm::tir::usmp</a>
+: <a class="el" href="namespacetvm_1_1tir_1_1usmp.html#ad2424e3662cdcad9a18b496ba42ca10d">tvm::tir::usmp</a>
</li>
<li>CalculateModuleWorkspaceSize()
: <a class="el" href="namespacetvm_1_1tir_1_1usmp.html#a40a26630428319adf281826355d3e56f">tvm::tir::usmp</a>
@@ -151,7 +154,7 @@ $(function() {
: <a class="el" href="namespacetvm_1_1tir_1_1transform.html#ac6dbf3a491d01da405c1ce6d5944ee85">tvm::tir::transform</a>
</li>
<li>compute()
-: <a class="el" href="namespacetvm_1_1te.html#a86ab2c98eaf03c2d8286ccf19ab54f81">tvm::te</a>
+: <a class="el" href="namespacetvm_1_1te.html#afe4f57aeb3dd5ae9c0b58135e14d67ca">tvm::te</a>
</li>
<li>Concat()
: <a class="el" href="namespacetvm_1_1runtime.html#a4a8b9d4d20b7993992145f73efa2abf2">tvm::runtime</a>
diff --git a/docs/reference/api/doxygen/namespacemembers_func_v.html b/docs/reference/api/doxygen/namespacemembers_func_v.html
index 87e403a3e0..a170a36835 100644
--- a/docs/reference/api/doxygen/namespacemembers_func_v.html
+++ b/docs/reference/api/doxygen/namespacemembers_func_v.html
@@ -97,6 +97,9 @@ $(function() {
: <a class="el" href="namespacetvm_1_1tir_1_1transform.html#ac51a104ab4d2c60a4f6ed0e827efab18">tvm::tir::transform</a>
, <a class="el" href="namespacetvm_1_1tir.html#a3b38edd60b6ff952cefb74842a8ae826">tvm::tir</a>
</li>
+<li>VerifyVTCMLimit()
+: <a class="el" href="namespacetvm_1_1tir_1_1transform.html#aee836ad2372e1eac4a51f30f3f03f32c">tvm::tir::transform</a>
+</li>
<li>VerifyWellFormed()
: <a class="el" href="namespacetvm_1_1tir.html#aee3d251f82ef3a0f446ea23f8980d84e">tvm::tir</a>
</li>
diff --git a/docs/reference/api/doxygen/namespacemembers_s.html b/docs/reference/api/doxygen/namespacemembers_s.html
index 4e182cd946..f45d371db9 100644
--- a/docs/reference/api/doxygen/namespacemembers_s.html
+++ b/docs/reference/api/doxygen/namespacemembers_s.html
@@ -383,7 +383,7 @@ $(function() {
: <a class="el" href="namespacetvm_1_1tir.html#ae06122cce8e8888b5ed3568e7a4368bc">tvm::tir</a>
</li>
<li>subtract()
-: <a class="el" href="namespacetvm_1_1topi.html#a1447c5af8653fa4fcbe69ee287a0a8fa">tvm::topi</a>
+: <a class="el" href="namespacetvm_1_1topi.html#a3030be37f9db43ef90a5b2cc0997acd6">tvm::topi</a>
</li>
<li>sum()
: <a class="el" href="namespacetvm.html#afdad0c0329bd39949ba8d296cfb85d76">tvm</a>
diff --git a/docs/reference/api/doxygen/namespacemembers_v.html b/docs/reference/api/doxygen/namespacemembers_v.html
index 10630c952d..6092ac6662 100644
--- a/docs/reference/api/doxygen/namespacemembers_v.html
+++ b/docs/reference/api/doxygen/namespacemembers_v.html
@@ -97,6 +97,9 @@ $(function() {
: <a class="el" href="namespacetvm_1_1tir_1_1transform.html#ac51a104ab4d2c60a4f6ed0e827efab18">tvm::tir::transform</a>
, <a class="el" href="namespacetvm_1_1tir.html#a3b38edd60b6ff952cefb74842a8ae826">tvm::tir</a>
</li>
+<li>VerifyVTCMLimit()
+: <a class="el" href="namespacetvm_1_1tir_1_1transform.html#aee836ad2372e1eac4a51f30f3f03f32c">tvm::tir::transform</a>
+</li>
<li>VerifyWellFormed()
: <a class="el" href="namespacetvm_1_1tir.html#aee3d251f82ef3a0f446ea23f8980d84e">tvm::tir</a>
</li>
diff --git a/docs/reference/api/doxygen/namespacetvm_1_1tir.html b/docs/reference/api/doxygen/namespacetvm_1_1tir.html
index 8796996ed9..deb54610f7 100644
--- a/docs/reference/api/doxygen/namespacetvm_1_1tir.html
+++ b/docs/reference/api/doxygen/namespacetvm_1_1tir.html
@@ -757,6 +757,9 @@ Functions</h2></td></tr>
<tr class="memitem:a24f9f5bc52105a2a38a7a97390c55f18"><td class="memItemLeft" align="right" valign="top">size_t </td><td class="memItemRight" valign="bottom"><a class="el" href="namespacetvm_1_1tir.html#a24f9f5bc52105a2a38a7a97390c55f18">CalculateWorkspaceBytes</a> (const <a class="el" href="classtvm_1_1tir_1_1PrimFunc.html">PrimFunc</a> &func, const <a class="el" href="classtvm_1_1Integer.html">Integer</a> &workspace_byte_alignment)</td></tr>
<tr class="memdesc:a24f9f5bc52105a2a38a7a97390c55f18"><td class="mdescLeft"> </td><td class="mdescRight">Calculate the workspace size in bytes needed by the TIR allocates inside the TIR <a class="el" href="classtvm_1_1tir_1_1PrimFunc.html" title="Managed reference to PrimFuncNode. ">PrimFunc</a>. <a href="#a24f9f5bc52105a2a38a7a97390c55f18">More...</a><br /></td></tr>
<tr class="separator:a24f9f5bc52105a2a38a7a97390c55f18"><td class="memSeparator" colspan="2"> </td></tr>
+<tr class="memitem:aefb6b3b92f913b5f057ffd14db3a1870"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classtvm_1_1runtime_1_1Map.html">tvm::Map</a>< <a class="el" href="classtvm_1_1runtime_1_1String.html">String</a>, <a class="el" href="classtvm_1_1Integer.html">Integer</a> > </td><td class="memItemRight" valign="bottom"><a class="el" href="namespacetvm_1_1tir.html#aefb6b3b92f913b5f057ffd14db3a1870">CalculateAllocatedBytes</a> (const <a class="el" href=" [...]
+<tr class="memdesc:aefb6b3b92f913b5f057ffd14db3a1870"><td class="mdescLeft"> </td><td class="mdescRight">Calculate the allocated memory per scope in bytes needed inside the TIR <a class="el" href="classtvm_1_1tir_1_1PrimFunc.html" title="Managed reference to PrimFuncNode. ">PrimFunc</a>. <a href="#aefb6b3b92f913b5f057ffd14db3a1870">More...</a><br /></td></tr>
+<tr class="separator:aefb6b3b92f913b5f057ffd14db3a1870"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:abbd3ced524b506f532aa1d8ae36dadf3"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classtvm_1_1runtime_1_1Map.html">Map</a>< <a class="el" href="classtvm_1_1tir_1_1Buffer.html">Buffer</a>, <a class="el" href="classtvm_1_1runtime_1_1Optional.html">Optional</a>< <a class="el" href="classtvm_1_1tir_1_1Stmt.html">Stmt</a> > > </td><td class="memItemRight" valign="bottom"><a class="el" href="namespacetvm_1_1tir.html#abbd3ced524b506f53 [...]
<tr class="memdesc:abbd3ced524b506f532aa1d8ae36dadf3"><td class="mdescLeft"> </td><td class="mdescRight">Detect the lowest common ancestor(LCA) of buffer access, including both high-level access(BufferLoad, BufferStore) and low-level access(Load, Store and opaque access). The LCA may be a <a class="el" href="classtvm_1_1tir_1_1For.html" title="Managed reference to ForNode. ">For</a> loop or a <a class="el" href="classtvm_1_1tir_1_1Block.html" title="Managed reference to BlockNode. " [...]
<tr class="separator:abbd3ced524b506f532aa1d8ae36dadf3"><td class="memSeparator" colspan="2"> </td></tr>
@@ -1563,6 +1566,32 @@ template<typename K , typename V > </div>
</dd>
</dl>
+</div>
+</div>
+<a id="aefb6b3b92f913b5f057ffd14db3a1870"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aefb6b3b92f913b5f057ffd14db3a1870">◆ </a></span>CalculateAllocatedBytes()</h2>
+
+<div class="memitem">
+<div class="memproto">
+ <table class="memname">
+ <tr>
+ <td class="memname"><a class="el" href="classtvm_1_1runtime_1_1Map.html">tvm::Map</a><<a class="el" href="classtvm_1_1runtime_1_1String.html">String</a>, <a class="el" href="classtvm_1_1Integer.html">Integer</a>> tvm::tir::CalculateAllocatedBytes </td>
+ <td>(</td>
+ <td class="paramtype">const <a class="el" href="classtvm_1_1tir_1_1PrimFunc.html">PrimFunc</a> & </td>
+ <td class="paramname"><em>func</em></td><td>)</td>
+ <td></td>
+ </tr>
+ </table>
+</div><div class="memdoc">
+
+<p>Calculate the allocated memory per scope in bytes needed inside the TIR <a class="el" href="classtvm_1_1tir_1_1PrimFunc.html" title="Managed reference to PrimFuncNode. ">PrimFunc</a>. </p>
+<dl class="params"><dt>Parameters</dt><dd>
+ <table class="params">
+ <tr><td class="paramname">func</td><td>The TIR <a class="el" href="classtvm_1_1tir_1_1PrimFunc.html" title="Managed reference to PrimFuncNode. ">PrimFunc</a> for which the allocated memory size is to be calculated </td></tr>
+ </table>
+ </dd>
+</dl>
+
</div>
</div>
<a id="a7314714e14035c9b1096737920c689ee"></a>
diff --git a/docs/reference/api/doxygen/namespacetvm_1_1tir_1_1transform.html b/docs/reference/api/doxygen/namespacetvm_1_1tir_1_1transform.html
index 62ba0987fa..fcc6b811e0 100644
--- a/docs/reference/api/doxygen/namespacetvm_1_1tir_1_1transform.html
+++ b/docs/reference/api/doxygen/namespacetvm_1_1tir_1_1transform.html
@@ -80,6 +80,9 @@ Functions</h2></td></tr>
<tr class="memitem:a70a059926c2ea81dcf437eff35f05e3e"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classtvm_1_1transform_1_1Pass.html">Pass</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="namespacetvm_1_1tir_1_1transform.html#a70a059926c2ea81dcf437eff35f05e3e">VerifyGPUCode</a> (<a class="el" href="classtvm_1_1runtime_1_1Map.html">Map</a>< <a class="el" href="classtvm_1_1runtime_1_1String.html">String</a>, <a class="el" href="classt [...]
<tr class="memdesc:a70a059926c2ea81dcf437eff35f05e3e"><td class="mdescLeft"> </td><td class="mdescRight">Pass variant of VerifyGPUCode. <a href="#a70a059926c2ea81dcf437eff35f05e3e">More...</a><br /></td></tr>
<tr class="separator:a70a059926c2ea81dcf437eff35f05e3e"><td class="memSeparator" colspan="2"> </td></tr>
+<tr class="memitem:aee836ad2372e1eac4a51f30f3f03f32c"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classtvm_1_1transform_1_1Pass.html">Pass</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="namespacetvm_1_1tir_1_1transform.html#aee836ad2372e1eac4a51f30f3f03f32c">VerifyVTCMLimit</a> (const <a class="el" href="classtvm_1_1Integer.html">Integer</a> &limit)</td></tr>
+<tr class="memdesc:aee836ad2372e1eac4a51f30f3f03f32c"><td class="mdescLeft"> </td><td class="mdescRight">Pass to check if the size of the allocated vtcm memory satisfies the limit. <a href="#aee836ad2372e1eac4a51f30f3f03f32c">More...</a><br /></td></tr>
+<tr class="separator:aee836ad2372e1eac4a51f30f3f03f32c"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:aea27d24b6e7852652d258268d8537b66"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classtvm_1_1transform_1_1Pass.html">Pass</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="namespacetvm_1_1tir_1_1transform.html#aea27d24b6e7852652d258268d8537b66">OOBChecker</a> ()</td></tr>
<tr class="memdesc:aea27d24b6e7852652d258268d8537b66"><td class="mdescLeft"> </td><td class="mdescRight">Statically check TIR code for out of bounds array access. <a href="#aea27d24b6e7852652d258268d8537b66">More...</a><br /></td></tr>
<tr class="separator:aea27d24b6e7852652d258268d8537b66"><td class="memSeparator" colspan="2"> </td></tr>
@@ -1769,6 +1772,34 @@ Functions</h2></td></tr>
<dl class="section return"><dt>Returns</dt><dd>The pass. </dd></dl>
<dl class="section see"><dt>See also</dt><dd><a class="el" href="namespacetvm_1_1tir.html#a3b38edd60b6ff952cefb74842a8ae826" title="Verifies whether the IR stmt or Expr is in SSA form. That is: each Var is defined and assigned once(i...">tvm::tir::VerifySSA</a> </dd></dl>
+</div>
+</div>
+<a id="aee836ad2372e1eac4a51f30f3f03f32c"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aee836ad2372e1eac4a51f30f3f03f32c">◆ </a></span>VerifyVTCMLimit()</h2>
+
+<div class="memitem">
+<div class="memproto">
+ <table class="memname">
+ <tr>
+ <td class="memname"><a class="el" href="classtvm_1_1transform_1_1Pass.html">Pass</a> tvm::tir::transform::VerifyVTCMLimit </td>
+ <td>(</td>
+ <td class="paramtype">const <a class="el" href="classtvm_1_1Integer.html">Integer</a> & </td>
+ <td class="paramname"><em>limit</em></td><td>)</td>
+ <td></td>
+ </tr>
+ </table>
+</div><div class="memdoc">
+
+<p>Pass to check if the size of the allocated vtcm memory satisfies the limit. </p>
+<dl class="params"><dt>Parameters</dt><dd>
+ <table class="params">
+ <tr><td class="paramname">limit</td><td>The limit to check.</td></tr>
+ </table>
+ </dd>
+</dl>
+<dl class="section return"><dt>Returns</dt><dd>The pass. </dd></dl>
+<dl class="section see"><dt>See also</dt><dd><a class="el" href="namespacetvm_1_1tir.html#aefb6b3b92f913b5f057ffd14db3a1870" title="Calculate the allocated memory per scope in bytes needed inside the TIR PrimFunc. ...">tvm::tir::CalculateAllocatedBytes</a> </dd></dl>
+
</div>
</div>
</div><!-- contents -->
diff --git a/docs/reference/api/doxygen/search/all_14.js b/docs/reference/api/doxygen/search/all_14.js
index e2ecf350cb..2823ef9a80 100644
--- a/docs/reference/api/doxygen/search/all_14.js
+++ b/docs/reference/api/doxygen/search/all_14.js
@@ -179,7 +179,7 @@ var searchData=
['setvalue_3c_20uint64_5ft_20_3e',['SetValue< uint64_t >',['../namespacetvm_1_1detail.html#acb3382242cbf538f64edae13e4ec5a84',1,'tvm::detail']]],
['shallowcopy',['ShallowCopy',['../classtvm_1_1IRModuleNode.html#a86bbdc4b857ce5958a2b5f29e1d6fcb6',1,'tvm::IRModuleNode']]],
['shallowcopyirmodule',['ShallowCopyIRModule',['../classtvm_1_1IRModule.html#aea8b821cf92cf525bd87bf15f5d31889',1,'tvm::IRModule']]],
- ['shape',['Shape',['../classtvm_1_1runtime_1_1NDArray.html#ad273c7bc59b73fb026fd64fc764cbebc',1,'tvm::runtime::NDArray::Shape()'],['../classtvm_1_1TensorTypeNode.html#a98fa347833e4504dd6f8056d9863a708',1,'tvm::TensorTypeNode::shape()'],['../classtvm_1_1meta__schedule_1_1TensorInfoNode.html#ac16d3b10f7c68eefb27e55e865bb304c',1,'tvm::meta_schedule::TensorInfoNode::shape()'],['../structtvm_1_1relay_1_1InitOpAttrs.html#aaaec76cc5ea9a543c4ea174a6b38bf5e',1,'tvm::relay::InitOpAttrs::shape()' [...]
+ ['shape',['Shape',['../classtvm_1_1runtime_1_1NDArray.html#ad273c7bc59b73fb026fd64fc764cbebc',1,'tvm::runtime::NDArray::Shape()'],['../classtvm_1_1TensorTypeNode.html#a98fa347833e4504dd6f8056d9863a708',1,'tvm::TensorTypeNode::shape()'],['../classtvm_1_1meta__schedule_1_1TensorInfoNode.html#ac16d3b10f7c68eefb27e55e865bb304c',1,'tvm::meta_schedule::TensorInfoNode::shape()'],['../structtvm_1_1relay_1_1InitOpAttrs.html#aaaec76cc5ea9a543c4ea174a6b38bf5e',1,'tvm::relay::InitOpAttrs::shape()' [...]
['shape_5f',['shape_',['../classtvm_1_1runtime_1_1NDArray_1_1ContainerBase.html#aa5597a1760c9f8c9d1fd51584b1283fb',1,'tvm::runtime::NDArray::ContainerBase']]],
['shape_5fbackward_5frule',['shape_backward_rule',['../classtvm_1_1tir_1_1BijectiveLayoutNode.html#a0befdd0a2371c0d12970e8ac6623b59b',1,'tvm::tir::BijectiveLayoutNode']]],
['shape_5fcount',['shape_count',['../structTVMGraphExecutorGraphAttr.html#a182b228582f1186f2a15de50a25b3375',1,'TVMGraphExecutorGraphAttr']]],
diff --git a/docs/reference/api/doxygen/search/all_17.js b/docs/reference/api/doxygen/search/all_17.js
index 7c48d4421a..15144945c8 100644
--- a/docs/reference/api/doxygen/search/all_17.js
+++ b/docs/reference/api/doxygen/search/all_17.js
@@ -50,6 +50,7 @@ var searchData=
['verifygpucode',['VerifyGPUCode',['../classtvm_1_1meta__schedule_1_1Postproc.html#a7106b1742068c45966d6be5f4b8394aa',1,'tvm::meta_schedule::Postproc::VerifyGPUCode()'],['../namespacetvm_1_1tir.html#a53dfcb6ef7e178a83fda0bbb5dddcb39',1,'tvm::tir::VerifyGPUCode()'],['../namespacetvm_1_1tir_1_1transform.html#a70a059926c2ea81dcf437eff35f05e3e',1,'tvm::tir::transform::VerifyGPUCode()']]],
['verifymemory',['VerifyMemory',['../namespacetvm_1_1tir.html#ac69bcf127843e5e671379e44ab47ca27',1,'tvm::tir::VerifyMemory()'],['../namespacetvm_1_1tir_1_1transform.html#a32d0b0ed966cf019d5e607bc90f284af',1,'tvm::tir::transform::VerifyMemory()']]],
['verifyssa',['VerifySSA',['../namespacetvm_1_1tir.html#a3b38edd60b6ff952cefb74842a8ae826',1,'tvm::tir::VerifySSA()'],['../namespacetvm_1_1tir_1_1transform.html#ac51a104ab4d2c60a4f6ed0e827efab18',1,'tvm::tir::transform::VerifySSA()']]],
+ ['verifyvtcmlimit',['VerifyVTCMLimit',['../namespacetvm_1_1tir_1_1transform.html#aee836ad2372e1eac4a51f30f3f03f32c',1,'tvm::tir::transform']]],
['verifywellformed',['VerifyWellFormed',['../namespacetvm_1_1tir.html#aee3d251f82ef3a0f446ea23f8980d84e',1,'tvm::tir']]],
['version',['version',['../structTVMMetadata.html#ade3312efd4c0e5beaf390959621b2f52',1,'TVMMetadata::version()'],['../classtvm_1_1runtime_1_1metadata_1_1MetadataNode.html#a894ca3986db9874b364ea83982e54f21',1,'tvm::runtime::metadata::MetadataNode::version()']]],
['vid',['vid',['../classtvm_1_1relay_1_1VarNode.html#af191a8eedf3575d626771c9c820ddb63',1,'tvm::relay::VarNode']]],
diff --git a/docs/reference/api/doxygen/search/all_18.js b/docs/reference/api/doxygen/search/all_18.js
index 98227ddaec..9b5a247be8 100644
--- a/docs/reference/api/doxygen/search/all_18.js
+++ b/docs/reference/api/doxygen/search/all_18.js
@@ -7,7 +7,7 @@ var searchData=
['weight_5fbits',['weight_bits',['../structtvm_1_1relay_1_1BinaryConv2DAttrs.html#a70aa926aba4fc774c15786358315141a',1,'tvm::relay::BinaryConv2DAttrs::weight_bits()'],['../structtvm_1_1relay_1_1BinaryDenseAttrs.html#a6647c2d9d1d3108c6f552ff4271f2625',1,'tvm::relay::BinaryDenseAttrs::weight_bits()']]],
['weight_5flayout',['weight_layout',['../structtvm_1_1relay_1_1DensePackAttrs.html#a7f12601cad15b4a65de4ce1bc4dd929c',1,'tvm::relay::DensePackAttrs']]],
['wellformed',['WellFormed',['../namespacetvm_1_1relay.html#a4e4cdd4e3db74bf68b315d5730890ca8',1,'tvm::relay']]],
- ['where',['where',['../namespacetvm_1_1topi.html#af011847b6e7f72f1bec25eee05c80590',1,'tvm::topi::where()'],['../namespacetvm_1_1script_1_1ir__builder_1_1tir.html#a454a28cc9ed56389b7b09b5b45a3097e',1,'tvm::script::ir_builder::tir::Where()']]],
+ ['where',['Where',['../namespacetvm_1_1script_1_1ir__builder_1_1tir.html#a454a28cc9ed56389b7b09b5b45a3097e',1,'tvm::script::ir_builder::tir::Where()'],['../namespacetvm_1_1topi.html#af011847b6e7f72f1bec25eee05c80590',1,'tvm::topi::where()']]],
['while',['While',['../classtvm_1_1tir_1_1While.html',1,'tvm::tir::While'],['../classtvm_1_1tir_1_1While.html#a5e1bf25f5caab0218e2224f19140351b',1,'tvm::tir::While::While()'],['../namespacetvm_1_1script_1_1ir__builder_1_1tir.html#a6df09cb19f1e26c3fa0e1e0251c3c0bb',1,'tvm::script::ir_builder::tir::While()']]],
['whiledoc',['WhileDoc',['../classtvm_1_1script_1_1printer_1_1WhileDoc.html',1,'tvm::script::printer::WhileDoc'],['../classtvm_1_1script_1_1printer_1_1WhileDoc.html#ac7a684d98115b7cda3ae6f7ab4df01a0',1,'tvm::script::printer::WhileDoc::WhileDoc()']]],
['whiledocnode',['WhileDocNode',['../classtvm_1_1script_1_1printer_1_1WhileDocNode.html',1,'tvm::script::printer']]],
diff --git a/docs/reference/api/doxygen/search/all_4.js b/docs/reference/api/doxygen/search/all_4.js
index 7e4c3501af..d0186a90c6 100644
--- a/docs/reference/api/doxygen/search/all_4.js
+++ b/docs/reference/api/doxygen/search/all_4.js
@@ -14,6 +14,7 @@ var searchData=
['cachewrite',['CacheWrite',['../classtvm_1_1tir_1_1ScheduleNode.html#a22ce23b6475acf7ce2fe9c1ab5292568',1,'tvm::tir::ScheduleNode']]],
['cachewritestep',['CacheWriteStep',['../classtvm_1_1auto__scheduler_1_1CacheWriteStep.html',1,'tvm::auto_scheduler::CacheWriteStep'],['../classtvm_1_1auto__scheduler_1_1CacheWriteStep.html#ad2c24762f35f7f9ebe85d7c03cba1c8e',1,'tvm::auto_scheduler::CacheWriteStep::CacheWriteStep(int stage_id, String scope_name)'],['../classtvm_1_1auto__scheduler_1_1CacheWriteStep.html#a371ff5005c7312db88088f35f11dabcb',1,'tvm::auto_scheduler::CacheWriteStep::CacheWriteStep(dmlc::JSONReader *reader)']]],
['cachewritestepnode',['CacheWriteStepNode',['../classtvm_1_1auto__scheduler_1_1CacheWriteStepNode.html',1,'tvm::auto_scheduler']]],
+ ['calculateallocatedbytes',['CalculateAllocatedBytes',['../namespacetvm_1_1tir.html#aefb6b3b92f913b5f057ffd14db3a1870',1,'tvm::tir']]],
['calculateconstantbytes',['CalculateConstantBytes',['../namespacetvm_1_1tir.html#a7314714e14035c9b1096737920c689ee',1,'tvm::tir']]],
['calculateexprcomplexity',['CalculateExprComplexity',['../namespacetvm_1_1tir.html#accfaeb3b1ce8601eab8d9b575d66f025',1,'tvm::tir']]],
['calculateextentssize',['CalculateExtentsSize',['../namespacetvm_1_1tir_1_1usmp.html#ad2424e3662cdcad9a18b496ba42ca10d',1,'tvm::tir::usmp::CalculateExtentsSize(const AllocateNode *op)'],['../namespacetvm_1_1tir_1_1usmp.html#a1529c901d8116a3ff0331a38b8e0e076',1,'tvm::tir::usmp::CalculateExtentsSize(const AllocateConstNode *op)']]],
diff --git a/docs/reference/api/doxygen/search/functions_16.js b/docs/reference/api/doxygen/search/functions_16.js
index 2267243551..265fa5434c 100644
--- a/docs/reference/api/doxygen/search/functions_16.js
+++ b/docs/reference/api/doxygen/search/functions_16.js
@@ -19,6 +19,7 @@ var searchData=
['verifygpucode',['VerifyGPUCode',['../classtvm_1_1meta__schedule_1_1Postproc.html#a7106b1742068c45966d6be5f4b8394aa',1,'tvm::meta_schedule::Postproc::VerifyGPUCode()'],['../namespacetvm_1_1tir.html#a53dfcb6ef7e178a83fda0bbb5dddcb39',1,'tvm::tir::VerifyGPUCode()'],['../namespacetvm_1_1tir_1_1transform.html#a70a059926c2ea81dcf437eff35f05e3e',1,'tvm::tir::transform::VerifyGPUCode()']]],
['verifymemory',['VerifyMemory',['../namespacetvm_1_1tir.html#ac69bcf127843e5e671379e44ab47ca27',1,'tvm::tir::VerifyMemory()'],['../namespacetvm_1_1tir_1_1transform.html#a32d0b0ed966cf019d5e607bc90f284af',1,'tvm::tir::transform::VerifyMemory()']]],
['verifyssa',['VerifySSA',['../namespacetvm_1_1tir.html#a3b38edd60b6ff952cefb74842a8ae826',1,'tvm::tir::VerifySSA()'],['../namespacetvm_1_1tir_1_1transform.html#ac51a104ab4d2c60a4f6ed0e827efab18',1,'tvm::tir::transform::VerifySSA()']]],
+ ['verifyvtcmlimit',['VerifyVTCMLimit',['../namespacetvm_1_1tir_1_1transform.html#aee836ad2372e1eac4a51f30f3f03f32c',1,'tvm::tir::transform']]],
['verifywellformed',['VerifyWellFormed',['../namespacetvm_1_1tir.html#aee3d251f82ef3a0f446ea23f8980d84e',1,'tvm::tir']]],
['version',['version',['../classtvm_1_1runtime_1_1metadata_1_1MetadataNode.html#a894ca3986db9874b364ea83982e54f21',1,'tvm::runtime::metadata::MetadataNode']]],
['virtual_5fdevice',['virtual_device',['../classtvm_1_1RelayExprNode.html#ac4e639dd9f33f304800851364f471eb1',1,'tvm::RelayExprNode']]],
diff --git a/docs/reference/api/doxygen/search/functions_17.js b/docs/reference/api/doxygen/search/functions_17.js
index a95e301536..f6a4870ba3 100644
--- a/docs/reference/api/doxygen/search/functions_17.js
+++ b/docs/reference/api/doxygen/search/functions_17.js
@@ -2,7 +2,7 @@ var searchData=
[
['warning',['Warning',['../classtvm_1_1Diagnostic.html#a407ef56844eec306451c1ce2ca9f248c',1,'tvm::Diagnostic']]],
['wellformed',['WellFormed',['../namespacetvm_1_1relay.html#a4e4cdd4e3db74bf68b315d5730890ca8',1,'tvm::relay']]],
- ['where',['where',['../namespacetvm_1_1topi.html#af011847b6e7f72f1bec25eee05c80590',1,'tvm::topi::where()'],['../namespacetvm_1_1script_1_1ir__builder_1_1tir.html#a454a28cc9ed56389b7b09b5b45a3097e',1,'tvm::script::ir_builder::tir::Where()']]],
+ ['where',['Where',['../namespacetvm_1_1script_1_1ir__builder_1_1tir.html#a454a28cc9ed56389b7b09b5b45a3097e',1,'tvm::script::ir_builder::tir::Where()'],['../namespacetvm_1_1topi.html#af011847b6e7f72f1bec25eee05c80590',1,'tvm::topi::where()']]],
['while',['While',['../classtvm_1_1tir_1_1While.html#a5e1bf25f5caab0218e2224f19140351b',1,'tvm::tir::While::While()'],['../namespacetvm_1_1script_1_1ir__builder_1_1tir.html#a6df09cb19f1e26c3fa0e1e0251c3c0bb',1,'tvm::script::ir_builder::tir::While()']]],
['whiledoc',['WhileDoc',['../classtvm_1_1script_1_1printer_1_1WhileDoc.html#ac7a684d98115b7cda3ae6f7ab4df01a0',1,'tvm::script::printer::WhileDoc']]],
['with',['With',['../classtvm_1_1With.html#a19fcda1557550b2a5f2e942f08bd38f2',1,'tvm::With::With(Args &&... args)'],['../classtvm_1_1With.html#a9704ce4379a8f1475670abd6f937f24c',1,'tvm::With::With(const With &other)=delete'],['../classtvm_1_1With.html#a30223d74db8edd8200bc5586b5d4ca2f',1,'tvm::With::With(With &&other)=delete']]],
diff --git a/docs/reference/api/doxygen/search/functions_3.js b/docs/reference/api/doxygen/search/functions_3.js
index 621b3a80c0..a7830ec74e 100644
--- a/docs/reference/api/doxygen/search/functions_3.js
+++ b/docs/reference/api/doxygen/search/functions_3.js
@@ -9,6 +9,7 @@ var searchData=
['cachereadstep',['CacheReadStep',['../classtvm_1_1auto__scheduler_1_1CacheReadStep.html#a5e0dd0b6d5f746e96e4fec058edc98dc',1,'tvm::auto_scheduler::CacheReadStep::CacheReadStep(int stage_id, String scope_name, const Array< Integer > &reader_stage_ids)'],['../classtvm_1_1auto__scheduler_1_1CacheReadStep.html#a6bc3dd6558c83d1d74cc7163cccbea7f',1,'tvm::auto_scheduler::CacheReadStep::CacheReadStep(dmlc::JSONReader *reader)']]],
['cachewrite',['CacheWrite',['../classtvm_1_1tir_1_1ScheduleNode.html#a22ce23b6475acf7ce2fe9c1ab5292568',1,'tvm::tir::ScheduleNode']]],
['cachewritestep',['CacheWriteStep',['../classtvm_1_1auto__scheduler_1_1CacheWriteStep.html#ad2c24762f35f7f9ebe85d7c03cba1c8e',1,'tvm::auto_scheduler::CacheWriteStep::CacheWriteStep(int stage_id, String scope_name)'],['../classtvm_1_1auto__scheduler_1_1CacheWriteStep.html#a371ff5005c7312db88088f35f11dabcb',1,'tvm::auto_scheduler::CacheWriteStep::CacheWriteStep(dmlc::JSONReader *reader)']]],
+ ['calculateallocatedbytes',['CalculateAllocatedBytes',['../namespacetvm_1_1tir.html#aefb6b3b92f913b5f057ffd14db3a1870',1,'tvm::tir']]],
['calculateconstantbytes',['CalculateConstantBytes',['../namespacetvm_1_1tir.html#a7314714e14035c9b1096737920c689ee',1,'tvm::tir']]],
['calculateexprcomplexity',['CalculateExprComplexity',['../namespacetvm_1_1tir.html#accfaeb3b1ce8601eab8d9b575d66f025',1,'tvm::tir']]],
['calculateextentssize',['CalculateExtentsSize',['../namespacetvm_1_1tir_1_1usmp.html#ad2424e3662cdcad9a18b496ba42ca10d',1,'tvm::tir::usmp::CalculateExtentsSize(const AllocateNode *op)'],['../namespacetvm_1_1tir_1_1usmp.html#a1529c901d8116a3ff0331a38b8e0e076',1,'tvm::tir::usmp::CalculateExtentsSize(const AllocateConstNode *op)']]],
diff --git a/docs/reference/api/doxygen/tir_2analysis_8h.html b/docs/reference/api/doxygen/tir_2analysis_8h.html
index 692c6a7bbc..a64bcf0d00 100644
--- a/docs/reference/api/doxygen/tir_2analysis_8h.html
+++ b/docs/reference/api/doxygen/tir_2analysis_8h.html
@@ -162,6 +162,9 @@ Functions</h2></td></tr>
<tr class="memitem:a24f9f5bc52105a2a38a7a97390c55f18"><td class="memItemLeft" align="right" valign="top">size_t </td><td class="memItemRight" valign="bottom"><a class="el" href="namespacetvm_1_1tir.html#a24f9f5bc52105a2a38a7a97390c55f18">tvm::tir::CalculateWorkspaceBytes</a> (const PrimFunc &func, const Integer &workspace_byte_alignment)</td></tr>
<tr class="memdesc:a24f9f5bc52105a2a38a7a97390c55f18"><td class="mdescLeft"> </td><td class="mdescRight">Calculate the workspace size in bytes needed by the TIR allocates inside the TIR <a class="el" href="classtvm_1_1tir_1_1PrimFunc.html" title="Managed reference to PrimFuncNode. ">PrimFunc</a>. <a href="namespacetvm_1_1tir.html#a24f9f5bc52105a2a38a7a97390c55f18">More...</a><br /></td></tr>
<tr class="separator:a24f9f5bc52105a2a38a7a97390c55f18"><td class="memSeparator" colspan="2"> </td></tr>
+<tr class="memitem:aefb6b3b92f913b5f057ffd14db3a1870"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classtvm_1_1runtime_1_1Map.html">tvm::Map</a>< String, Integer > </td><td class="memItemRight" valign="bottom"><a class="el" href="namespacetvm_1_1tir.html#aefb6b3b92f913b5f057ffd14db3a1870">tvm::tir::CalculateAllocatedBytes</a> (const PrimFunc &func)</td></tr>
+<tr class="memdesc:aefb6b3b92f913b5f057ffd14db3a1870"><td class="mdescLeft"> </td><td class="mdescRight">Calculate the allocated memory per scope in bytes needed inside the TIR <a class="el" href="classtvm_1_1tir_1_1PrimFunc.html" title="Managed reference to PrimFuncNode. ">PrimFunc</a>. <a href="namespacetvm_1_1tir.html#aefb6b3b92f913b5f057ffd14db3a1870">More...</a><br /></td></tr>
+<tr class="separator:aefb6b3b92f913b5f057ffd14db3a1870"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:abbd3ced524b506f532aa1d8ae36dadf3"><td class="memItemLeft" align="right" valign="top">Map< Buffer, Optional< Stmt > > </td><td class="memItemRight" valign="bottom"><a class="el" href="namespacetvm_1_1tir.html#abbd3ced524b506f532aa1d8ae36dadf3">tvm::tir::DetectBufferAccessLCA</a> (const PrimFunc &func)</td></tr>
<tr class="memdesc:abbd3ced524b506f532aa1d8ae36dadf3"><td class="mdescLeft"> </td><td class="mdescRight">Detect the lowest common ancestor(LCA) of buffer access, including both high-level access(BufferLoad, BufferStore) and low-level access(Load, Store and opaque access). The LCA may be a <a class="el" href="classtvm_1_1tir_1_1For.html" title="Managed reference to ForNode. ">For</a> loop or a <a class="el" href="classtvm_1_1tir_1_1Block.html" title="Managed reference to BlockNode. " [...]
<tr class="separator:abbd3ced524b506f532aa1d8ae36dadf3"><td class="memSeparator" colspan="2"> </td></tr>
@@ -183,6 +186,9 @@ Functions</h2></td></tr>
<tr class="memitem:a70a059926c2ea81dcf437eff35f05e3e"><td class="memItemLeft" align="right" valign="top">Pass </td><td class="memItemRight" valign="bottom"><a class="el" href="namespacetvm_1_1tir_1_1transform.html#a70a059926c2ea81dcf437eff35f05e3e">tvm::tir::transform::VerifyGPUCode</a> (Map< String, PrimExpr > constraints)</td></tr>
<tr class="memdesc:a70a059926c2ea81dcf437eff35f05e3e"><td class="mdescLeft"> </td><td class="mdescRight">Pass variant of VerifyGPUCode. <a href="namespacetvm_1_1tir_1_1transform.html#a70a059926c2ea81dcf437eff35f05e3e">More...</a><br /></td></tr>
<tr class="separator:a70a059926c2ea81dcf437eff35f05e3e"><td class="memSeparator" colspan="2"> </td></tr>
+<tr class="memitem:aee836ad2372e1eac4a51f30f3f03f32c"><td class="memItemLeft" align="right" valign="top">Pass </td><td class="memItemRight" valign="bottom"><a class="el" href="namespacetvm_1_1tir_1_1transform.html#aee836ad2372e1eac4a51f30f3f03f32c">tvm::tir::transform::VerifyVTCMLimit</a> (const Integer &limit)</td></tr>
+<tr class="memdesc:aee836ad2372e1eac4a51f30f3f03f32c"><td class="mdescLeft"> </td><td class="mdescRight">Pass to checks if the size of the allocated vtcm memory satisfies the limit. <a href="namespacetvm_1_1tir_1_1transform.html#aee836ad2372e1eac4a51f30f3f03f32c">More...</a><br /></td></tr>
+<tr class="separator:aee836ad2372e1eac4a51f30f3f03f32c"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:aea27d24b6e7852652d258268d8537b66"><td class="memItemLeft" align="right" valign="top">Pass </td><td class="memItemRight" valign="bottom"><a class="el" href="namespacetvm_1_1tir_1_1transform.html#aea27d24b6e7852652d258268d8537b66">tvm::tir::transform::OOBChecker</a> ()</td></tr>
<tr class="memdesc:aea27d24b6e7852652d258268d8537b66"><td class="mdescLeft"> </td><td class="mdescRight">Statically check TIR code for out of bounds array access. <a href="namespacetvm_1_1tir_1_1transform.html#aea27d24b6e7852652d258268d8537b66">More...</a><br /></td></tr>
<tr class="separator:aea27d24b6e7852652d258268d8537b66"><td class="memSeparator" colspan="2"> </td></tr>
diff --git a/docs/reference/api/doxygen/tir_2analysis_8h_source.html b/docs/reference/api/doxygen/tir_2analysis_8h_source.html
index 76e622b5fb..bcd1f5e67a 100644
--- a/docs/reference/api/doxygen/tir_2analysis_8h_source.html
+++ b/docs/reference/api/doxygen/tir_2analysis_8h_source.html
@@ -66,7 +66,7 @@ $(function() {
<div class="title">analysis.h</div> </div>
</div><!--header-->
<div class="contents">
-<a href="tir_2analysis_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a name="l00001"></a><span class="lineno"> 1</span> <span class="comment">/*</span></div><div class="line"><a name="l00002"></a><span class="lineno"> 2</span> <span class="comment"> * Licensed to the Apache Software Foundation (ASF) under one</span></div><div class="line"><a name="l00003"></a><span class="lineno"> 3</span> <span class="comment"> * or mo [...]
+<a href="tir_2analysis_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a name="l00001"></a><span class="lineno"> 1</span> <span class="comment">/*</span></div><div class="line"><a name="l00002"></a><span class="lineno"> 2</span> <span class="comment"> * Licensed to the Apache Software Foundation (ASF) under one</span></div><div class="line"><a name="l00003"></a><span class="lineno"> 3</span> <span class="comment"> * or mo [...]
<div class="ttc" id="namespacetvm_1_1tir_html_abbd3ced524b506f532aa1d8ae36dadf3"><div class="ttname"><a href="namespacetvm_1_1tir.html#abbd3ced524b506f532aa1d8ae36dadf3">tvm::tir::DetectBufferAccessLCA</a></div><div class="ttdeci">Map< Buffer, Optional< Stmt > > DetectBufferAccessLCA(const PrimFunc &func)</div><div class="ttdoc">Detect the lowest common ancestor(LCA) of buffer access, including both high-level access(BufferLoad...</div></div>
<div class="ttc" id="namespacetvm_1_1tir_html_ad22fd3c129c23e8e724a4772084f802c"><div class="ttname"><a href="namespacetvm_1_1tir.html#ad22fd3c129c23e8e724a4772084f802c">tvm::tir::GetBlockReadWriteRegion</a></div><div class="ttdeci">Array< Array< BufferRegion > > GetBlockReadWriteRegion(const Block &block, const Map< Var, Buffer > &buffer_var_map)</div><div class="ttdoc">Auto detect the block read/write region according to its body stmt. An opaque access will be [...]
<div class="ttc" id="namespacetvm_1_1tir_1_1transform_html_aea27d24b6e7852652d258268d8537b66"><div class="ttname"><a href="namespacetvm_1_1tir_1_1transform.html#aea27d24b6e7852652d258268d8537b66">tvm::tir::transform::OOBChecker</a></div><div class="ttdeci">Pass OOBChecker()</div><div class="ttdoc">Statically check TIR code for out of bounds array access. </div></div>
@@ -75,6 +75,7 @@ $(function() {
<div class="ttc" id="namespacetvm_1_1tir_html_aee3d251f82ef3a0f446ea23f8980d84e"><div class="ttname"><a href="namespacetvm_1_1tir.html#aee3d251f82ef3a0f446ea23f8980d84e">tvm::tir::VerifyWellFormed</a></div><div class="ttdeci">bool VerifyWellFormed(const PrimFunc &func, bool assert_mode=true)</div><div class="ttdoc">Verify if the given TIR is well-formed. The verification includes: </div></div>
<div class="ttc" id="tir_2op__attr__types_8h_html"><div class="ttname"><a href="tir_2op__attr__types_8h.html">op_attr_types.h</a></div><div class="ttdoc">Attribute types in the Op registry for TIR ops. </div></div>
<div class="ttc" id="classtvm_1_1tir_1_1VarNode_html"><div class="ttname"><a href="classtvm_1_1tir_1_1VarNode.html">tvm::tir::VarNode</a></div><div class="ttdoc">A variable node in the IR. </div><div class="ttdef"><b>Definition:</b> var.h:47</div></div>
+<div class="ttc" id="namespacetvm_1_1tir_html_aefb6b3b92f913b5f057ffd14db3a1870"><div class="ttname"><a href="namespacetvm_1_1tir.html#aefb6b3b92f913b5f057ffd14db3a1870">tvm::tir::CalculateAllocatedBytes</a></div><div class="ttdeci">tvm::Map< String, Integer > CalculateAllocatedBytes(const PrimFunc &func)</div><div class="ttdoc">Calculate the allocated memory per scope in bytes needed inside the TIR PrimFunc. ...</div></div>
<div class="ttc" id="classtvm_1_1tir_1_1PrimFuncNode_html"><div class="ttname"><a href="classtvm_1_1tir_1_1PrimFuncNode.html">tvm::tir::PrimFuncNode</a></div><div class="ttdoc">Primitive functions that contains TIR statements. </div><div class="ttdef"><b>Definition:</b> function.h:46</div></div>
<div class="ttc" id="tir_2function_8h_html"><div class="ttname"><a href="tir_2function_8h.html">function.h</a></div><div class="ttdoc">TIR Function. </div></div>
<div class="ttc" id="namespacetvm_1_1tir_html_a0242276905dca0e353c6817797d3fa0d"><div class="ttname"><a href="namespacetvm_1_1tir.html#a0242276905dca0e353c6817797d3fa0d">tvm::tir::FindAnchorBlock</a></div><div class="ttdeci">const tir::BlockNode * FindAnchorBlock(const IRModule &mod)</div><div class="ttdoc">Find the "anchor block" of the given module. We define the anchor block to be the block with (1) an i...</div></div>
@@ -84,6 +85,7 @@ $(function() {
<div class="ttc" id="namespacetvm_1_1tir_html_a53dfcb6ef7e178a83fda0bbb5dddcb39"><div class="ttname"><a href="namespacetvm_1_1tir.html#a53dfcb6ef7e178a83fda0bbb5dddcb39">tvm::tir::VerifyGPUCode</a></div><div class="ttdeci">bool VerifyGPUCode(const PrimFunc &func, Map< String, PrimExpr > constraints)</div><div class="ttdoc">Verify the correctness of a GPU code It will check the whether the amount of memory usage or the numb...</div></div>
<div class="ttc" id="namespacetvm_1_1tir_html_a8f4a86b205145696c0555fd02bd37f46"><div class="ttname"><a href="namespacetvm_1_1tir.html#a8f4a86b205145696c0555fd02bd37f46">tvm::tir::CallEffectKind</a></div><div class="ttdeci">CallEffectKind</div><div class="ttdoc">The effect type of the call. </div><div class="ttdef"><b>Definition:</b> op_attr_types.h:62</div></div>
<div class="ttc" id="stmt_8h_html"><div class="ttname"><a href="stmt_8h.html">stmt.h</a></div><div class="ttdoc">TIR statements. </div></div>
+<div class="ttc" id="namespacetvm_1_1tir_1_1transform_html_aee836ad2372e1eac4a51f30f3f03f32c"><div class="ttname"><a href="namespacetvm_1_1tir_1_1transform.html#aee836ad2372e1eac4a51f30f3f03f32c">tvm::tir::transform::VerifyVTCMLimit</a></div><div class="ttdeci">Pass VerifyVTCMLimit(const Integer &limit)</div><div class="ttdoc">Pass to checks if the size of the allocated vtcm memory satisfies the limit. </div></div>
<div class="ttc" id="structtvm_1_1tir_1_1ExprDeepEqual_html"><div class="ttname"><a href="structtvm_1_1tir_1_1ExprDeepEqual.html">tvm::tir::ExprDeepEqual</a></div><div class="ttdoc">Compare two expressions recursively and check if they are equal to each other without var remapping...</div><div class="ttdef"><b>Definition:</b> analysis.h:54</div></div>
<div class="ttc" id="classtvm_1_1IRModuleNode_html_a88423026ad43fa60158bc9a647704d93"><div class="ttname"><a href="classtvm_1_1IRModuleNode.html#a88423026ad43fa60158bc9a647704d93">tvm::IRModuleNode::functions</a></div><div class="ttdeci">Map< GlobalVar, BaseFunc > functions</div><div class="ttdoc">A map from ids to all global functions. </div><div class="ttdef"><b>Definition:</b> module.h:59</div></div>
<div class="ttc" id="tir_2expr_8h_html"><div class="ttname"><a href="tir_2expr_8h.html">expr.h</a></div><div class="ttdoc">TIR expressions. </div></div>
diff --git a/docs/reference/api/python/auto_scheduler.html b/docs/reference/api/python/auto_scheduler.html
index 79e9ff3307..97c10d0d2d 100644
--- a/docs/reference/api/python/auto_scheduler.html
+++ b/docs/reference/api/python/auto_scheduler.html
@@ -1615,7 +1615,7 @@ history states as starting point to perform Evolutionary Search).</p></li>
<dl class="py class">
<dt class="sig sig-object py" id="tvm.auto_scheduler.SketchPolicy">
-<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">tvm.auto_scheduler.</span></span><span class="sig-name descname"><span class="pre">SketchPolicy</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">task</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">program_cost_model</span></span><span class="o"><span class="pre">=</span></span><span class="defau [...]
+<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">tvm.auto_scheduler.</span></span><span class="sig-name descname"><span class="pre">SketchPolicy</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">task</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">program_cost_model</span></span><span class="o"><span class="pre">=</span></span><span class="defau [...]
<dd><p>The search policy that searches in a hierarchical search space defined by sketches.
The policy randomly samples programs from the space defined by sketches and use evolutionary
search to fine-tune them.</p>
@@ -1899,7 +1899,7 @@ Candidates:
<dl class="py function">
<dt class="sig sig-object py" id="tvm.auto_scheduler.auto_schedule">
-<span class="sig-prename descclassname"><span class="pre">tvm.auto_scheduler.</span></span><span class="sig-name descname"><span class="pre">auto_schedule</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">task</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">search_policy</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em clas [...]
+<span class="sig-prename descclassname"><span class="pre">tvm.auto_scheduler.</span></span><span class="sig-name descname"><span class="pre">auto_schedule</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">task</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">search_policy</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em clas [...]
<dd><p>THIS API IS DEPRECATED.</p>
<p>Run auto scheduling search for a task.</p>
<dl class="field-list simple">
diff --git a/docs/reference/api/python/target.html b/docs/reference/api/python/target.html
index 89ce2b1f7a..cdde5daee7 100644
--- a/docs/reference/api/python/target.html
+++ b/docs/reference/api/python/target.html
@@ -901,6 +901,7 @@ will be valid, LLVM will throw an error.</p></li>
<li><p><strong>use_qfloat</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.11)"><em>bool</em></a><em> (</em><em>default: True for cpu_ver >= v68</em><em>, </em><em>False otherwise</em><em>)</em>) – Whether to use QFloat HVX instructions.</p></li>
<li><p><strong>use_ieee_fp</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.11)"><em>bool</em></a><em> (</em><em>default: False</em><em>)</em>) – Whether to use IEEE HVX instructions</p></li>
<li><p><strong>num_cores</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)"><em>int</em></a><em> (</em><em>default: 4</em><em>)</em>) – The number of HVX threads. This attribute is required by meta scheduler.</p></li>
+<li><p><strong>vtcm_capacity</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)"><em>int</em></a><em> (</em><em>default: 0</em><em>)</em>) – Hexagon VTCM capacity limitation. If the value is 0, the capacity is treated as unbounded.</p></li>
<li><p><strong>Note</strong> (<em>Floating point support in HVX requires LLVM 14+.</em>) – </p></li>
</ul>
</dd>
diff --git a/docs/reference/api/python/tir.html b/docs/reference/api/python/tir.html
index 4d85eb2276..f3659d3e9c 100644
--- a/docs/reference/api/python/tir.html
+++ b/docs/reference/api/python/tir.html
@@ -8415,6 +8415,9 @@ a positive bound overlap.</p>
<tr class="row-odd"><td><p><a class="reference internal" href="#tvm.tir.transform.VerifyMemory" title="tvm.tir.transform.VerifyMemory"><code class="xref py py-obj docutils literal notranslate"><span class="pre">VerifyMemory</span></code></a>()</p></td>
<td><p>Verify if func contains illegal host side direct memory access.</p></td>
</tr>
+<tr class="row-even"><td><p><a class="reference internal" href="#tvm.tir.transform.VerifyVTCMLimit" title="tvm.tir.transform.VerifyVTCMLimit"><code class="xref py py-obj docutils literal notranslate"><span class="pre">VerifyVTCMLimit</span></code></a>(limit)</p></td>
+<td><p>Verify if the size of the allocated vtcm memory satisfies the limit.</p></td>
+</tr>
</tbody>
</table>
<p><strong>Classes:</strong></p>
@@ -9715,6 +9718,20 @@ Will lower to scalar loop when it is turned off.</p>
</dl>
</dd></dl>
+<dl class="py function">
+<dt class="sig sig-object py" id="tvm.tir.transform.VerifyVTCMLimit">
+<span class="sig-prename descclassname"><span class="pre">tvm.tir.transform.</span></span><span class="sig-name descname"><span class="pre">VerifyVTCMLimit</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">limit</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)"><span class="pre">int</span></a></ [...]
+<dd><p>Verify if the size of the allocated vtcm memory satisfies the limit.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Returns</dt>
+<dd class="field-odd"><p><strong>fpass</strong> – The result pass</p>
+</dd>
+<dt class="field-even">Return type</dt>
+<dd class="field-even"><p><a class="reference internal" href="ir.html#tvm.transform.Pass" title="tvm.transform.Pass">tvm.transform.Pass</a></p>
+</dd>
+</dl>
+</dd></dl>
+
</div>
<div class="section" id="tvm-tir-analysis">
<h1>tvm.tir.analysis<a class="headerlink" href="#tvm-tir-analysis" title="Permalink to this headline">¶</a></h1>
@@ -9768,43 +9785,46 @@ Will lower to scalar loop when it is turned off.</p>
<tr class="row-even"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">apply_prim_func_arg_and_result_memory_constraints</span></code>(...)</p></td>
<td><p>Returns func written to capture the memory (aka storage) scope constraints for each of the func's parameters given by arg_and_result_memory_scopes.</p></td>
</tr>
-<tr class="row-odd"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">calculate_constant_bytes</span></code>(func, ...)</p></td>
+<tr class="row-odd"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">calculate_allocated_bytes</span></code>(func)</p></td>
+<td><p>Calculate allocated memory per memory scope required by TIR PrimFuncs.</p></td>
+</tr>
+<tr class="row-even"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">calculate_constant_bytes</span></code>(func, ...)</p></td>
<td><p>Calculate the constant size in bytes needed by the TIR allocates inside the TIR PrimFunc.</p></td>
</tr>
-<tr class="row-even"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">calculate_workspace_bytes</span></code>(func, ...)</p></td>
+<tr class="row-odd"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">calculate_workspace_bytes</span></code>(func, ...)</p></td>
<td><p>Calculate the workspace size in bytes needed by the TIR allocates inside the TIR PrimFunc.</p></td>
</tr>
-<tr class="row-odd"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">detect_buffer_access_lca</span></code>(func)</p></td>
+<tr class="row-even"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">detect_buffer_access_lca</span></code>(func)</p></td>
<td><p>Detect the lowest common ancestor(LCA) of buffer access, including both high-level access(BufferLoad, BufferStore) and low-level access(Load, Store and opaque access).</p></td>
</tr>
-<tr class="row-even"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">estimate_tir_flops</span></code>(stmt_or_mod)</p></td>
+<tr class="row-odd"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">estimate_tir_flops</span></code>(stmt_or_mod)</p></td>
<td><p>Estimate the FLOPs of a TIR fragment.</p></td>
</tr>
-<tr class="row-odd"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">expr_deep_equal</span></code>(lhs, rhs)</p></td>
+<tr class="row-even"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">expr_deep_equal</span></code>(lhs, rhs)</p></td>
<td><p>Deeply compare two nested expressions.</p></td>
</tr>
-<tr class="row-even"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">find_anchor_block</span></code>(mod)</p></td>
+<tr class="row-odd"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">find_anchor_block</span></code>(mod)</p></td>
<td><p>Find the "anchor block" of the given module.</p></td>
</tr>
-<tr class="row-odd"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">get_block_access_region</span></code>(block, buffer_var_map)</p></td>
+<tr class="row-even"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">get_block_access_region</span></code>(block, buffer_var_map)</p></td>
<td><p>Detect which regions of tensors in this block are read or written to.</p></td>
</tr>
-<tr class="row-even"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">get_block_read_write_region</span></code>(block, ...)</p></td>
+<tr class="row-odd"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">get_block_read_write_region</span></code>(block, ...)</p></td>
<td><p>Auto detect the block read/write region according to its body stmt.</p></td>
</tr>
-<tr class="row-odd"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">get_prim_func_arg_and_result_memory_constraints</span></code>(...)</p></td>
+<tr class="row-even"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">get_prim_func_arg_and_result_memory_constraints</span></code>(...)</p></td>
<td><p>Returns the memory (aka storage) scope constraints for all the arguments and result of func.</p></td>
</tr>
-<tr class="row-even"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">verify_gpu_code</span></code>(func, constraints)</p></td>
+<tr class="row-odd"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">verify_gpu_code</span></code>(func, constraints)</p></td>
<td><p>Verify if module contains illegal host side direct memory access.</p></td>
</tr>
-<tr class="row-odd"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">verify_memory</span></code>(func)</p></td>
+<tr class="row-even"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">verify_memory</span></code>(func)</p></td>
<td><p>Verify if func contains illegal host side direct memory access.</p></td>
</tr>
-<tr class="row-even"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">verify_ssa</span></code>(func)</p></td>
+<tr class="row-odd"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">verify_ssa</span></code>(func)</p></td>
<td><p>Verify if the func is in SSA form.</p></td>
</tr>
-<tr class="row-odd"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">verify_well_formed</span></code>(func[, assert_mode])</p></td>
+<tr class="row-even"><td><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">verify_well_formed</span></code>(func[, assert_mode])</p></td>
<td><p>Verify if the given TIR is well-formed. The verification includes:</p></td>
</tr>
</tbody>
@@ -10493,6 +10513,23 @@ for all new memory scopes.</p>
</dl>
</dd></dl>
+<dl class="py function">
+<dt class="sig sig-object py">
+<span class="sig-prename descclassname"><span class="pre">tvm.tir.analysis.</span></span><span class="sig-name descname"><span class="pre">calculate_allocated_bytes</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#tvm.tir.PrimFunc" title="tvm.tir.function.PrimFunc"><span class="pre">tvm.tir.function.PrimFunc</span></a [...]
+<dd><p>Calculate allocated memory per memory scope required by TIR PrimFuncs.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters</dt>
+<dd class="field-odd"><p><strong>func</strong> (<a class="reference internal" href="#tvm.tir.PrimFunc" title="tvm.tir.PrimFunc"><em>tvm.tir.PrimFunc</em></a>) – The function to be detected.</p>
+</dd>
+<dt class="field-even">Returns</dt>
+<dd class="field-even"><p><strong>result</strong> – Allocated memory size per scope in bytes.</p>
+</dd>
+<dt class="field-odd">Return type</dt>
+<dd class="field-odd"><p>Dict[<a class="reference internal" href="runtime.html#tvm.runtime.String" title="tvm.runtime.String">String</a>, <a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)">int</a>]</p>
+</dd>
+</dl>
+</dd></dl>
+
<dl class="py function">
<dt class="sig sig-object py">
<span class="sig-prename descclassname"><span class="pre">tvm.tir.analysis.</span></span><span class="sig-name descname"><span class="pre">calculate_constant_bytes</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#tvm.tir.PrimFunc" title="tvm.tir.function.PrimFunc"><span class="pre">tvm.tir.function.PrimFunc</span></a> [...]
diff --git a/docs/reference/api/typedoc/classes/bytestreamreader.html b/docs/reference/api/typedoc/classes/bytestreamreader.html
index ea77f58d21..9fdbd6c2f5 100644
--- a/docs/reference/api/typedoc/classes/bytestreamreader.html
+++ b/docs/reference/api/typedoc/classes/bytestreamreader.html
@@ -119,7 +119,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/rpc_server.ts#L43">rpc_server.ts:43</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/rpc_server.ts#L43">rpc_server.ts:43</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -141,7 +141,7 @@
<div class="tsd-signature tsd-kind-icon">bytes<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">Uint8Array</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/rpc_server.ts#L43">rpc_server.ts:43</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/rpc_server.ts#L43">rpc_server.ts:43</a></li>
</ul>
</aside>
</section>
@@ -151,7 +151,7 @@
<div class="tsd-signature tsd-kind-icon">offset<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">number</span><span class="tsd-signature-symbol"> = 0</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/rpc_server.ts#L42">rpc_server.ts:42</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/rpc_server.ts#L42">rpc_server.ts:42</a></li>
</ul>
</aside>
</section>
@@ -168,7 +168,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/rpc_server.ts#L63">rpc_server.ts:63</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/rpc_server.ts#L63">rpc_server.ts:63</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">Uint8Array</span></h4>
@@ -185,7 +185,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/rpc_server.ts#L49">rpc_server.ts:49</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/rpc_server.ts#L49">rpc_server.ts:49</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">number</span></h4>
@@ -202,7 +202,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/rpc_server.ts#L57">rpc_server.ts:57</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/rpc_server.ts#L57">rpc_server.ts:57</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">number</span></h4>
diff --git a/docs/reference/api/typedoc/classes/cachedcallstack.html b/docs/reference/api/typedoc/classes/cachedcallstack.html
index f305ee22aa..cd94d79a02 100644
--- a/docs/reference/api/typedoc/classes/cachedcallstack.html
+++ b/docs/reference/api/typedoc/classes/cachedcallstack.html
@@ -144,7 +144,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L223">memory.ts:223</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L223">memory.ts:223</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -172,7 +172,7 @@
<div class="tsd-signature tsd-kind-icon">temp<wbr>Args<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">Array</span><span class="tsd-signature-symbol"><</span><a href="../interfaces/disposable.html" class="tsd-signature-type">Disposable</a><span class="tsd-signature-symbol">></span><span class="tsd-signature-symbol"> = []</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L208">memory.ts:208</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L208">memory.ts:208</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -194,7 +194,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L312">memory.ts:312</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L312">memory.ts:312</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -226,7 +226,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L284">memory.ts:284</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L284">memory.ts:284</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -262,7 +262,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L388">memory.ts:388</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L388">memory.ts:388</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -300,7 +300,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L376">memory.ts:376</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L376">memory.ts:376</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -340,7 +340,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L267">memory.ts:267</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L267">memory.ts:267</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -373,7 +373,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L243">memory.ts:243</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L243">memory.ts:243</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">void</span></h4>
@@ -390,7 +390,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L321">memory.ts:321</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L321">memory.ts:321</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -422,7 +422,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L252">memory.ts:252</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L252">memory.ts:252</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -444,7 +444,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L359">memory.ts:359</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L359">memory.ts:359</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -470,7 +470,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L342">memory.ts:342</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L342">memory.ts:342</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -496,7 +496,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L350">memory.ts:350</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L350">memory.ts:350</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -522,7 +522,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L326">memory.ts:326</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L326">memory.ts:326</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -548,7 +548,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L363">memory.ts:363</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L363">memory.ts:363</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -574,7 +574,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L346">memory.ts:346</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L346">memory.ts:346</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -600,7 +600,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L334">memory.ts:334</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L334">memory.ts:334</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
diff --git a/docs/reference/api/typedoc/classes/dldatatype.html b/docs/reference/api/typedoc/classes/dldatatype.html
index f7dd3f7bf9..74f870e5bd 100644
--- a/docs/reference/api/typedoc/classes/dldatatype.html
+++ b/docs/reference/api/typedoc/classes/dldatatype.html
@@ -119,7 +119,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L262">runtime.ts:262</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L262">runtime.ts:262</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -147,7 +147,7 @@
<div class="tsd-signature tsd-kind-icon">bits<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">number</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L260">runtime.ts:260</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L260">runtime.ts:260</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -162,7 +162,7 @@
<div class="tsd-signature tsd-kind-icon">code<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">number</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L258">runtime.ts:258</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L258">runtime.ts:258</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -177,7 +177,7 @@
<div class="tsd-signature tsd-kind-icon">lanes<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">number</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L262">runtime.ts:262</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L262">runtime.ts:262</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -199,7 +199,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L279">runtime.ts:279</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L279">runtime.ts:279</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">number</span></h4>
@@ -216,7 +216,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L270">runtime.ts:270</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L270">runtime.ts:270</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">string</span></h4>
diff --git a/docs/reference/api/typedoc/classes/dldevice.html b/docs/reference/api/typedoc/classes/dldevice.html
index aee372cc7a..3059dc8c31 100644
--- a/docs/reference/api/typedoc/classes/dldevice.html
+++ b/docs/reference/api/typedoc/classes/dldevice.html
@@ -118,7 +118,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L202">runtime.ts:202</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L202">runtime.ts:202</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -146,7 +146,7 @@
<div class="tsd-signature tsd-kind-icon">device<wbr>Id<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">number</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L200">runtime.ts:200</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L200">runtime.ts:200</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -161,7 +161,7 @@
<div class="tsd-signature tsd-kind-icon">device<wbr>Type<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">number</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L198">runtime.ts:198</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L198">runtime.ts:198</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -183,7 +183,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L223">runtime.ts:223</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L223">runtime.ts:223</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -205,7 +205,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L230">runtime.ts:230</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L230">runtime.ts:230</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">string</span></h4>
diff --git a/docs/reference/api/typedoc/classes/environment.html b/docs/reference/api/typedoc/classes/environment.html
index d308227232..7fcd3b5565 100644
--- a/docs/reference/api/typedoc/classes/environment.html
+++ b/docs/reference/api/typedoc/classes/environment.html
@@ -125,7 +125,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/environment.ts#L86">environment.ts:86</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/environment.ts#L86">environment.ts:86</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -169,7 +169,7 @@
<aside class="tsd-sources">
<p>Implementation of <a href="../interfaces/libraryprovider.html">LibraryProvider</a>.<a href="../interfaces/libraryprovider.html#imports">imports</a></p>
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/environment.ts#L70">environment.ts:70</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/environment.ts#L70">environment.ts:70</a></li>
</ul>
</aside>
</section>
@@ -179,7 +179,7 @@
<div class="tsd-signature tsd-kind-icon">logger<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-symbol">(</span>msg<span class="tsd-signature-symbol">: </span><span class="tsd-signature-type">string</span><span class="tsd-signature-symbol">)</span><span class="tsd-signature-symbol"> => </span><span class="tsd-signature-type">void</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/environment.ts#L69">environment.ts:69</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/environment.ts#L69">environment.ts:69</a></li>
</ul>
</aside>
<div class="tsd-type-declaration">
@@ -210,7 +210,7 @@
<div class="tsd-signature tsd-kind-icon">packedCFunc<wbr>Table<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">Array</span><span class="tsd-signature-symbol"><</span><span class="tsd-signature-type">ctypes.FTVMWasmPackedCFunc</span><span class="tsd-signature-symbol"> | </span><span class="tsd-signature-type">undefined</span><span class="tsd-signature-symbol">></span><span class="tsd-signature-symbol"> = [undefined,]</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/environment.ts#L78">environment.ts:78</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/environment.ts#L78">environment.ts:78</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -228,7 +228,7 @@
<div class="tsd-signature tsd-kind-icon">packedCFunc<wbr>Table<wbr>Free<wbr>Id<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">Array</span><span class="tsd-signature-symbol"><</span><span class="tsd-signature-type">number</span><span class="tsd-signature-symbol">></span><span class="tsd-signature-symbol"> = []</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/environment.ts#L84">environment.ts:84</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/environment.ts#L84">environment.ts:84</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -250,7 +250,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/environment.ts#L105">environment.ts:105</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/environment.ts#L105">environment.ts:105</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
diff --git a/docs/reference/api/typedoc/classes/ffilibrary.html b/docs/reference/api/typedoc/classes/ffilibrary.html
index e62cfcfbad..20a5ee4667 100644
--- a/docs/reference/api/typedoc/classes/ffilibrary.html
+++ b/docs/reference/api/typedoc/classes/ffilibrary.html
@@ -131,7 +131,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L49">runtime.ts:49</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L49">runtime.ts:49</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -156,7 +156,7 @@
<div class="tsd-signature tsd-kind-icon">exports<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">Record</span><span class="tsd-signature-symbol"><</span><span class="tsd-signature-type">string</span><span class="tsd-signature-symbol">, </span><span class="tsd-signature-type">Function</span><span class="tsd-signature-symbol">></span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L46">runtime.ts:46</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L46">runtime.ts:46</a></li>
</ul>
</aside>
</section>
@@ -166,7 +166,7 @@
<div class="tsd-signature tsd-kind-icon">memory<span class="tsd-signature-symbol">:</span> <a href="memory.html" class="tsd-signature-type">Memory</a></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L45">runtime.ts:45</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L45">runtime.ts:45</a></li>
</ul>
</aside>
</section>
@@ -176,7 +176,7 @@
<div class="tsd-signature tsd-kind-icon">wasm32<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">boolean</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L44">runtime.ts:44</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L44">runtime.ts:44</a></li>
</ul>
</aside>
</section>
@@ -186,7 +186,7 @@
<div class="tsd-signature tsd-kind-icon">webGPUContext<span class="tsd-signature-symbol">:</span> <a href="webgpucontext.html" class="tsd-signature-type">WebGPUContext</a></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L47">runtime.ts:47</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L47">runtime.ts:47</a></li>
</ul>
</aside>
</section>
@@ -203,7 +203,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L76">runtime.ts:76</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L76">runtime.ts:76</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -226,7 +226,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L66">runtime.ts:66</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L66">runtime.ts:66</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">void</span></h4>
@@ -243,7 +243,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L84">runtime.ts:84</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L84">runtime.ts:84</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <a href="cachedcallstack.html" class="tsd-signature-type">CachedCallStack</a></h4>
@@ -260,7 +260,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L95">runtime.ts:95</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L95">runtime.ts:95</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -283,7 +283,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L72">runtime.ts:72</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L72">runtime.ts:72</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">number</span></h4>
diff --git a/docs/reference/api/typedoc/classes/graphexecutor.html b/docs/reference/api/typedoc/classes/graphexecutor.html
index daa3bd1284..10de02ddc5 100644
--- a/docs/reference/api/typedoc/classes/graphexecutor.html
+++ b/docs/reference/api/typedoc/classes/graphexecutor.html
@@ -130,7 +130,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L583">runtime.ts:583</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L583">runtime.ts:583</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -162,7 +162,7 @@
<div class="tsd-signature tsd-kind-icon">module<span class="tsd-signature-symbol">:</span> <a href="module.html" class="tsd-signature-type">Module</a></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L579">runtime.ts:579</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L579">runtime.ts:579</a></li>
</ul>
</aside>
</section>
@@ -179,7 +179,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L654">runtime.ts:654</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L654">runtime.ts:654</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -224,7 +224,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L597">runtime.ts:597</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L597">runtime.ts:597</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">void</span></h4>
@@ -241,7 +241,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L631">runtime.ts:631</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L631">runtime.ts:631</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -279,7 +279,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L644">runtime.ts:644</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L644">runtime.ts:644</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -310,7 +310,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L621">runtime.ts:621</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L621">runtime.ts:621</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -332,7 +332,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L609">runtime.ts:609</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L609">runtime.ts:609</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
diff --git a/docs/reference/api/typedoc/classes/instance.html b/docs/reference/api/typedoc/classes/instance.html
index 11f2267436..98062e49a1 100644
--- a/docs/reference/api/typedoc/classes/instance.html
+++ b/docs/reference/api/typedoc/classes/instance.html
@@ -139,7 +139,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L692">runtime.ts:692</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L692">runtime.ts:692</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -202,7 +202,7 @@
<div class="tsd-signature tsd-kind-icon">exports<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">Record</span><span class="tsd-signature-symbol"><</span><span class="tsd-signature-type">string</span><span class="tsd-signature-symbol">, </span><span class="tsd-signature-type">Function</span><span class="tsd-signature-symbol">></span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L684">runtime.ts:684</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L684">runtime.ts:684</a></li>
</ul>
</aside>
</section>
@@ -212,7 +212,7 @@
<div class="tsd-signature tsd-kind-icon">memory<span class="tsd-signature-symbol">:</span> <a href="memory.html" class="tsd-signature-type">Memory</a></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L683">runtime.ts:683</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L683">runtime.ts:683</a></li>
</ul>
</aside>
</section>
@@ -229,7 +229,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L932">runtime.ts:932</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L932">runtime.ts:932</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -260,7 +260,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L994">runtime.ts:994</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L994">runtime.ts:994</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -303,7 +303,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L924">runtime.ts:924</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L924">runtime.ts:924</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -341,7 +341,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L732">runtime.ts:732</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L732">runtime.ts:732</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">void</span></h4>
@@ -358,7 +358,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L952">runtime.ts:952</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L952">runtime.ts:952</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -402,7 +402,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L816">runtime.ts:816</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L816">runtime.ts:816</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -434,7 +434,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L1033">runtime.ts:1033</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L1033">runtime.ts:1033</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -465,7 +465,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L846">runtime.ts:846</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L846">runtime.ts:846</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -497,7 +497,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L750">runtime.ts:750</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L750">runtime.ts:750</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -520,7 +520,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L1013">runtime.ts:1013</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L1013">runtime.ts:1013</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -568,7 +568,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L789">runtime.ts:789</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L789">runtime.ts:789</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -608,7 +608,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L914">runtime.ts:914</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L914">runtime.ts:914</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -646,7 +646,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L1145">runtime.ts:1145</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L1145">runtime.ts:1145</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -698,7 +698,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L740">runtime.ts:740</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L740">runtime.ts:740</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -722,7 +722,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L868">runtime.ts:868</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L868">runtime.ts:868</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -754,7 +754,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L857">runtime.ts:857</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L857">runtime.ts:857</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -786,7 +786,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/runtime.ts#L940">runtime.ts:940</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/runtime.ts#L940">runtime.ts:940</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
diff --git a/docs/reference/api/typedoc/classes/memory.html b/docs/reference/api/typedoc/classes/memory.html
index 5ae1613798..8012a4e81c 100644
--- a/docs/reference/api/typedoc/classes/memory.html
+++ b/docs/reference/api/typedoc/classes/memory.html
@@ -130,7 +130,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L40">memory.ts:40</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L40">memory.ts:40</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -152,7 +152,7 @@
<div class="tsd-signature tsd-kind-icon">memory<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">Memory</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L32">memory.ts:32</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L32">memory.ts:32</a></li>
</ul>
</aside>
</section>
@@ -162,7 +162,7 @@
<div class="tsd-signature tsd-kind-icon">wasm32<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">boolean</span><span class="tsd-signature-symbol"> = true</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L33">memory.ts:33</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L33">memory.ts:33</a></li>
</ul>
</aside>
</section>
@@ -179,7 +179,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L154">memory.ts:154</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L154">memory.ts:154</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -210,7 +210,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L90">memory.ts:90</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L90">memory.ts:90</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -233,7 +233,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L97">memory.ts:97</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L97">memory.ts:97</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -256,7 +256,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L74">memory.ts:74</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L74">memory.ts:74</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -279,7 +279,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/bf16b42ed/web/src/memory.ts#L81">memory.ts:81</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/afbfb7aa7/web/src/memory.ts#L81">memory.ts:81</a></li>
</ul>
... 2139 lines suppressed ...