You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by tq...@apache.org on 2023/01/13 06:30:48 UTC
[tvm-site] branch asf-site updated: deploying docs (apache/tvm@f9759920e0f9fc2d01b86ed540e5528f0de896e9)
This is an automated email from the ASF dual-hosted git repository.
tqchen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/tvm-site.git
The following commit(s) were added to refs/heads/asf-site by this push:
new 5b11bf79b5 deploying docs (apache/tvm@f9759920e0f9fc2d01b86ed540e5528f0de896e9)
5b11bf79b5 is described below
commit 5b11bf79b58165ce6575608b2ef4f39bc907946b
Author: tvm-bot <95...@users.noreply.github.com>
AuthorDate: Fri Jan 13 06:30:42 2023 +0000
deploying docs (apache/tvm@f9759920e0f9fc2d01b86ed540e5528f0de896e9)
---
docs/_images/sphx_glr_micro_train_001.png | Bin 332439 -> 332113 bytes
docs/_images/sphx_glr_micro_train_thumb.png | Bin 23544 -> 23380 bytes
.../how_to/compile_models/from_darknet.rst.txt | 2 +-
.../how_to/compile_models/from_keras.rst.txt | 2 +-
.../how_to/compile_models/from_mxnet.rst.txt | 2 +-
.../how_to/compile_models/from_oneflow.rst.txt | 2 +-
.../how_to/compile_models/from_pytorch.rst.txt | 2 +-
.../how_to/compile_models/from_tensorflow.rst.txt | 2 +-
.../compile_models/sg_execution_times.rst.txt | 22 +-
.../deploy_models/deploy_model_on_adreno.rst.txt | 2 +-
.../deploy_models/deploy_model_on_android.rst.txt | 2 +-
.../deploy_object_detection_pytorch.rst.txt | 4 +-
.../deploy_models/deploy_prequantized.rst.txt | 6 +-
.../deploy_prequantized_tflite.rst.txt | 4 +-
.../how_to/deploy_models/deploy_quantized.rst.txt | 2 +-
.../deploy_models/deploy_ssd_gluoncv.rst.txt | 4 +-
.../deploy_models/sg_execution_times.rst.txt | 20 +-
.../extend_tvm/bring_your_own_datatypes.rst.txt | 2 +-
.../how_to/extend_tvm/sg_execution_times.rst.txt | 8 +-
.../how_to/extend_tvm/use_pass_instrument.rst.txt | 16 +-
.../optimize_operators/opt_conv_cuda.rst.txt | 2 +-
.../optimize_operators/opt_conv_tensorcore.rst.txt | 2 +-
.../how_to/optimize_operators/opt_gemm.rst.txt | 16 +-
.../optimize_operators/sg_execution_times.rst.txt | 8 +-
.../sg_execution_times.rst.txt | 14 +-
.../tune_conv2d_layer_cuda.rst.txt | 3055 ++++++--------------
.../tune_network_cuda.rst.txt | 4 +-
.../tune_network_x86.rst.txt | 4 +-
.../tune_sparse_x86.rst.txt | 114 +-
.../tune_with_autotvm/sg_execution_times.rst.txt | 8 +-
.../tune_with_autotvm/tune_conv2d_cuda.rst.txt | 695 ++++-
.../work_with_microtvm/micro_autotune.rst.txt | 16 +-
.../work_with_microtvm/micro_pytorch.rst.txt | 4 +-
.../how_to/work_with_microtvm/micro_train.rst.txt | 18 +-
.../work_with_microtvm/sg_execution_times.rst.txt | 16 +-
.../work_with_relay/sg_execution_times.rst.txt | 8 +-
.../how_to/work_with_schedules/intrin_math.rst.txt | 2 +-
.../work_with_schedules/sg_execution_times.rst.txt | 12 +-
.../how_to/work_with_schedules/tensorize.rst.txt | 2 +-
.../tutorials/autotvm/sg_execution_times.rst.txt | 6 +-
.../frontend/deploy_classification.rst.txt | 2 +-
.../tutorials/frontend/deploy_detection.rst.txt | 2 +-
.../tutorials/frontend/sg_execution_times.rst.txt | 6 +-
.../tutorials/optimize/sg_execution_times.rst.txt | 4 +-
.../topic/vta/tutorials/sg_execution_times.rst.txt | 6 +-
.../tutorial/auto_scheduler_matmul_x86.rst.txt | 4 +-
docs/_sources/tutorial/autotvm_matmul_x86.rst.txt | 20 +-
docs/_sources/tutorial/autotvm_relay_x86.rst.txt | 58 +-
.../tutorial/cross_compilation_and_rpc.rst.txt | 2 +-
docs/_sources/tutorial/intro_topi.rst.txt | 2 +-
docs/_sources/tutorial/sg_execution_times.rst.txt | 18 +-
.../tutorial/tensor_expr_get_started.rst.txt | 42 +-
docs/commit_hash | 2 +-
docs/how_to/compile_models/from_darknet.html | 2 +-
docs/how_to/compile_models/from_keras.html | 2 +-
docs/how_to/compile_models/from_mxnet.html | 2 +-
docs/how_to/compile_models/from_oneflow.html | 11 +-
docs/how_to/compile_models/from_pytorch.html | 9 +-
docs/how_to/compile_models/from_tensorflow.html | 2 +-
docs/how_to/compile_models/sg_execution_times.html | 22 +-
.../deploy_models/deploy_model_on_adreno.html | 2 +-
.../deploy_models/deploy_model_on_android.html | 2 +-
.../deploy_object_detection_pytorch.html | 36 +-
docs/how_to/deploy_models/deploy_prequantized.html | 7 +-
.../deploy_models/deploy_prequantized_tflite.html | 4 +-
docs/how_to/deploy_models/deploy_quantized.html | 2 +-
docs/how_to/deploy_models/deploy_ssd_gluoncv.html | 35 +-
docs/how_to/deploy_models/sg_execution_times.html | 20 +-
.../extend_tvm/bring_your_own_datatypes.html | 2 +-
docs/how_to/extend_tvm/sg_execution_times.html | 8 +-
docs/how_to/extend_tvm/use_pass_instrument.html | 16 +-
docs/how_to/optimize_operators/opt_conv_cuda.html | 2 +-
.../optimize_operators/opt_conv_tensorcore.html | 2 +-
docs/how_to/optimize_operators/opt_gemm.html | 16 +-
.../optimize_operators/sg_execution_times.html | 8 +-
.../sg_execution_times.html | 14 +-
.../tune_conv2d_layer_cuda.html | 3055 ++++++--------------
.../tune_with_autoscheduler/tune_network_cuda.html | 4 +-
.../tune_with_autoscheduler/tune_network_x86.html | 4 +-
.../tune_with_autoscheduler/tune_sparse_x86.html | 114 +-
.../tune_with_autotvm/sg_execution_times.html | 8 +-
.../how_to/tune_with_autotvm/tune_conv2d_cuda.html | 695 ++++-
docs/how_to/work_with_microtvm/micro_autotune.html | 16 +-
docs/how_to/work_with_microtvm/micro_pytorch.html | 4 +-
docs/how_to/work_with_microtvm/micro_train.html | 16 +-
.../work_with_microtvm/sg_execution_times.html | 16 +-
.../how_to/work_with_relay/sg_execution_times.html | 8 +-
docs/how_to/work_with_schedules/intrin_math.html | 2 +-
.../work_with_schedules/sg_execution_times.html | 12 +-
docs/how_to/work_with_schedules/tensorize.html | 2 +-
docs/install/nnpack.html | 12 +-
docs/reference/api/python/auto_scheduler.html | 4 +-
.../api/typedoc/classes/bytestreamreader.html | 12 +-
.../api/typedoc/classes/cachedcallstack.html | 34 +-
docs/reference/api/typedoc/classes/dldatatype.html | 12 +-
docs/reference/api/typedoc/classes/dldevice.html | 10 +-
.../reference/api/typedoc/classes/environment.html | 12 +-
docs/reference/api/typedoc/classes/ffilibrary.html | 20 +-
.../api/typedoc/classes/graphexecutor.html | 16 +-
docs/reference/api/typedoc/classes/instance.html | 40 +-
docs/reference/api/typedoc/classes/memory.html | 34 +-
docs/reference/api/typedoc/classes/module.html | 10 +-
docs/reference/api/typedoc/classes/ndarray.html | 22 +-
.../api/typedoc/classes/packedfunccell.html | 6 +-
docs/reference/api/typedoc/classes/rpcserver.html | 14 +-
docs/reference/api/typedoc/classes/scalar.html | 6 +-
.../api/typedoc/classes/webgpucontext.html | 12 +-
docs/reference/api/typedoc/enums/argtypecode.html | 30 +-
.../api/typedoc/enums/aynccallbackcode.html | 4 +-
.../api/typedoc/enums/dldatatypecode.html | 8 +-
.../api/typedoc/enums/rpcserverstate.html | 12 +-
docs/reference/api/typedoc/enums/sizeof.html | 18 +-
docs/reference/api/typedoc/index.html | 112 +-
.../api/typedoc/interfaces/disposable.html | 2 +-
.../api/typedoc/interfaces/functioninfo.html | 6 +-
.../api/typedoc/interfaces/libraryprovider.html | 4 +-
docs/searchindex.js | 2 +-
.../vta/tutorials/autotvm/sg_execution_times.html | 6 +-
.../tutorials/frontend/deploy_classification.html | 2 +-
.../vta/tutorials/frontend/deploy_detection.html | 2 +-
.../vta/tutorials/frontend/sg_execution_times.html | 6 +-
.../vta/tutorials/optimize/sg_execution_times.html | 4 +-
docs/topic/vta/tutorials/sg_execution_times.html | 6 +-
docs/tutorial/auto_scheduler_matmul_x86.html | 4 +-
docs/tutorial/autotvm_matmul_x86.html | 20 +-
docs/tutorial/autotvm_relay_x86.html | 272 +-
docs/tutorial/cross_compilation_and_rpc.html | 2 +-
docs/tutorial/intro_topi.html | 2 +-
docs/tutorial/sg_execution_times.html | 18 +-
docs/tutorial/tensor_expr_get_started.html | 42 +-
130 files changed, 3920 insertions(+), 5400 deletions(-)
diff --git a/docs/_images/sphx_glr_micro_train_001.png b/docs/_images/sphx_glr_micro_train_001.png
index 86b04a461c..0fd9c0ef04 100644
Binary files a/docs/_images/sphx_glr_micro_train_001.png and b/docs/_images/sphx_glr_micro_train_001.png differ
diff --git a/docs/_images/sphx_glr_micro_train_thumb.png b/docs/_images/sphx_glr_micro_train_thumb.png
index 76771d8f4a..2b4fe4d842 100644
Binary files a/docs/_images/sphx_glr_micro_train_thumb.png and b/docs/_images/sphx_glr_micro_train_thumb.png differ
diff --git a/docs/_sources/how_to/compile_models/from_darknet.rst.txt b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
index f6438e8f6e..2d5ab893a9 100644
--- a/docs/_sources/how_to/compile_models/from_darknet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
@@ -318,7 +318,7 @@ The process is no different from other examples.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 8.780 seconds)
+ **Total running time of the script:** ( 1 minutes 9.985 seconds)
.. _sphx_glr_download_how_to_compile_models_from_darknet.py:
diff --git a/docs/_sources/how_to/compile_models/from_keras.rst.txt b/docs/_sources/how_to/compile_models/from_keras.rst.txt
index 486604602a..8c83ccad4f 100644
--- a/docs/_sources/how_to/compile_models/from_keras.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_keras.rst.txt
@@ -232,7 +232,7 @@ Look up prediction top 1 index in 1000 class synset.
.. code-block:: none
Relay top-1 id: 285, class name: Egyptian cat
-
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 1s 911ms/step
+
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 1s 969ms/step
Keras top-1 id: 285, class name: Egyptian cat
diff --git a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
index a600b943c1..9722aa105e 100644
--- a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
@@ -116,7 +116,7 @@ In this section, we download a pretrained imagenet model and classify an image.
.. code-block:: none
- Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zipd6e61b3c-d97a-43be-82a8-fe50d3492aa1 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+ Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip87b1ba24-3b71-44a9-87e6-cb35d462db82 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
x (1, 3, 224, 224)
diff --git a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
index 07afed94aa..a84839e99a 100644
--- a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
@@ -121,7 +121,7 @@ Load a pretrained OneFlow model and save model
.. code-block:: none
Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
-
0%| | 0.00/41.5M [00:00<?, ?B/s]
19%|#9 | 7.99M/41.5M [00:00<00:00, 66.0MB/s]
39%|###8 | 16.0M/41.5M [00:00<00:00, 71.7MB/s]
55%|#####5 | 22.9M/41.5M [00:00<00:00, 67.3MB/s]
71%|####### | 29.3M/41.5M [00:00<00:00, 62.1MB/s]
92%|#########2| 38.3M/41.5M [00:00<00:00, 67.9MB/s]
100%|##########| 41.5M/41.5M [00:00<00:00, 67.6MB/s]
+
0%| | 0.00/41.5M [00:00<?, ?B/s]
19%|#9 | 7.99M/41.5M [00:00<00:00, 56.1MB/s]
39%|###8 | 16.0M/41.5M [00:00<00:00, 61.1MB/s]
58%|#####7 | 24.0M/41.5M [00:00<00:00, 52.6MB/s]
77%|#######7 | 32.0M/41.5M [00:00<00:00, 57.4MB/s]
100%|##########| 41.5M/41.5M [00:00<00:00, 64.2MB/s]
diff --git a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
index 1e06be448b..5771030ba6 100644
--- a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
@@ -101,7 +101,7 @@ Load a pretrained PyTorch model
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=ResNet18_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet18_Weights.DEFAULT` to get the most up-to-date weights.
warnings.warn(msg)
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
-
0%| | 0.00/44.7M [00:00<?, ?B/s]
18%|#7 | 7.99M/44.7M [00:00<00:00, 65.3MB/s]
38%|###7 | 16.9M/44.7M [00:00<00:00, 80.3MB/s]
68%|######7 | 30.3M/44.7M [00:00<00:00, 105MB/s]
91%|######### | 40.5M/44.7M [00:00<00:00, 94.5MB/s]
100%|##########| 44.7M/44.7M [00:00<00:00, 97.0MB/s]
+
0%| | 0.00/44.7M [00:00<?, ?B/s]
24%|##3 | 10.6M/44.7M [00:00<00:00, 97.0MB/s]
54%|#####3 | 24.0M/44.7M [00:00<00:00, 114MB/s]
78%|#######8 | 34.8M/44.7M [00:00<00:00, 111MB/s]
100%|##########| 44.7M/44.7M [00:00<00:00, 107MB/s]
diff --git a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
index 12feef6d0e..6830914c4a 100644
--- a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
@@ -424,7 +424,7 @@ Run the corresponding model on tensorflow
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 9.956 seconds)
+ **Total running time of the script:** ( 1 minutes 11.470 seconds)
.. _sphx_glr_download_how_to_compile_models_from_tensorflow.py:
diff --git a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
index 1f585273aa..ca7122abbd 100644
--- a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
@@ -5,26 +5,26 @@
Computation times
=================
-**05:35.607** total execution time for **how_to_compile_models** files:
+**05:42.742** total execution time for **how_to_compile_models** files:
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:09.956 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:11.470 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``) | 01:08.780 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``) | 01:09.985 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``) | 00:45.931 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``) | 00:47.277 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``) | 00:30.721 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``) | 00:32.329 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``) | 00:27.368 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``) | 00:27.763 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``) | 00:25.862 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``) | 00:26.274 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``) | 00:25.269 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``) | 00:25.260 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``) | 00:22.186 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``) | 00:22.601 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``) | 00:17.144 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``) | 00:17.300 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``) | 00:02.390 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``) | 00:02.482 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt b/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt
index 97dcd71547..846801eae1 100644
--- a/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt
@@ -727,7 +727,7 @@ well as provides information about the model's performance
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 2755.0304 2754.1602 2761.8323 2752.2916 2.6508
+ 2752.8046 2752.1136 2757.1940 2750.7929 2.0615
diff --git a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
index 2277c84f02..023decdb36 100644
--- a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
@@ -437,7 +437,7 @@ Execute on TVM
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 15.7371 15.7188 15.8776 15.6246 0.0824
+ 16.4263 16.4800 16.9027 15.8362 0.3722
diff --git a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
index 673704e3f9..f046df4e83 100644
--- a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
@@ -130,7 +130,7 @@ Load pre-trained maskrcnn from torchvision and do tracing
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=MaskRCNN_ResNet50_FPN_Weights.COCO_V1`. You can also use `weights=MaskRCNN_ResNet50_FPN_Weights.DEFAULT` to get the most up-to-date weights.
warnings.warn(msg)
Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
-
0%| | 0.00/170M [00:00<?, ?B/s]
5%|4 | 8.17M/170M [00:00<00:01, 85.7MB/s]
10%|9 | 16.3M/170M [00:00<00:01, 84.8MB/s]
15%|#5 | 26.1M/170M [00:00<00:01, 92.8MB/s]
21%|## | 35.0M/170M [00:00<00:01, 71.8MB/s]
25%|##4 | 42.3M/170M [00:00<00:02, 65.6MB/s]
30%|##9 | 50.6M/170M [00:00<00:01, 71.6MB/s]
34%|###4 | 58.5M/170M [00:00<00:01, 74.8MB/s]
39%|###8 | 66.0M/170M [00:00<00:01, 70.9MB/s]
44%|####4 | 75.1M/170M [00:01<00:01, 77.9MB/s]
49%|####8 | 82.8M/170M [00:01<00:01, 77.4MB/s]
53%|#####3 | 90.3M/170M [00:01<00:01, 77.8MB/s]
58%|#####7 | 98.1M/170M [00:01<00:00, 78.8MB/s]
62%|######2 | 106M/170M [00:01<00:00, 69.1MB/s]
69%|######8 | 116M/170M [00:01<00:00, 80.8MB/s]
73%|#######3 | 124M/170M [00:01<00:00, 80.8MB/s]
80%|######## | 136M/170M [00:01<00:00, 76.3MB/s]
85%|########4 | 144M/170M [00:01<00:00, 75.3MB/s]
89%|########9 | 152M/170M [00:02<00:00, 72.6MB/s]
95%|#########5| 162M/170M [00:02<00:00, 70.8MB/s]
100%|##########| 170M/170M [00:02<00:00, 76.5MB/s]
+
0%| | 0.00/170M [00:00<?, ?B/s]
9%|8 | 15.0M/170M [00:00<00:01, 156MB/s]
18%|#7 | 29.9M/170M [00:00<00:01, 98.5MB/s]
26%|##5 | 43.6M/170M [00:00<00:01, 114MB/s]
33%|###2 | 55.6M/170M [00:00<00:01, 109MB/s]
39%|###9 | 66.6M/170M [00:00<00:01, 98.7MB/s]
45%|####5 | 76.5M/170M [00:00<00:00, 98.7MB/s]
52%|#####1 | 88.0M/170M [00:00<00:00, 96.2MB/s]
57%|#####7 | 97.4M/170M [00:01<00:00, 96.6MB/s]
65%|######5 | 111M/170M [00:01<00:00, 109MB/s]
71%|#######1 | 121M/170M [00:01<00:00, 88.2MB/s]
80%|######## | 136M/170M [00:01<00:00, 94.5MB/s]
88%|########8 | 150M/170M [00:01<00:00, 107MB/s]
95%|#########4| 161M/170M [00:01<00:00, 98.5MB/s]
100%|##########| 170M/170M [00:01<00:00, 103MB/s]
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/nn/functional.py:3897: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
for i in range(dim)
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/detection/anchor_utils.py:124: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
@@ -299,7 +299,7 @@ Get boxes with score larger than 0.9
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 3 minutes 11.913 seconds)
+ **Total running time of the script:** ( 3 minutes 15.819 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_object_detection_pytorch.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
index 5a8f21258c..5d5e8e7ef8 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
@@ -227,7 +227,7 @@ training. Other models require a full post training calibration.
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=MobileNet_V2_Weights.IMAGENET1K_V1`. You can also use `weights=MobileNet_V2_Weights.DEFAULT` to get the most up-to-date weights.
warnings.warn(msg)
Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
-
0%| | 0.00/13.6M [00:00<?, ?B/s]
59%|#####8 | 7.99M/13.6M [00:00<00:00, 60.6MB/s]
100%|##########| 13.6M/13.6M [00:00<00:00, 82.4MB/s]
+
0%| | 0.00/13.6M [00:00<?, ?B/s]
100%|##########| 13.6M/13.6M [00:00<00:00, 203MB/s]
@@ -409,7 +409,7 @@ Here we give an example of how to measure performance of TVM compiled models.
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 90.2995 90.2060 92.0427 90.0098 0.2888
+ 90.3102 90.2674 92.8391 89.9773 0.2968
@@ -458,7 +458,7 @@ TODO
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 5.862 seconds)
+ **Total running time of the script:** ( 1 minutes 6.547 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_prequantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
index 79df281ca6..a2fc813c63 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
@@ -423,7 +423,7 @@ Here we give an example of how to measure performance of TVM compiled models.
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 120.1069 120.0560 124.1617 118.7299 0.5242
+ 121.1376 121.0263 129.5003 120.4150 0.9146
@@ -460,7 +460,7 @@ Here we give an example of how to measure performance of TVM compiled models.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 2 minutes 27.541 seconds)
+ **Total running time of the script:** ( 2 minutes 27.427 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_prequantized_tflite.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
index c05e7f6d21..4faabd0dfe 100644
--- a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
@@ -257,7 +257,7 @@ We create a Relay VM to build and execute the model.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 24.340 seconds)
+ **Total running time of the script:** ( 1 minutes 26.831 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_quantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
index a0224c9fab..2833bc5e10 100644
--- a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
@@ -170,7 +170,7 @@ Convert and compile model for CPU.
data: None
input_sym_arg_type = in_param.infer_type()[0]
Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
-
0%| | 0/132723 [00:00<?, ?KB/s]
3%|2 | 3629/132723 [00:00<00:03, 36285.36KB/s]
7%|7 | 9309/132723 [00:00<00:02, 48349.68KB/s]
14%|#3 | 17934/132723 [00:00<00:01, 65651.45KB/s]
20%|## | 26629/132723 [00:00<00:01, 74056.30KB/s]
27%|##6 | 35236/132723 [00:00<00:01, 78385.63KB/s]
33%|###3 | 43816/132723 [00:00<00:01, 80902.97KB/s]
40%|###9 | 52512/132723 [00:00<00:00, 82881.68KB/s]
46%|####6 | 61183/132723 [00:00<00:00, 84099.05KB/s]
53%|#####2 | 69909/132723 [00:00<00:00, 85084.77KB/s]
59%|#####9 | 78513/132723 [00:01<00:00, 85377.54KB/s]
66%|######5 | 87252/132723 [00:01<00:00, 85987.55KB/s]
72%|#######2 | 95988/132723 [00:01<00:00, 86399.54KB/s]
79%|#######8 | 104732/132723 [00:01<00:00, 86711.41KB/s]
86%|########5 | 113481/132723 [00:01<00:00, 86944.14KB/s]
92%|#########2| 122282/132723 [00:01<00:00, 87263.73KB/s]
99%|#########
8| 131009/132723 [00:01<00:00, 84488.17KB/s]
100%|##########| 132723/132723 [00:01<00:00, 81301.66KB/s]
+
0%| | 0/132723 [00:00<?, ?KB/s]
4%|4 | 5724/132723 [00:00<00:02, 57232.55KB/s]
11%|# | 14554/132723 [00:00<00:01, 75500.87KB/s]
18%|#7 | 23420/132723 [00:00<00:01, 81506.20KB/s]
24%|##4 | 32324/132723 [00:00<00:01, 84476.40KB/s]
31%|###1 | 41247/132723 [00:00<00:01, 86187.23KB/s]
38%|###7 | 50149/132723 [00:00<00:00, 87147.37KB/s]
44%|####4 | 59012/132723 [00:00<00:00, 87629.41KB/s]
51%|#####1 | 67897/132723 [00:00<00:00, 88014.44KB/s]
58%|#####7 | 76831/132723 [00:00<00:00, 88424.08KB/s]
65%|######4 | 85753/132723 [00:01<00:00, 88665.17KB/s]
71%|#######1 | 94620/132723 [00:01<00:00, 88640.48KB/s]
78%|#######7 | 103501/132723 [00:01<00:00, 88690.33KB/s]
85%|########4 | 112371/132723 [00:01<00:00, 88673.56KB/s]
91%|#########1| 121300/132723 [00:01<00:00, 88855.76KB/s]
98%|#########8| 130262/132723 [00:01<00:00, 89082.90KB/s]
100%|#######
###| 132723/132723 [00:01<00:00, 86750.30KB/s]
@@ -246,7 +246,7 @@ Display result
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 3 minutes 4.912 seconds)
+ **Total running time of the script:** ( 3 minutes 7.842 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_ssd_gluoncv.py:
diff --git a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
index 1d0c5c500a..20a3ab6c39 100644
--- a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
@@ -5,26 +5,26 @@
Computation times
=================
-**13:33.904** total execution time for **how_to_deploy_models** files:
+**13:45.448** total execution time for **how_to_deploy_models** files:
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 03:11.913 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 03:15.819 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``) | 03:04.912 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``) | 03:07.842 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``) | 02:27.541 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``) | 02:27.427 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``) | 01:24.340 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``) | 01:26.831 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``) | 01:05.862 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``) | 01:06.547 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_adreno.py` (``deploy_model_on_adreno.py``) | 00:53.764 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_adreno.py` (``deploy_model_on_adreno.py``) | 00:54.025 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``) | 00:35.025 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``) | 00:35.851 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_nano.py` (``deploy_model_on_nano.py``) | 00:25.473 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_nano.py` (``deploy_model_on_nano.py``) | 00:25.722 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``) | 00:25.068 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``) | 00:25.379 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_deploy_models_deploy_sparse.py` (``deploy_sparse.py``) | 00:00.006 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
index 4a803a9167..c1178acb81 100644
--- a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
@@ -463,7 +463,7 @@ First let us define two helper functions to get the mobilenet model and a cat im
.. code-block:: none
- Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip08804c12-a636-4348-8548-412315d09f61 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+ Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zipc704b22f-922e-4cf7-b41b-561c0f17a593 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
diff --git a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
index 3668256c14..de4f56d4ef 100644
--- a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
Computation times
=================
-**00:47.421** total execution time for **how_to_extend_tvm** files:
+**00:47.417** total execution time for **how_to_extend_tvm** files:
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:44.004 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:43.952 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``) | 00:02.383 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``) | 00:02.422 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``) | 00:01.026 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``) | 00:01.037 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``) | 00:00.007 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
index 5402311374..1307843670 100644
--- a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
@@ -220,10 +220,10 @@ profile the execution time of each passes.
.. code-block:: none
Printing results of timing profile...
- InferType: 7280us [7280us] (46.78%; 46.78%)
- FoldScaleAxis: 8283us [7us] (53.22%; 53.22%)
- FoldConstant: 8276us [1674us] (53.18%; 99.92%)
- InferType: 6602us [6602us] (42.42%; 79.77%)
+ InferType: 7179us [7179us] (45.74%; 45.74%)
+ FoldScaleAxis: 8517us [6us] (54.26%; 54.26%)
+ FoldConstant: 8511us [1685us] (54.22%; 99.93%)
+ InferType: 6826us [6826us] (43.49%; 80.21%)
@@ -262,10 +262,10 @@ Refer to following sections and :py:func:`tvm.instrument.pass_instrument` for th
.. code-block:: none
Printing results of timing profile...
- InferType: 6620us [6620us] (44.98%; 44.98%)
- FoldScaleAxis: 8098us [5us] (55.02%; 55.02%)
- FoldConstant: 8093us [1649us] (54.99%; 99.94%)
- InferType: 6444us [6444us] (43.78%; 79.62%)
+ InferType: 6620us [6620us] (45.06%; 45.06%)
+ FoldScaleAxis: 8073us [5us] (54.94%; 54.94%)
+ FoldConstant: 8069us [1675us] (54.91%; 99.94%)
+ InferType: 6393us [6393us] (43.51%; 79.24%)
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
index 1d80e42236..cdd7064371 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
@@ -331,7 +331,7 @@ latency of convolution.
.. code-block:: none
- Convolution: 54.173694 ms
+ Convolution: 54.149120 ms
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
index 55948b0bcc..6604522bec 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
@@ -660,7 +660,7 @@ be able to run on our build server
.. code-block:: none
- conv2d with tensor core: 8.290288 ms
+ conv2d with tensor core: 13.364224 ms
diff --git a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
index 23929c6dec..ad72de2c15 100644
--- a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
@@ -134,8 +134,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
.. code-block:: none
- Numpy running time: 0.018250
- Baseline: 3.234689
+ Numpy running time: 0.018687
+ Baseline: 3.208006
@@ -229,7 +229,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
.. code-block:: none
- Opt1: 0.297403
+ Opt1: 0.299340
@@ -331,7 +331,7 @@ In this tutorial, we chose to vectorize the inner loop row data since it is cach
.. code-block:: none
- Opt2: 0.329462
+ Opt2: 0.338976
@@ -426,7 +426,7 @@ the access pattern for A matrix is more cache friendly.
.. code-block:: none
- Opt3: 0.114911
+ Opt3: 0.114522
@@ -550,7 +550,7 @@ flattening.
.. code-block:: none
- Opt4: 0.109820
+ Opt4: 0.107576
@@ -671,7 +671,7 @@ write to C when all the block results are ready.
.. code-block:: none
- Opt5: 0.111050
+ Opt5: 0.110665
@@ -795,7 +795,7 @@ Furthermore, we can also utilize multi-core processors to do the thread-level pa
.. code-block:: none
- Opt6: 0.147391
+ Opt6: 0.146049
diff --git a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
index b645aab8b7..e26b1d95ca 100644
--- a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
Computation times
=================
-**00:34.364** total execution time for **how_to_optimize_operators** files:
+**00:34.447** total execution time for **how_to_optimize_operators** files:
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``) | 00:31.642 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``) | 00:31.647 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.564 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.633 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``) | 00:01.158 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``) | 00:01.167 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
index a8e289c8cd..d1dfd9d902 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
@@ -5,18 +5,18 @@
Computation times
=================
-**08:53.171** total execution time for **how_to_tune_with_autoscheduler** files:
+**09:04.969** total execution time for **how_to_tune_with_autoscheduler** files:
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 05:30.470 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 05:37.098 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``) | 01:31.128 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``) | 01:32.166 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``) | 01:01.205 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``) | 01:01.802 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``) | 00:27.534 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``) | 00:30.348 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``) | 00:11.890 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``) | 00:12.257 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``) | 00:10.945 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``) | 00:11.297 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
index 693584e18b..f3864ad821 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
@@ -242,1097 +242,484 @@ cooperative fetching, unrolling and operator fusion.
bias: Buffer(bias_2: Pointer(float32), float32, [1, 512, 1, 1], []),
compute: Buffer(compute_2: Pointer(float32), float32, [1, 512, 7, 7], [])}
buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute} {
- attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 16;
- allocate(conv2d_nchw: Pointer(local float32), float32, [7]), storage_scope = local;
- allocate(pad_temp.shared: Pointer(shared float32), float32, [1296]), storage_scope = shared;
- allocate(kernel.shared: Pointer(shared float32), float32, [4608]), storage_scope = shared;
- attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224 {
- conv2d_nchw_1: Buffer(conv2d_nchw, float32, [1], [], scope="local", align=4)[0] = 0f32
+ attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 28;
+ allocate(conv2d_nchw: Pointer(local float32), float32, [14]), storage_scope = local;
+ allocate(pad_temp.shared: Pointer(shared float32), float32, [72]), storage_scope = shared;
+ allocate(kernel.shared: Pointer(shared float32), float32, [3072]), storage_scope = shared;
+ attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
+ conv2d_nchw_1: Buffer(conv2d_nchw, float32, [14], [], scope="local", align=32)[0] = 0f32
conv2d_nchw_1[1] = 0f32
conv2d_nchw_1[2] = 0f32
conv2d_nchw_1[3] = 0f32
conv2d_nchw_1[4] = 0f32
conv2d_nchw_1[5] = 0f32
conv2d_nchw_1[6] = 0f32
- for (rc.outer.outer: int32, 0, 32) {
- let cse_var_2: int32 = (rc.outer.outer*784)
- let cse_var_1: int32 = (rc.outer.outer*144)
- {
- attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- pad_temp.shared_1: Buffer(pad_temp.shared, float32, [1296], [], scope="shared")[threadIdx.x_1] = @tir.if_then_else(((((9 <= floormod(threadIdx.x_1, 81)) && (floormod(threadIdx.x_1, 81) < 72)) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data_3: Buffer(data_2, float32, [25088], [])[((((cse_var_2 + (floordiv(threadIdx.x_1, 81)*49)) + (floordiv(floormod(threadIdx.x_1, 81), 9)*7)) + floormod(threadIdx.x_1, 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- pad_temp.shared_1[(threadIdx.x_1 + 224)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 62), 81)) && (floormod((threadIdx.x_1 + 62), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 8), 9))) && (floormod((threadIdx.x_1 + 8), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 224), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 62), 81), 9)*7)) + floormod((threadIdx.x_1 + 8), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- pad_temp.shared_1[(threadIdx.x_1 + 448)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 43), 81)) && (floormod((threadIdx.x_1 + 43), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 7), 9))) && (floormod((threadIdx.x_1 + 7), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 448), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 43), 81), 9)*7)) + floormod((threadIdx.x_1 + 7), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- pad_temp.shared_1[(threadIdx.x_1 + 672)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 24), 81)) && (floormod((threadIdx.x_1 + 24), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 6), 9))) && (floormod((threadIdx.x_1 + 6), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 672), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 24), 81), 9)*7)) + floormod((threadIdx.x_1 + 6), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- pad_temp.shared_1[(threadIdx.x_1 + 896)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 5), 81)) && (floormod((threadIdx.x_1 + 5), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 5), 9))) && (floormod((threadIdx.x_1 + 5), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 896), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 5), 81), 9)*7)) + floormod((threadIdx.x_1 + 5), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- if @tir.likely((threadIdx.x_1 < 176), dtype=bool) {
- pad_temp.shared_1[(threadIdx.x_1 + 1120)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 67), 81)) && (floormod((threadIdx.x_1 + 67), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 4), 9))) && (floormod((threadIdx.x_1 + 4), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1120), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 67), 81), 9)*7)) + floormod((threadIdx.x_1 + 4), 9)) - 8)], 0f32, dtype=float32)
+ conv2d_nchw_1[7] = 0f32
+ conv2d_nchw_1[8] = 0f32
+ conv2d_nchw_1[9] = 0f32
+ conv2d_nchw_1[10] = 0f32
+ conv2d_nchw_1[11] = 0f32
+ conv2d_nchw_1[12] = 0f32
+ conv2d_nchw_1[13] = 0f32
+ for (rc.outer.outer: int32, 0, 64) {
+ for (ry.outer.outer: int32, 0, 3) {
+ let cse_var_2: int32 = (rc.outer.outer*72)
+ let cse_var_1: int32 = (ry.outer.outer*3)
+ {
+ attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
+ if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
+ pad_temp.shared_1: Buffer(pad_temp.shared, float32, [72], [], scope="shared")[(threadIdx.x_1*4)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1*4), 9))) && (floormod((threadIdx.x_1*4), 9) < 8)), data_3: Buffer(data_2, float32, [25088], [])[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1*4), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + fl [...]
+ }
+ if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
+ pad_temp.shared_1[((threadIdx.x_1*4) + 1)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 1), 9))) && (floormod(((threadIdx.x_1*4) + 1), 9) < 8)), data_3[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 1), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 1), 9)) - 8)], 0f32, dtype=float32)
+ }
+ if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
+ pad_temp.shared_1[((threadIdx.x_1*4) + 2)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 2), 9))) && (floormod(((threadIdx.x_1*4) + 2), 9) < 8)), data_3[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 2), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 2), 9)) - 8)], 0f32, dtype=float32)
+ }
+ if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
+ pad_temp.shared_1[((threadIdx.x_1*4) + 3)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 3), 9))) && (floormod(((threadIdx.x_1*4) + 3), 9) < 8)), data_3[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 3), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 3), 9)) - 8)], 0f32, dtype=float32)
+ }
+ }
+ attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1: Buffer(kernel.shared, float32, [3072], [], scope="shared")[threadIdx.x_2] = kernel_3: Buffer(kernel_2, float32, [2359296], [])[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 64)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 64), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 128)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 128), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 192)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 36864)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 256)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 256), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 320)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 320), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 384)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 73728)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 448)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 448), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 512)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 512), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 576)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 110592)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 640)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 640), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 704)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 704), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 768)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 147456)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 832)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 832), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 896)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 896), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 960)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 184320)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1024)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1024), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1088)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1088), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1152)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 221184)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1216)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1216), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1280)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1280), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 258048)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1408)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1408), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1472)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1472), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1536)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 294912)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1600)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1600), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1664)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1664), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1728)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 331776)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1792)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1792), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1856)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1856), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1920)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 368640)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1984)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1984), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2048)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2048), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2112)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 405504)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2176)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2176), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2240)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2240), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2304)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 442368)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2368)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2368), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2432)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2432), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2496)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 479232)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2560)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2560), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2624)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2624), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2688)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 516096)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2752)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2752), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2816)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2816), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2880)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 552960)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2944)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2944), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 3008)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 3008), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[0]*kernel.shared_1[(threadIdx.x*48)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[1]*kernel.shared_1[(threadIdx.x*48)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[2]*kernel.shared_1[(threadIdx.x*48)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[3]*kernel.shared_1[(threadIdx.x*48)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[4]*kernel.shared_1[(threadIdx.x*48)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[5]*kernel.shared_1[(threadIdx.x*48)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[6]*kernel.shared_1[(threadIdx.x*48)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[0]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 47)]))
}
- attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1: Buffer(kernel.shared, float32, [4608], [], scope="shared")[threadIdx.x_2] = kernel_3: Buffer(kernel_2, float32, [2359296], [])[((((blockIdx.x*147456) + (floordiv(threadIdx.x_2, 144)*4608)) + cse_var_1) + floormod(threadIdx.x_2, 144))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 224)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 224), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 8), 9), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 448)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 448), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 7), 9), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 672)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 672), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 96), 144), 9)*9)) + (floormod((floordiv(threadIdx.x_2, 3) + 2), 3)*3)) + floormod(threadIdx.x_2, 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 896)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 896), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 5), 9), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 1120)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1120), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 112), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 4), 9), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1344), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 48), 144), 9)*9)) + (floormod((floordiv(threadIdx.x_2, 3) + 1), 3)*3)) + floormod(threadIdx.x_2, 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 1568)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1568), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 128), 144), 9)*9)) + floormod((threadIdx.x_2 + 2), 9))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 1792)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1792), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 144), 9)*9)) + floormod((threadIdx.x_2 + 1), 9))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 2016)] = kernel_3[(((((blockIdx.x*147456) + (floordiv(threadIdx.x_2, 144)*4608)) + cse_var_1) + floormod(threadIdx.x_2, 144)) + 64512)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 2240)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2240), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 8), 9), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 2464)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2464), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 7), 9), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 2688)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2688), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 96), 144), 9)*9)) + (floormod((floordiv(threadIdx.x_2, 3) + 2), 3)*3)) + floormod(threadIdx.x_2, 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 2912)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2912), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 5), 9), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 3136)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3136), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 112), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 4), 9), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 3360)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3360), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 48), 144), 9)*9)) + (floormod((floordiv(threadIdx.x_2, 3) + 1), 3)*3)) + floormod(threadIdx.x_2, 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 3584)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3584), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 128), 144), 9)*9)) + floormod((threadIdx.x_2 + 2), 9))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 3808)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3808), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 144), 9)*9)) + floormod((threadIdx.x_2 + 1), 9))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 4032)] = kernel_3[(((((blockIdx.x*147456) + (floordiv(threadIdx.x_2, 144)*4608)) + cse_var_1) + floormod(threadIdx.x_2, 144)) + 129024)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 4256)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 4256), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 8), 9), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- if @tir.likely((threadIdx.x_2 < 128), dtype=bool) {
- kernel.shared_1[(threadIdx.x_2 + 4480)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 4480), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 7), 9), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
- }
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 7)*9)]*kernel.shared_1[(floordiv(threadIdx.x, 7)*144)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1)]*kernel.shared_1[(floordiv(threadIdx.x, 7)*144)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 2)]*kernel.shared_1[(floordiv(threadIdx.x, 7)*144)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 3)]*kernel.shared_1[(floordiv(threadIdx.x, 7)*144)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 4)]*kernel.shared_1[(floordiv(threadIdx.x, 7)*144)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 5)]*kernel.shared_1[(floordiv(threadIdx.x, 7)*144)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 6)]*kernel.shared_1[(floordiv(threadIdx.x, 7)*144)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 81)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 9)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 82)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 9)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 83)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 9)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 84)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 9)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 85)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 9)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 86)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 9)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 87)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 9)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 162)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 18)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 163)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 18)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 164)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 18)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 165)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 18)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 166)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 18)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 167)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 18)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 168)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 18)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 243)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 27)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 244)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 27)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 245)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 27)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 246)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 27)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 247)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 27)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 248)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 27)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 249)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 27)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 324)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 36)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 325)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 36)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 326)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 36)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 327)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 36)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 328)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 36)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 329)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 36)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 330)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 36)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 405)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 45)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 406)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 45)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 407)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 45)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 408)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 45)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 409)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 45)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 410)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 45)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 411)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 45)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 486)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 54)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 487)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 54)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 488)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 54)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 489)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 54)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 490)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 54)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 491)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 54)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 492)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 54)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 567)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 63)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 568)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 63)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 569)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 63)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 570)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 63)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 571)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 63)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 572)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 63)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 573)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 63)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 648)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 72)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 649)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 72)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 650)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 72)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 651)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 72)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 652)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 72)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 653)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 72)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 654)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 72)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 729)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 81)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 730)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 81)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 731)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 81)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 732)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 81)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 733)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 81)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 734)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 81)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 735)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 81)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 810)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 90)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 811)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 90)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 812)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 90)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 813)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 90)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 814)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 90)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 815)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 90)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 816)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 90)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 891)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 99)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 892)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 99)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 893)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 99)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 894)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 99)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 895)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 99)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 896)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 99)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 897)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 99)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 972)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 108)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 973)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 108)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 974)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 108)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 975)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 108)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 976)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 108)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 977)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 108)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 978)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 108)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1053)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 117)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1054)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 117)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1055)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 117)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1056)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 117)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1057)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 117)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1058)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 117)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1059)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 117)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1134)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 126)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1135)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 126)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1136)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 126)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1137)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 126)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1138)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 126)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1139)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 126)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1140)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 126)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1215)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 135)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1216)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 135)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1217)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 135)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1218)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 135)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1219)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 135)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1220)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 135)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1221)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 135)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 1)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 2)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 1)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 3)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 1)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 4)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 1)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 5)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 1)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 6)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 1)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 7)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 1)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 82)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 10)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 83)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 10)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 84)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 10)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 85)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 10)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 86)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 10)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 87)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 10)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 88)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 10)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 163)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 19)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 164)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 19)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 165)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 19)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 166)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 19)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 167)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 19)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 168)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 19)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 169)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 19)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 244)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 28)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 245)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 28)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 246)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 28)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 247)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 28)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 248)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 28)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 249)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 28)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 250)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 28)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 325)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 37)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 326)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 37)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 327)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 37)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 328)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 37)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 329)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 37)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 330)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 37)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 331)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 37)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 406)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 46)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 407)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 46)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 408)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 46)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 409)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 46)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 410)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 46)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 411)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 46)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 412)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 46)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 487)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 55)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 488)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 55)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 489)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 55)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 490)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 55)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 491)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 55)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 492)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 55)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 493)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 55)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 568)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 64)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 569)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 64)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 570)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 64)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 571)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 64)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 572)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 64)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 573)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 64)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 574)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 64)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 649)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 73)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 650)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 73)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 651)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 73)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 652)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 73)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 653)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 73)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 654)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 73)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 655)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 73)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 730)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 82)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 731)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 82)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 732)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 82)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 733)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 82)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 734)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 82)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 735)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 82)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 736)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 82)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 811)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 91)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 812)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 91)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 813)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 91)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 814)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 91)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 815)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 91)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 816)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 91)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 817)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 91)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 892)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 100)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 893)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 100)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 894)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 100)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 895)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 100)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 896)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 100)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 897)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 100)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 898)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 100)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 973)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 109)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 974)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 109)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 975)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 109)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 976)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 109)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 977)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 109)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 978)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 109)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 979)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 109)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1054)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 118)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1055)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 118)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1056)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 118)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1057)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 118)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1058)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 118)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1059)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 118)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1060)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 118)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1135)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 127)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1136)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 127)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1137)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 127)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1138)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 127)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1139)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 127)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1140)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 127)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1141)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 127)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1216)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 136)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1217)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 136)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1218)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 136)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1219)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 136)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1220)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 136)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1221)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 136)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1222)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 136)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 2)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 2)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 3)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 2)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 4)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 2)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 5)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 2)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 6)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 2)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 7)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 2)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 8)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 2)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 83)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 11)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 84)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 11)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 85)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 11)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 86)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 11)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 87)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 11)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 88)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 11)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 89)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 11)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 164)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 20)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 165)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 20)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 166)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 20)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 167)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 20)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 168)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 20)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 169)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 20)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 170)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 20)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 245)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 29)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 246)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 29)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 247)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 29)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 248)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 29)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 249)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 29)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 250)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 29)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 251)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 29)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 326)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 38)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 327)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 38)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 328)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 38)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 329)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 38)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 330)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 38)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 331)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 38)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 332)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 38)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 407)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 47)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 408)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 47)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 409)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 47)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 410)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 47)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 411)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 47)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 412)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 47)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 413)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 47)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 488)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 56)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 489)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 56)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 490)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 56)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 491)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 56)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 492)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 56)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 493)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 56)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 494)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 56)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 569)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 65)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 570)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 65)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 571)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 65)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 572)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 65)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 573)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 65)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 574)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 65)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 575)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 65)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 650)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 74)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 651)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 74)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 652)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 74)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 653)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 74)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 654)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 74)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 655)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 74)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 656)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 74)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 731)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 83)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 732)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 83)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 733)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 83)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 734)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 83)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 735)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 83)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 736)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 83)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 737)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 83)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 812)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 92)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 813)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 92)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 814)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 92)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 815)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 92)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 816)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 92)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 817)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 92)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 818)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 92)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 893)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 101)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 894)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 101)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 895)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 101)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 896)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 101)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 897)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 101)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 898)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 101)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 899)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 101)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 974)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 110)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 975)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 110)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 976)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 110)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 977)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 110)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 978)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 110)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 979)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 110)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 980)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 110)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1055)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 119)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1056)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 119)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1057)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 119)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1058)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 119)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1059)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 119)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1060)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 119)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1061)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 119)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1136)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 128)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1137)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 128)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1138)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 128)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1139)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 128)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1140)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 128)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1141)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 128)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1142)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 128)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1217)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 137)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1218)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 137)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1219)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 137)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1220)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 137)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1221)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 137)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1222)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 137)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1223)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 137)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 9)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 3)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 10)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 3)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 11)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 3)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 12)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 3)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 13)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 3)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 14)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 3)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 15)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 3)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 90)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 12)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 91)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 12)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 92)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 12)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 93)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 12)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 94)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 12)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 95)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 12)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 96)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 12)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 171)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 21)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 172)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 21)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 173)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 21)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 174)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 21)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 175)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 21)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 176)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 21)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 177)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 21)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 252)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 30)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 253)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 30)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 254)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 30)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 255)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 30)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 256)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 30)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 257)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 30)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 258)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 30)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 333)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 39)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 334)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 39)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 335)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 39)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 336)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 39)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 337)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 39)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 338)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 39)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 339)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 39)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 414)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 48)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 415)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 48)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 416)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 48)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 417)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 48)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 418)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 48)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 419)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 48)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 420)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 48)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 495)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 57)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 496)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 57)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 497)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 57)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 498)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 57)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 499)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 57)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 500)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 57)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 501)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 57)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 576)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 66)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 577)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 66)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 578)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 66)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 579)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 66)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 580)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 66)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 581)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 66)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 582)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 66)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 657)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 75)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 658)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 75)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 659)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 75)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 660)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 75)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 661)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 75)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 662)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 75)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 663)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 75)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 738)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 84)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 739)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 84)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 740)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 84)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 741)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 84)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 742)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 84)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 743)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 84)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 744)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 84)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 819)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 93)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 820)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 93)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 821)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 93)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 822)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 93)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 823)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 93)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 824)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 93)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 825)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 93)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 900)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 102)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 901)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 102)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 902)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 102)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 903)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 102)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 904)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 102)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 905)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 102)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 906)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 102)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 981)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 111)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 982)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 111)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 983)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 111)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 984)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 111)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 985)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 111)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 986)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 111)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 987)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 111)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1062)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 120)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1063)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 120)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1064)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 120)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1065)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 120)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1066)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 120)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1067)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 120)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1068)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 120)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1143)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 129)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1144)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 129)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1145)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 129)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1146)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 129)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1147)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 129)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1148)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 129)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1149)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 129)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1224)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 138)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1225)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 138)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1226)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 138)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1227)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 138)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1228)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 138)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1229)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 138)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1230)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 138)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 10)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 4)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 11)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 4)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 12)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 4)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 13)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 4)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 14)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 4)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 15)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 4)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 16)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 4)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 91)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 13)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 92)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 13)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 93)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 13)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 94)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 13)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 95)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 13)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 96)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 13)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 97)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 13)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 172)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 22)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 173)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 22)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 174)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 22)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 175)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 22)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 176)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 22)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 177)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 22)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 178)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 22)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 253)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 31)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 254)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 31)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 255)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 31)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 256)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 31)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 257)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 31)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 258)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 31)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 259)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 31)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 334)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 40)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 335)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 40)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 336)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 40)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 337)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 40)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 338)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 40)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 339)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 40)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 340)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 40)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 415)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 49)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 416)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 49)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 417)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 49)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 418)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 49)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 419)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 49)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 420)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 49)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 421)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 49)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 496)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 58)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 497)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 58)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 498)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 58)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 499)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 58)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 500)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 58)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 501)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 58)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 502)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 58)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 577)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 67)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 578)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 67)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 579)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 67)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 580)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 67)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 581)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 67)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 582)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 67)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 583)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 67)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 658)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 76)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 659)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 76)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 660)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 76)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 661)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 76)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 662)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 76)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 663)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 76)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 664)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 76)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 739)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 85)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 740)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 85)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 741)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 85)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 742)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 85)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 743)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 85)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 744)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 85)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 745)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 85)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 820)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 94)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 821)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 94)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 822)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 94)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 823)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 94)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 824)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 94)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 825)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 94)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 826)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 94)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 901)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 103)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 902)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 103)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 903)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 103)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 904)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 103)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 905)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 103)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 906)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 103)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 907)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 103)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 982)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 112)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 983)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 112)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 984)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 112)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 985)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 112)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 986)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 112)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 987)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 112)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 988)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 112)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1063)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 121)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1064)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 121)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1065)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 121)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1066)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 121)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1067)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 121)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1068)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 121)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1069)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 121)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1144)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 130)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1145)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 130)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1146)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 130)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1147)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 130)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1148)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 130)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1149)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 130)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1150)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 130)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1225)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 139)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1226)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 139)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1227)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 139)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1228)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 139)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1229)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 139)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1230)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 139)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1231)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 139)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 11)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 5)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 12)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 5)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 13)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 5)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 14)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 5)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 15)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 5)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 16)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 5)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 17)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 5)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 92)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 14)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 93)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 14)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 94)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 14)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 95)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 14)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 96)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 14)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 97)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 14)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 98)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 14)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 173)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 23)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 174)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 23)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 175)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 23)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 176)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 23)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 177)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 23)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 178)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 23)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 179)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 23)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 254)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 32)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 255)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 32)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 256)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 32)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 257)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 32)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 258)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 32)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 259)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 32)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 260)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 32)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 335)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 41)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 336)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 41)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 337)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 41)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 338)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 41)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 339)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 41)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 340)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 41)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 341)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 41)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 416)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 50)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 417)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 50)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 418)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 50)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 419)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 50)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 420)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 50)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 421)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 50)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 422)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 50)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 497)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 59)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 498)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 59)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 499)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 59)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 500)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 59)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 501)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 59)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 502)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 59)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 503)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 59)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 578)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 68)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 579)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 68)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 580)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 68)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 581)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 68)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 582)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 68)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 583)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 68)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 584)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 68)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 659)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 77)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 660)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 77)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 661)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 77)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 662)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 77)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 663)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 77)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 664)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 77)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 665)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 77)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 740)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 86)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 741)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 86)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 742)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 86)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 743)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 86)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 744)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 86)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 745)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 86)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 746)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 86)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 821)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 95)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 822)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 95)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 823)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 95)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 824)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 95)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 825)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 95)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 826)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 95)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 827)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 95)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 902)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 104)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 903)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 104)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 904)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 104)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 905)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 104)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 906)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 104)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 907)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 104)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 908)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 104)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 983)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 113)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 984)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 113)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 985)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 113)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 986)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 113)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 987)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 113)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 988)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 113)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 989)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 113)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1064)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 122)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1065)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 122)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1066)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 122)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1067)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 122)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1068)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 122)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1069)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 122)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1070)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 122)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1145)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 131)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1146)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 131)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1147)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 131)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1148)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 131)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1149)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 131)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1150)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 131)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1151)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 131)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1226)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 140)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1227)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 140)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1228)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 140)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1229)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 140)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1230)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 140)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1231)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 140)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1232)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 140)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 18)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 6)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 19)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 6)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 20)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 6)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 21)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 6)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 22)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 6)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 23)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 6)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 24)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 6)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 99)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 15)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 100)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 15)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 101)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 15)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 102)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 15)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 103)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 15)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 104)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 15)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 105)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 15)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 180)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 24)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 181)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 24)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 182)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 24)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 183)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 24)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 184)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 24)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 185)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 24)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 186)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 24)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 261)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 33)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 262)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 33)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 263)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 33)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 264)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 33)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 265)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 33)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 266)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 33)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 267)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 33)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 342)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 42)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 343)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 42)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 344)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 42)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 345)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 42)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 346)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 42)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 347)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 42)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 348)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 42)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 423)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 51)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 424)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 51)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 425)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 51)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 426)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 51)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 427)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 51)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 428)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 51)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 429)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 51)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 504)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 60)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 505)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 60)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 506)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 60)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 507)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 60)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 508)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 60)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 509)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 60)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 510)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 60)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 585)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 69)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 586)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 69)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 587)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 69)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 588)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 69)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 589)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 69)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 590)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 69)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 591)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 69)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 666)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 78)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 667)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 78)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 668)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 78)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 669)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 78)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 670)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 78)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 671)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 78)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 672)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 78)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 747)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 87)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 748)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 87)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 749)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 87)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 750)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 87)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 751)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 87)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 752)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 87)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 753)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 87)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 828)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 96)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 829)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 96)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 830)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 96)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 831)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 96)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 832)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 96)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 833)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 96)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 834)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 96)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 909)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 105)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 910)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 105)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 911)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 105)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 912)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 105)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 913)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 105)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 914)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 105)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 915)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 105)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 990)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 114)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 991)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 114)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 992)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 114)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 993)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 114)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 994)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 114)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 995)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 114)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 996)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 114)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1071)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 123)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1072)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 123)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1073)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 123)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1074)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 123)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1075)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 123)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1076)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 123)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1077)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 123)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1152)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 132)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1153)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 132)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1154)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 132)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1155)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 132)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1156)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 132)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1157)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 132)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1158)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 132)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1233)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 141)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1234)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 141)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1235)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 141)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1236)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 141)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1237)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 141)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1238)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 141)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1239)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 141)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 19)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 7)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 20)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 7)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 21)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 7)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 22)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 7)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 23)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 7)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 24)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 7)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 25)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 7)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 100)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 16)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 101)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 16)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 102)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 16)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 103)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 16)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 104)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 16)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 105)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 16)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 106)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 16)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 181)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 25)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 182)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 25)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 183)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 25)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 184)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 25)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 185)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 25)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 186)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 25)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 187)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 25)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 262)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 34)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 263)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 34)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 264)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 34)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 265)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 34)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 266)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 34)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 267)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 34)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 268)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 34)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 343)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 43)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 344)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 43)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 345)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 43)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 346)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 43)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 347)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 43)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 348)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 43)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 349)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 43)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 424)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 52)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 425)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 52)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 426)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 52)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 427)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 52)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 428)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 52)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 429)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 52)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 430)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 52)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 505)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 61)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 506)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 61)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 507)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 61)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 508)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 61)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 509)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 61)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 510)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 61)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 511)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 61)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 586)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 70)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 587)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 70)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 588)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 70)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 589)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 70)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 590)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 70)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 591)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 70)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 592)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 70)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 667)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 79)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 668)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 79)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 669)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 79)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 670)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 79)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 671)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 79)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 672)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 79)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 673)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 79)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 748)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 88)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 749)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 88)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 750)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 88)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 751)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 88)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 752)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 88)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 753)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 88)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 754)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 88)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 829)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 97)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 830)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 97)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 831)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 97)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 832)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 97)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 833)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 97)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 834)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 97)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 835)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 97)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 910)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 106)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 911)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 106)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 912)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 106)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 913)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 106)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 914)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 106)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 915)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 106)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 916)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 106)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 991)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 115)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 992)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 115)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 993)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 115)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 994)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 115)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 995)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 115)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 996)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 115)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 997)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 115)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1072)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 124)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1073)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 124)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1074)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 124)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1075)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 124)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1076)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 124)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1077)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 124)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1078)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 124)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1153)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 133)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1154)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 133)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1155)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 133)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1156)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 133)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1157)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 133)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1158)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 133)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1159)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 133)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1234)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 142)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1235)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 142)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1236)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 142)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1237)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 142)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1238)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 142)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1239)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 142)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1240)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 142)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 20)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 8)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 21)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 8)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 22)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 8)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 23)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 8)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 24)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 8)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 25)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 8)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 26)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 8)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 101)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 17)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 102)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 17)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 103)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 17)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 104)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 17)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 105)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 17)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 106)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 17)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 107)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 17)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 182)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 26)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 183)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 26)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 184)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 26)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 185)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 26)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 186)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 26)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 187)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 26)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 188)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 26)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 263)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 35)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 264)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 35)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 265)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 35)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 266)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 35)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 267)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 35)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 268)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 35)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 269)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 35)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 344)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 44)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 345)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 44)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 346)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 44)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 347)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 44)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 348)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 44)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 349)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 44)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 350)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 44)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 425)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 53)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 426)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 53)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 427)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 53)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 428)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 53)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 429)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 53)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 430)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 53)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 431)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 53)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 506)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 62)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 507)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 62)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 508)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 62)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 509)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 62)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 510)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 62)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 511)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 62)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 512)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 62)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 587)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 71)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 588)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 71)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 589)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 71)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 590)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 71)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 591)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 71)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 592)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 71)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 593)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 71)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 668)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 80)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 669)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 80)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 670)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 80)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 671)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 80)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 672)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 80)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 673)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 80)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 674)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 80)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 749)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 89)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 750)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 89)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 751)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 89)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 752)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 89)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 753)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 89)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 754)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 89)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 755)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 89)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 830)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 98)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 831)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 98)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 832)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 98)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 833)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 98)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 834)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 98)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 835)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 98)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 836)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 98)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 911)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 107)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 912)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 107)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 913)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 107)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 914)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 107)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 915)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 107)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 916)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 107)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 917)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 107)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 992)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 116)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 993)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 116)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 994)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 116)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 995)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 116)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 996)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 116)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 997)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 116)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 998)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 116)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1073)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 125)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1074)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 125)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1075)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 125)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1076)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 125)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1077)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 125)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1078)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 125)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1079)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 125)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1154)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 134)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1155)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 134)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1156)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 134)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1157)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 134)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1158)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 134)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1159)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 134)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1160)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 134)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1235)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 143)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1236)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 143)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1237)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 143)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1238)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 143)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1239)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 143)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1240)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 143)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1241)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 143)]))
}
}
- compute_3: Buffer(compute_2, float32, [25088], [])[((blockIdx.x*1568) + (threadIdx.x*7))] = max((conv2d_nchw_1[0] + bias_3: Buffer(bias_2, float32, [512], [])[((blockIdx.x*32) + floordiv(threadIdx.x, 7))]), 0f32)
- compute_3[(((blockIdx.x*1568) + (threadIdx.x*7)) + 1)] = max((conv2d_nchw_1[1] + bias_3[((blockIdx.x*32) + floordiv(threadIdx.x, 7))]), 0f32)
- compute_3[(((blockIdx.x*1568) + (threadIdx.x*7)) + 2)] = max((conv2d_nchw_1[2] + bias_3[((blockIdx.x*32) + floordiv(threadIdx.x, 7))]), 0f32)
- compute_3[(((blockIdx.x*1568) + (threadIdx.x*7)) + 3)] = max((conv2d_nchw_1[3] + bias_3[((blockIdx.x*32) + floordiv(threadIdx.x, 7))]), 0f32)
- compute_3[(((blockIdx.x*1568) + (threadIdx.x*7)) + 4)] = max((conv2d_nchw_1[4] + bias_3[((blockIdx.x*32) + floordiv(threadIdx.x, 7))]), 0f32)
- compute_3[(((blockIdx.x*1568) + (threadIdx.x*7)) + 5)] = max((conv2d_nchw_1[5] + bias_3[((blockIdx.x*32) + floordiv(threadIdx.x, 7))]), 0f32)
- compute_3[(((blockIdx.x*1568) + (threadIdx.x*7)) + 6)] = max((conv2d_nchw_1[6] + bias_3[((blockIdx.x*32) + floordiv(threadIdx.x, 7))]), 0f32)
+ for (i1.inner: int32, 0, 2) {
+ for (i3.inner: int32, 0, 7) {
+ compute_3: Buffer(compute_2, float32, [25088], [])[(((((floordiv(blockIdx.x, 7)*6272) + (threadIdx.x*98)) + (i1.inner*49)) + (floormod(blockIdx.x, 7)*7)) + i3.inner)] = max((conv2d_nchw_1[((i1.inner*7) + i3.inner)] + bias_3: Buffer(bias_2, float32, [512], [])[(((floordiv(blockIdx.x, 7)*128) + (threadIdx.x*2)) + i1.inner)]), 0f32)
+ }
+ }
}
}
@@ -1386,7 +773,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 0.225 ms
+ Execution time of this operator: 0.362 ms
@@ -1435,36 +822,36 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1)
conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
- conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
- conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=32)
+ conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2)
+ conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=64)
conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
- conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7)
+ conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
- conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
+ conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
- conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=7)
- conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=16)
- conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=1)
+ conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
+ conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
+ conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=4)
conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
- conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=3)
+ conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=3)
s[conv2d_nchw].reorder(conv2d_nchw_nn_o_o_o_o, conv2d_nchw_ff_o_o_o_o, conv2d_nchw_yy_o_o_o_o, conv2d_nchw_xx_o_o_o_o, conv2d_nchw_nn_o_o_o_i, conv2d_nchw_ff_o_o_o_i, conv2d_nchw_yy_o_o_o_i, conv2d_nchw_xx_o_o_o_i, conv2d_nchw_nn_o_o_i, conv2d_nchw_ff_o_o_i, conv2d_nchw_yy_o_o_i, conv2d_nchw_xx_o_o_i, conv2d_nchw_rc_o_o, conv2d_nchw_ry_o_o, conv2d_nchw_rx_o_o, conv2d_nchw_rc_o_i, conv2d_nchw_ry_o_i, conv2d_nchw_rx_o_i, conv2d_nchw_nn_o_i, conv2d_nchw_ff_o_i, conv2d_nchw_yy_o_i, conv2 [...]
compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
- compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=1)
- compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=32)
+ compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
+ compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=64)
compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
- compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7)
+ compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
- compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
+ compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
- compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=7)
+ compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
kernel_shared = s.cache_read(kernel, "shared", [conv2d_nchw])
@@ -1483,14 +870,14 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
- kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=224)
+ kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
- pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
+ pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
- pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=224)
+ pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
- s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 1024)
+ s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 512)
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
CUDA source code:
@@ -1508,10 +895,10 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
#define int64_t long long
#define uint64_t unsigned long long
#endif
- extern "C" __global__ void __launch_bounds__(224) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
- float conv2d_nchw[7];
- __shared__ float pad_temp_shared[1296];
- __shared__ float kernel_shared[4608];
+ extern "C" __global__ void __launch_bounds__(64) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+ float conv2d_nchw[14];
+ __shared__ float pad_temp_shared[72];
+ __shared__ float kernel_shared[3072];
conv2d_nchw[0] = 0.000000e+00f;
conv2d_nchw[1] = 0.000000e+00f;
conv2d_nchw[2] = 0.000000e+00f;
@@ -1519,1056 +906,420 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
conv2d_nchw[4] = 0.000000e+00f;
conv2d_nchw[5] = 0.000000e+00f;
conv2d_nchw[6] = 0.000000e+00f;
- for (int rc_outer_outer = 0; rc_outer_outer < 32; ++rc_outer_outer) {
- __syncthreads();
- pad_temp_shared[((int)threadIdx.x)] = (((((9 <= (((int)threadIdx.x) % 81)) && ((((int)threadIdx.x) % 81) < 72)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((int)threadIdx.x) / 81) * 49)) + (((((int)threadIdx.x) % 81) / 9) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 224)] = (((((9 <= ((((int)threadIdx.x) + 62) % 81)) && (((((int)threadIdx.x) + 62) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 224) / 81) * 49)) + ((((((int)threadIdx.x) + 62) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 448)] = (((((9 <= ((((int)threadIdx.x) + 43) % 81)) && (((((int)threadIdx.x) + 43) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 448) / 81) * 49)) + ((((((int)threadIdx.x) + 43) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 672)] = (((((9 <= ((((int)threadIdx.x) + 24) % 81)) && (((((int)threadIdx.x) + 24) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 672) / 81) * 49)) + ((((((int)threadIdx.x) + 24) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 896)] = (((((9 <= ((((int)threadIdx.x) + 5) % 81)) && (((((int)threadIdx.x) + 5) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 5) % 9))) && (((((int)threadIdx.x) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 896) / 81) * 49)) + ((((((int)threadIdx.x) + 5) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
- if (((int)threadIdx.x) < 176) {
- pad_temp_shared[(((int)threadIdx.x) + 1120)] = (((((9 <= ((((int)threadIdx.x) + 67) % 81)) && (((((int)threadIdx.x) + 67) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1120) / 81) * 49)) + ((((((int)threadIdx.x) + 67) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
+ conv2d_nchw[7] = 0.000000e+00f;
+ conv2d_nchw[8] = 0.000000e+00f;
+ conv2d_nchw[9] = 0.000000e+00f;
+ conv2d_nchw[10] = 0.000000e+00f;
+ conv2d_nchw[11] = 0.000000e+00f;
+ conv2d_nchw[12] = 0.000000e+00f;
+ conv2d_nchw[13] = 0.000000e+00f;
+ for (int rc_outer_outer = 0; rc_outer_outer < 64; ++rc_outer_outer) {
+ for (int ry_outer_outer = 0; ry_outer_outer < 3; ++ry_outer_outer) {
+ __syncthreads();
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[(((int)threadIdx.x) * 4)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) * 4) % 9))) && (((((int)threadIdx.x) * 4) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) * 4) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) * 4) % 9)) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 4) + 1)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 1) % 9))) && ((((((int)threadIdx.x) * 4) + 1) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 1) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 1) % 9)) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 4) + 2)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 2) % 9))) && ((((((int)threadIdx.x) * 4) + 2) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 2) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 2) % 9)) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 4) + 3)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 3) % 9))) && ((((((int)threadIdx.x) * 4) + 3) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 3) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 3) % 9)) - 8)] : 0.000000e+00f);
+ }
+ kernel_shared[((int)threadIdx.x)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 64)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 64) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 128)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 128) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 192)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 36864)];
+ kernel_shared[(((int)threadIdx.x) + 256)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 256) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 320)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 320) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 384)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 73728)];
+ kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 448) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 512)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 512) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 576)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 110592)];
+ kernel_shared[(((int)threadIdx.x) + 640)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 640) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 704)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 704) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 768)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 147456)];
+ kernel_shared[(((int)threadIdx.x) + 832)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 832) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 896) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 960)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 184320)];
+ kernel_shared[(((int)threadIdx.x) + 1024)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1024) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1088)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1088) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1152)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 221184)];
+ kernel_shared[(((int)threadIdx.x) + 1216)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1216) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1280)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1280) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 258048)];
+ kernel_shared[(((int)threadIdx.x) + 1408)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1408) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1472)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1472) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1536)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 294912)];
+ kernel_shared[(((int)threadIdx.x) + 1600)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1600) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1664)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1664) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1728)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 331776)];
+ kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1792) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1856)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1856) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1920)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 368640)];
+ kernel_shared[(((int)threadIdx.x) + 1984)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1984) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2048)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2048) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2112)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 405504)];
+ kernel_shared[(((int)threadIdx.x) + 2176)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2176) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2240) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2304)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 442368)];
+ kernel_shared[(((int)threadIdx.x) + 2368)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2368) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2432)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2432) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2496)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 479232)];
+ kernel_shared[(((int)threadIdx.x) + 2560)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2560) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2624)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2624) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 516096)];
+ kernel_shared[(((int)threadIdx.x) + 2752)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2752) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2816)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2816) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2880)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 552960)];
+ kernel_shared[(((int)threadIdx.x) + 2944)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2944) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 3008)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3008) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ __syncthreads();
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[0] * kernel_shared[(((int)threadIdx.x) * 48)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[1] * kernel_shared[(((int)threadIdx.x) * 48)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[2] * kernel_shared[(((int)threadIdx.x) * 48)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[3] * kernel_shared[(((int)threadIdx.x) * 48)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[4] * kernel_shared[(((int)threadIdx.x) * 48)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[5] * kernel_shared[(((int)threadIdx.x) * 48)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[6] * kernel_shared[(((int)threadIdx.x) * 48)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[0] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
}
- kernel_shared[((int)threadIdx.x)] = kernel[((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 144) * 4608)) + (rc_outer_outer * 144)) + (((int)threadIdx.x) % 144))];
- kernel_shared[(((int)threadIdx.x) + 224)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 224) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 80) % 144) / 9) * 9)) + ((((((int)threadIdx.x) + 8) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 448)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 448) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 16) % 144) / 9) * 9)) + ((((((int)threadIdx.x) + 7) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 672)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 672) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 96) % 144) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 2) % 3) * 3)) + (((int)threadIdx.x) % 3))];
- kernel_shared[(((int)threadIdx.x) + 896)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 896) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 32) % 144) / 9) * 9)) + ((((((int)threadIdx.x) + 5) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1120)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1120) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 112) % 144) / 9) * 9)) + ((((((int)threadIdx.x) + 4) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1344) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 48) % 144) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 1) % 3) * 3)) + (((int)threadIdx.x) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1568)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1568) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 128) % 144) / 9) * 9)) + ((((int)threadIdx.x) + 2) % 9))];
- kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1792) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 64) % 144) / 9) * 9)) + ((((int)threadIdx.x) + 1) % 9))];
- kernel_shared[(((int)threadIdx.x) + 2016)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 144) * 4608)) + (rc_outer_outer * 144)) + (((int)threadIdx.x) % 144)) + 64512)];
- kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2240) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 80) % 144) / 9) * 9)) + ((((((int)threadIdx.x) + 8) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2464)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2464) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 16) % 144) / 9) * 9)) + ((((((int)threadIdx.x) + 7) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2688) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 96) % 144) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 2) % 3) * 3)) + (((int)threadIdx.x) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2912)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2912) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 32) % 144) / 9) * 9)) + ((((((int)threadIdx.x) + 5) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 3136)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3136) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 112) % 144) / 9) * 9)) + ((((((int)threadIdx.x) + 4) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 3360)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3360) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 48) % 144) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 1) % 3) * 3)) + (((int)threadIdx.x) % 3))];
- kernel_shared[(((int)threadIdx.x) + 3584)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3584) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 128) % 144) / 9) * 9)) + ((((int)threadIdx.x) + 2) % 9))];
- kernel_shared[(((int)threadIdx.x) + 3808)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3808) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 64) % 144) / 9) * 9)) + ((((int)threadIdx.x) + 1) % 9))];
- kernel_shared[(((int)threadIdx.x) + 4032)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 144) * 4608)) + (rc_outer_outer * 144)) + (((int)threadIdx.x) % 144)) + 129024)];
- kernel_shared[(((int)threadIdx.x) + 4256)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 4256) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 80) % 144) / 9) * 9)) + ((((((int)threadIdx.x) + 8) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- if (((int)threadIdx.x) < 128) {
- kernel_shared[(((int)threadIdx.x) + 4480)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 4480) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) / 9) * 9)) + ((((((int)threadIdx.x) + 7) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ }
+ for (int i1_inner = 0; i1_inner < 2; ++i1_inner) {
+ for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
+ compute[((((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 98)) + (i1_inner * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[((i1_inner * 7) + i3_inner)] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) * 2)) + i1_inner)]), 0.000000e+00f);
}
- __syncthreads();
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) * 9)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 2)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 3)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 4)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 5)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 6)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 81)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 82)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 83)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 84)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 85)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 86)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 87)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 162)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 163)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 164)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 165)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 166)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 167)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 168)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 243)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 244)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 245)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 246)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 247)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 248)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 249)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 324)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 325)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 326)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 327)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 328)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 329)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 330)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 405)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 406)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 407)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 408)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 409)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 410)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 411)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 486)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 487)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 488)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 489)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 490)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 491)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 492)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 567)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 568)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 569)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 570)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 571)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 572)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 573)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 648)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 649)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 650)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 651)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 652)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 653)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 654)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 729)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 730)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 731)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 732)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 733)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 734)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 735)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 810)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 811)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 812)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 813)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 814)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 815)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 816)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 891)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 892)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 893)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 894)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 895)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 896)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 897)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 972)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 973)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 974)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 975)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 976)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 977)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 978)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1053)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1054)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1055)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1056)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1057)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1058)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1059)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1134)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1135)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1138)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1139)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1140)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1215)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1216)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1217)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1218)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1219)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1220)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1221)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 4)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 5)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 6)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 7)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 82)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 83)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 84)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 85)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 86)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 87)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 88)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 163)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 164)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 165)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 166)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 167)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 168)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 169)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 244)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 245)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 246)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 247)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 248)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 249)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 250)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 325)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 326)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 327)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 328)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 329)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 330)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 331)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 406)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 407)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 408)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 409)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 410)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 411)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 412)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 487)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 488)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 489)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 490)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 491)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 492)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 493)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 568)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 569)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 570)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 571)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 572)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 573)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 574)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 649)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 650)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 651)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 652)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 653)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 654)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 655)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 730)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 731)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 732)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 733)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 734)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 735)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 736)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 811)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 812)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 813)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 814)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 815)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 816)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 817)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 892)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 893)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 894)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 895)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 896)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 897)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 898)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 973)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 974)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 975)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 976)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 977)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 978)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 979)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1054)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1055)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1056)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1057)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1058)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1059)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1060)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1135)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1138)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1139)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1140)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1141)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1216)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1217)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1218)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1219)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1220)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1221)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1222)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 4)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 5)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 6)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 7)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 8)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 83)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 84)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 85)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 86)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 87)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 88)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 89)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 164)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 165)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 166)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 167)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 168)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 169)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 170)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 245)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 246)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 247)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 248)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 249)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 250)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 251)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 326)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 327)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 328)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 329)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 330)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 331)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 332)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 407)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 408)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 409)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 410)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 411)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 412)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 413)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 488)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 489)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 490)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 491)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 492)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 493)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 494)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 569)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 570)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 571)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 572)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 573)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 574)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 575)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 650)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 651)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 652)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 653)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 654)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 655)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 656)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 731)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 732)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 733)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 734)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 735)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 736)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 737)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 812)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 813)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 814)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 815)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 816)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 817)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 818)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 893)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 894)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 895)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 896)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 897)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 898)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 899)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 974)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 975)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 976)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 977)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 978)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 979)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 980)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1055)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1056)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1057)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1058)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1059)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1060)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1061)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1138)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1139)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1140)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1141)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1142)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1217)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1218)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1219)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1220)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1221)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1222)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1223)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 9)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 10)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 11)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 12)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 13)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 14)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 15)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 90)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 91)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 92)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 93)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 94)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 95)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 96)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 171)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 172)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 173)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 174)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 175)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 176)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 177)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 252)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 253)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 254)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 255)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 256)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 257)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 258)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 333)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 334)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 335)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 336)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 337)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 338)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 339)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 414)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 415)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 416)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 417)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 418)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 419)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 420)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 495)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 496)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 497)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 498)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 499)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 500)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 501)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 576)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 577)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 578)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 579)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 580)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 581)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 582)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 657)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 658)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 659)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 660)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 661)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 662)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 663)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 738)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 739)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 740)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 741)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 742)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 743)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 744)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 819)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 820)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 821)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 822)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 823)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 824)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 825)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 900)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 901)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 902)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 903)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 904)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 905)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 906)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 981)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 982)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 983)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 984)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 985)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 986)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 987)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1062)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1063)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1064)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1065)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1066)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1067)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1068)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1143)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1144)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1145)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1146)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1147)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1148)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1149)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1224)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1225)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1226)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1227)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1228)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1229)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1230)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 10)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 11)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 12)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 13)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 14)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 15)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 16)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 91)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 92)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 93)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 94)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 95)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 96)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 97)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 172)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 173)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 174)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 175)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 176)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 177)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 178)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 253)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 254)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 255)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 256)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 257)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 258)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 259)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 334)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 335)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 336)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 337)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 338)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 339)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 340)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 415)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 416)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 417)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 418)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 419)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 420)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 421)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 496)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 497)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 498)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 499)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 500)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 501)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 502)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 577)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 578)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 579)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 580)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 581)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 582)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 583)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 658)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 659)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 660)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 661)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 662)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 663)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 664)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 739)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 740)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 741)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 742)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 743)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 744)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 745)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 820)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 821)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 822)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 823)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 824)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 825)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 826)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 901)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 902)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 903)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 904)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 905)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 906)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 907)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 982)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 983)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 984)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 985)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 986)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 987)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 988)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1063)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1064)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1065)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1066)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1067)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1068)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1069)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1144)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1145)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1146)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1147)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1148)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1149)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1150)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1225)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1226)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1227)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1228)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1229)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1230)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1231)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 11)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 12)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 13)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 14)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 15)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 16)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 17)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 92)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 93)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 94)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 95)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 96)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 97)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 98)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 173)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 174)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 175)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 176)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 177)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 178)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 179)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 254)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 255)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 256)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 257)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 258)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 259)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 260)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 335)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 336)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 337)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 338)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 339)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 340)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 341)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 416)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 417)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 418)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 419)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 420)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 421)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 422)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 497)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 498)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 499)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 500)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 501)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 502)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 503)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 578)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 579)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 580)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 581)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 582)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 583)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 584)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 659)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 660)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 661)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 662)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 663)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 664)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 665)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 740)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 741)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 742)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 743)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 744)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 745)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 746)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 821)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 822)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 823)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 824)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 825)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 826)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 827)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 902)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 903)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 904)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 905)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 906)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 907)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 908)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 983)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 984)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 985)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 986)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 987)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 988)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 989)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1064)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1065)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1066)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1067)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1068)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1069)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1070)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1145)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1146)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1147)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1148)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1149)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1150)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1151)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1226)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1227)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1228)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1229)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1230)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1231)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1232)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 18)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 19)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 20)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 21)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 22)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 23)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 24)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 99)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 100)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 101)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 102)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 103)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 104)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 105)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 180)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 181)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 182)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 183)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 184)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 185)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 186)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 261)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 262)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 263)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 264)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 265)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 266)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 267)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 342)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 343)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 344)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 345)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 346)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 347)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 348)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 423)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 424)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 425)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 426)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 427)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 428)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 429)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 504)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 505)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 506)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 507)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 508)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 509)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 510)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 585)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 586)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 587)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 588)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 589)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 590)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 591)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 666)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 667)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 668)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 669)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 670)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 671)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 672)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 747)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 748)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 749)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 750)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 751)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 752)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 753)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 828)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 829)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 830)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 831)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 832)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 833)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 834)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 909)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 910)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 911)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 912)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 913)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 914)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 915)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 990)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 991)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 992)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 993)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 994)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 995)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 996)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1071)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1072)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1073)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1074)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1075)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1076)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1077)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1152)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1153)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1154)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1155)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1156)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1157)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1158)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1233)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1234)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1235)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1236)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1237)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1238)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1239)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 19)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 20)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 21)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 22)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 23)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 24)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 25)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 100)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 101)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 102)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 103)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 104)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 105)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 106)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 181)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 182)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 183)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 184)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 185)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 186)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 187)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 262)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 263)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 264)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 265)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 266)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 267)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 268)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 343)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 344)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 345)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 346)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 347)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 348)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 349)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 424)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 425)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 426)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 427)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 428)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 429)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 430)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 505)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 506)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 507)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 508)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 509)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 510)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 511)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 586)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 587)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 588)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 589)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 590)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 591)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 592)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 667)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 668)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 669)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 670)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 671)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 672)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 673)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 748)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 749)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 750)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 751)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 752)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 753)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 754)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 829)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 830)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 831)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 832)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 833)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 834)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 835)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 910)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 911)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 912)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 913)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 914)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 915)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 916)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 991)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 992)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 993)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 994)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 995)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 996)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 997)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1072)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1073)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1074)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1075)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1076)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1077)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1078)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1153)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1154)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1155)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1156)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1157)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1158)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1159)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1234)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1235)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1236)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1237)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1238)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1239)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1240)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 20)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 21)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 22)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 23)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 24)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 25)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 26)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 101)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 102)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 103)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 104)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 105)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 106)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 107)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 182)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 183)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 184)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 185)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 186)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 187)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 188)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 263)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 264)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 265)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 266)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 267)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 268)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 269)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 344)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 345)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 346)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 347)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 348)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 349)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 350)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 425)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 426)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 427)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 428)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 429)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 430)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 431)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 506)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 507)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 508)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 509)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 510)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 511)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 512)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 587)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 588)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 589)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 590)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 591)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 592)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 593)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 668)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 669)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 670)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 671)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 672)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 673)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 674)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 749)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 750)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 751)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 752)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 753)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 754)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 755)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 830)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 831)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 832)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 833)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 834)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 835)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 836)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 911)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 912)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 913)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 914)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 915)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 916)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 917)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 992)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 993)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 994)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 995)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 996)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 997)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 998)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1073)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1074)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1075)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1076)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1077)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1078)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1079)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1154)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1155)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1156)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1157)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1158)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1159)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1160)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1235)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1236)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1237)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1238)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1239)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1240)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1241)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
}
- compute[((((int)blockIdx.x) * 1568) + (((int)threadIdx.x) * 7))] = max((conv2d_nchw[0] + bias[((((int)blockIdx.x) * 32) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
- compute[(((((int)blockIdx.x) * 1568) + (((int)threadIdx.x) * 7)) + 1)] = max((conv2d_nchw[1] + bias[((((int)blockIdx.x) * 32) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
- compute[(((((int)blockIdx.x) * 1568) + (((int)threadIdx.x) * 7)) + 2)] = max((conv2d_nchw[2] + bias[((((int)blockIdx.x) * 32) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
- compute[(((((int)blockIdx.x) * 1568) + (((int)threadIdx.x) * 7)) + 3)] = max((conv2d_nchw[3] + bias[((((int)blockIdx.x) * 32) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
- compute[(((((int)blockIdx.x) * 1568) + (((int)threadIdx.x) * 7)) + 4)] = max((conv2d_nchw[4] + bias[((((int)blockIdx.x) * 32) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
- compute[(((((int)blockIdx.x) * 1568) + (((int)threadIdx.x) * 7)) + 5)] = max((conv2d_nchw[5] + bias[((((int)blockIdx.x) * 32) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
- compute[(((((int)blockIdx.x) * 1568) + (((int)threadIdx.x) * 7)) + 6)] = max((conv2d_nchw[6] + bias[((((int)blockIdx.x) * 32) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
}
@@ -2629,7 +1380,7 @@ In the example below we resume the status and do more 5 trials.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 5 minutes 30.470 seconds)
+ **Total running time of the script:** ( 5 minutes 37.098 seconds)
.. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
index cae8a1ff14..9b535c249c 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
@@ -647,7 +647,7 @@ so we can read the log file and load the best schedules.
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 7.8515 7.8508 7.8602 7.8435 0.0068
+ 7.8957 7.8972 7.8982 7.8916 0.0029
@@ -675,7 +675,7 @@ Other Tips
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 1.205 seconds)
+ **Total running time of the script:** ( 1 minutes 1.802 seconds)
.. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_cuda.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
index 570560c5d5..f65c0d5a37 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
@@ -666,7 +666,7 @@ so we can read the log file and load the best schedules.
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 745.1719 745.3651 745.7341 744.4164 0.5550
+ 748.8855 748.8227 749.0615 748.7722 0.1262
@@ -694,7 +694,7 @@ Other Tips
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 31.128 seconds)
+ **Total running time of the script:** ( 1 minutes 32.166 seconds)
.. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_x86.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
index 6503231b2b..bdb4933a33 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
@@ -390,103 +390,31 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [128, 512], []),
compute: Buffer(compute_2: Pointer(float32), float32, [128, 512], [])}
buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute} {
- for (i0.outer.i1.outer.fused: int32, 0, 256) "parallel" {
- allocate(compute_3: Pointer(global float32), float32, [256]), storage_scope = global {
- for (i.inner.init: int32, 0, 16) {
- let cse_var_1: int32 = (i.inner.init*16)
- {
- compute_4: Buffer(compute_3, float32, [256], [])[cse_var_1] = 0f32
- compute_4[(cse_var_1 + 1)] = 0f32
- compute_4[(cse_var_1 + 2)] = 0f32
- compute_4[(cse_var_1 + 3)] = 0f32
- compute_4[(cse_var_1 + 4)] = 0f32
- compute_4[(cse_var_1 + 5)] = 0f32
- compute_4[(cse_var_1 + 6)] = 0f32
- compute_4[(cse_var_1 + 7)] = 0f32
- compute_4[(cse_var_1 + 8)] = 0f32
- compute_4[(cse_var_1 + 9)] = 0f32
- compute_4[(cse_var_1 + 10)] = 0f32
- compute_4[(cse_var_1 + 11)] = 0f32
- compute_4[(cse_var_1 + 12)] = 0f32
- compute_4[(cse_var_1 + 13)] = 0f32
- compute_4[(cse_var_1 + 14)] = 0f32
- compute_4[(cse_var_1 + 15)] = 0f32
- }
- }
- for (elem_idx: int32, 0, let cse_var_2: int32 = floormod(i0.outer.i1.outer.fused, 32) in (placeholder_15: Buffer(placeholder_13, int32, [33], [])[(cse_var_2 + 1)] - placeholder_15[cse_var_2])) {
- for (i.inner: int32, 0, 16) {
- let cse_var_3: int32 = floormod(i0.outer.i1.outer.fused, 32)
- {
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_4: int32 = (i.inner*16)
- compute_4[cse_var_4] = (compute_4[cse_var_4] + (placeholder_16: Buffer(placeholder_11, float32, [78656], [])[((placeholder_15[cse_var_3]*16) + (elem_idx*16))]*max(placeholder_17: Buffer(placeholder_10, float32, [32768], [])[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18: Buffer(placeholder_12, int32, [4916], [])[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_5: int32 = ((i.inner*16) + 1)
- compute_4[cse_var_5] = (compute_4[cse_var_5] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 1)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_6: int32 = ((i.inner*16) + 2)
- compute_4[cse_var_6] = (compute_4[cse_var_6] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 2)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_7: int32 = ((i.inner*16) + 3)
- compute_4[cse_var_7] = (compute_4[cse_var_7] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 3)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_8: int32 = ((i.inner*16) + 4)
- compute_4[cse_var_8] = (compute_4[cse_var_8] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 4)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_9: int32 = ((i.inner*16) + 5)
- compute_4[cse_var_9] = (compute_4[cse_var_9] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 5)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_10: int32 = ((i.inner*16) + 6)
- compute_4[cse_var_10] = (compute_4[cse_var_10] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 6)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_11: int32 = ((i.inner*16) + 7)
- compute_4[cse_var_11] = (compute_4[cse_var_11] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 7)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
+ for (i0.outer.i1.outer.fused: int32, 0, 64) "parallel" {
+ allocate(compute_3: Pointer(global float32), float32, [1024]), storage_scope = global {
+ for (i.outer.inner: int32, 0, 4) {
+ for (nb_j.inner: int32, 0, 2) {
+ for (i.inner.init: int32, 0, 8) {
+ for (j.init: int32, 0, 16) {
+ compute_4: Buffer(compute_3, float32, [1024], [])[((((i.outer.inner*256) + (i.inner.init*32)) + (nb_j.inner*16)) + j.init)] = 0f32
}
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_12: int32 = ((i.inner*16) + 8)
- compute_4[cse_var_12] = (compute_4[cse_var_12] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 8)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_13: int32 = ((i.inner*16) + 9)
- compute_4[cse_var_13] = (compute_4[cse_var_13] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 9)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_14: int32 = ((i.inner*16) + 10)
- compute_4[cse_var_14] = (compute_4[cse_var_14] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 10)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_15: int32 = ((i.inner*16) + 11)
- compute_4[cse_var_15] = (compute_4[cse_var_15] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 11)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_16: int32 = ((i.inner*16) + 12)
- compute_4[cse_var_16] = (compute_4[cse_var_16] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 12)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_17: int32 = ((i.inner*16) + 13)
- compute_4[cse_var_17] = (compute_4[cse_var_17] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 13)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_18: int32 = ((i.inner*16) + 14)
- compute_4[cse_var_18] = (compute_4[cse_var_18] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 14)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_19: int32 = ((i.inner*16) + 15)
- compute_4[cse_var_19] = (compute_4[cse_var_19] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 15)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
+ }
+ for (elem_idx: int32, 0, let cse_var_1: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner) in (placeholder_15: Buffer(placeholder_13, int32, [33], [])[(cse_var_1 + 1)] - placeholder_15[cse_var_1])) {
+ for (i.inner: int32, 0, 8) {
+ for (j: int32, 0, 16) {
+ let cse_var_3: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)
+ let cse_var_2: int32 = ((((i.outer.inner*256) + (i.inner*32)) + (nb_j.inner*16)) + j)
+ compute_4[cse_var_2] = (compute_4[cse_var_2] + (placeholder_16: Buffer(placeholder_11, float32, [78656], [])[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + j)]*max(placeholder_17: Buffer(placeholder_10, float32, [32768], [])[((((floordiv(i0.outer.i1.outer.fused, 16)*8192) + (i.outer.inner*2048)) + (i.inner*256)) + placeholder_18: Buffer(placeholder_12, int32, [4916], [])[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
+ }
}
}
}
}
- for (i0.inner: int32, 0, 16) {
- let cse_var_20: int32 = (((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 32)*16))
- compute_5: Buffer(compute_2, float32, [65536], [])[ramp(cse_var_20, 1, 16)] = max((compute_4[ramp((i0.inner*16), 1, 16)] + placeholder_19: Buffer(placeholder_14, float32, [65536], [])[ramp(cse_var_20, 1, 16)]), broadcast(0f32, 16))
+ for (i0.inner: int32, 0, 32) {
+ for (i1.inner: int32, 0, 32) {
+ let cse_var_4: int32 = ((((floordiv(i0.outer.i1.outer.fused, 16)*16384) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32)) + i1.inner)
+ compute_5: Buffer(compute_2, float32, [65536], [])[cse_var_4] = max((compute_4[((i0.inner*32) + i1.inner)] + placeholder_19: Buffer(placeholder_14, float32, [65536], [])[cse_var_4]), 0f32)
+ }
}
}
}
@@ -542,7 +470,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 1.939 ms
+ Execution time of this operator: 1.608 ms
diff --git a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
index 3deccd6b77..58e08f40c1 100644
--- a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
@@ -5,16 +5,16 @@
Computation times
=================
-**00:51.902** total execution time for **how_to_tune_with_autotvm** files:
+**00:42.433** total execution time for **how_to_tune_with_autotvm** files:
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``) | 00:51.868 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``) | 00:42.398 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``) | 00:00.020 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``) | 00:00.022 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``) | 00:00.005 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_arm.py` (``tune_relay_arm.py``) | 00:00.004 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_arm.py` (``tune_relay_arm.py``) | 00:00.005 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_mobile_gpu.py` (``tune_relay_mobile_gpu.py``) | 00:00.004 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
index a3c8be5b5e..2b95036d1c 100644
--- a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
@@ -390,9 +390,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 4, 16]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9369538
- No: 2 GFLOPS: 119.97/119.97 result: MeasureResult(costs=(0.0019296020655737705,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.6506681442260742, timestamp=1673586277.1947305) [('tile_f', [-1, 1, 8, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 64, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,795987
- No: 3 GFLOPS: 0.00/119.97 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 2, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 8, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9788789
+ No: 2 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -514,8 +513,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 256]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1156339
- No: 4 GFLOPS: 0.00/119.97 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 8, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 64]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7714645
+ No: 3 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -637,8 +636,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 1, 16]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 32, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,212907
- No: 5 GFLOPS: 0.00/119.97 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 2, 64]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 64]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5588645
+ No: 4 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -760,9 +759,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 64, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6059911
- No: 6 GFLOPS: 46.72/119.97 result: MeasureResult(costs=(0.004955297272727273,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.4855399131774902, timestamp=1673586282.7600145) [('tile_f', [-1, 1, 8, 16]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8392739
- No: 7 GFLOPS: 0.00/119.97 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 32, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 128, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5253730
+ No: 5 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -884,8 +882,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 1, 16]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3310726
- No: 8 GFLOPS: 0.00/119.97 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 2, 32]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 16, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3374552
+ No: 6 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1007,8 +1005,131 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 32, 16]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7024784
- No: 9 GFLOPS: 0.00/119.97 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 16, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 256, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,996417
+ No: 7 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+ func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+ func = build(s, args, target_host=task.target_host, runtime=runtime)
+ File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+ input_mod = lower(inputs, args, name=name, binds=binds)
+ File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+ return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+ File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+ tvm._ffi.base.TVMError: Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1730
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1670
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1645
+ 13: operator()
+ at ../src/driver/driver_api.cc:395
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:381
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:276
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:454
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1749
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1693
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1617
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+ Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1730
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1670
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1645
+ 13: operator()
+ at ../src/driver/driver_api.cc:395
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:381
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:276
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:454
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1749
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1693
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1617
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 16, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 256]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2509406
+ No: 8 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 742, in __call__
yield remote, remote.load_module(os.path.split(build_result.filename)[1])
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 706, in run_through_rpc
@@ -1096,7 +1217,7 @@ for this template
15: _PyEval_EvalFrameDefault
14: 0x0000000000537c30
13: _PyObject_FastCallKeywords
- 12: 0x00007f28e7ebcfa2
+ 12: 0x00007f25ca6b6fa2
11: _ctypes_callproc
10: ffi_call
9: ffi_call_unix64
@@ -1160,8 +1281,501 @@ for this template
22: _PyEval_EvalFrameDefault
21: _PyFunction_FastCallKeywords
20: _PyEval_EvalFrameDefault
- 19: _PyFunction_FastCall [('tile_f', [-1, 8, 4, 16]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6619318
- No: 10 GFLOPS: 0.00/119.97 result: Traceback (most recent call last):
+ 19: _PyFunction_FastCall [('tile_f', [-1, 16, 1, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3133869
+ No: 9 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+ func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+ func = build(s, args, target_host=task.target_host, runtime=runtime)
+ File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+ input_mod = lower(inputs, args, name=name, binds=binds)
+ File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+ return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+ File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+ tvm._ffi.base.TVMError: Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1730
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1670
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1645
+ 13: operator()
+ at ../src/driver/driver_api.cc:395
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:381
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:276
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:454
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1749
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1693
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1617
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+ Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1730
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1670
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1645
+ 13: operator()
+ at ../src/driver/driver_api.cc:395
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:381
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:276
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:454
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1749
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1693
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1617
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 1, 32]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 16, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9729688
+ No: 10 GFLOPS: 48.18/48.18 result: MeasureResult(costs=(0.004804756133333333,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.991562604904175, timestamp=1673589251.1908367) [('tile_f', [-1, 2, 2, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,5158625
+ No: 11 GFLOPS: 0.00/48.18 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+ func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+ func = build(s, args, target_host=task.target_host, runtime=runtime)
+ File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+ input_mod = lower(inputs, args, name=name, binds=binds)
+ File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+ return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+ File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+ tvm._ffi.base.TVMError: Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1730
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1670
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1645
+ 13: operator()
+ at ../src/driver/driver_api.cc:395
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:381
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:276
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:454
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1749
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1693
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1617
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+ Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1730
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1670
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1645
+ 13: operator()
+ at ../src/driver/driver_api.cc:395
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:381
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:276
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:454
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1749
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1693
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1617
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 4, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 16]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6721776
+ No: 12 GFLOPS: 0.00/48.18 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+ func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+ func = build(s, args, target_host=task.target_host, runtime=runtime)
+ File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+ input_mod = lower(inputs, args, name=name, binds=binds)
+ File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+ return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+ File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+ tvm._ffi.base.TVMError: Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1730
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1670
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1645
+ 13: operator()
+ at ../src/driver/driver_api.cc:395
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:381
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:276
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:454
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1749
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1693
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1617
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+ Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1730
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1670
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1645
+ 13: operator()
+ at ../src/driver/driver_api.cc:395
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:381
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:276
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:454
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1749
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1693
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1617
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 4, 32]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2837756
+ No: 13 GFLOPS: 0.00/48.18 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+ func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+ func = build(s, args, target_host=task.target_host, runtime=runtime)
+ File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+ input_mod = lower(inputs, args, name=name, binds=binds)
+ File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+ return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+ File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+ tvm._ffi.base.TVMError: Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1730
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1670
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1645
+ 13: operator()
+ at ../src/driver/driver_api.cc:395
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:381
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:276
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:454
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1749
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1693
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1617
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+ Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1730
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1670
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1630
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1645
+ 13: operator()
+ at ../src/driver/driver_api.cc:395
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:381
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:276
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:454
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1749
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1693
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1617
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 8, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 128, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7225763
+ No: 14 GFLOPS: 0.00/48.18 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1283,9 +1897,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 4, 2]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 32]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2094252
- No: 11 GFLOPS: 100.15/119.97 result: MeasureResult(costs=(0.0023116415,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.5159871578216553, timestamp=1673586289.386429) [('tile_f', [-1, 1, 8, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3327967
- No: 12 GFLOPS: 0.00/119.97 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 8, 2]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10164303
+ No: 15 GFLOPS: 0.00/48.18 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1407,28 +2020,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 4, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10409449
- No: 13 GFLOPS: 0.00/119.97 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 142, in build
- res = future.result()
- File "/usr/lib/python3.7/concurrent/futures/_base.py", line 435, in result
- return self.__get_result()
- File "/usr/lib/python3.7/concurrent/futures/_base.py", line 384, in __get_result
- raise self._exception
- File "/usr/lib/python3.7/concurrent/futures/thread.py", line 57, in run
- result = self.fn(*self.args, **self.kwargs)
- File "/workspace/python/tvm/contrib/popen_pool.py", line 432, in <lambda>
- worker = lambda *args: self._worker_run(*args)
- File "/workspace/python/tvm/contrib/popen_pool.py", line 401, in _worker_run
- return proc.recv()
- File "/workspace/python/tvm/contrib/popen_pool.py", line 309, in recv
- raise TimeoutError()
- TimeoutError
-
- [('tile_f', [-1, 32, 2, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,8942013
- No: 14 GFLOPS: 87.14/119.97 result: MeasureResult(costs=(0.002656774325,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.108745574951172, timestamp=1673586300.4445662) [('tile_f', [-1, 4, 16, 2]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6509227
- No: 15 GFLOPS: 67.33/119.97 result: MeasureResult(costs=(0.003438151804347826,), error_no=MeasureErrorNo.NO_ERROR, all_cost=6.566650152206421, timestamp=1673586301.4699934) [('tile_f', [-1, 4, 8, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 8, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9535489
- No: 16 GFLOPS: 0.00/119.97 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 64, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 128, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7223306
+ No: 16 GFLOPS: 0.00/48.18 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1550,8 +2143,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 4, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 32, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5473675
- No: 17 GFLOPS: 0.00/119.97 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 2, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 32, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10399291
+ No: 17 GFLOPS: 0.00/48.18 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1673,9 +2266,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 16, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 128, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9707097
- No: 18 GFLOPS: 3.91/119.97 result: MeasureResult(costs=(0.05920431125,), error_no=MeasureErrorNo.NO_ERROR, all_cost=9.836127281188965, timestamp=1673586311.4931674) [('tile_f', [-1, 1, 8, 16]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 16]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3801559
- No: 19 GFLOPS: 0.00/119.97 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 64, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 64, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5673187
+ No: 18 GFLOPS: 0.00/48.18 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1797,8 +2389,9 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 16, 16]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 2, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4597523
- No: 20 GFLOPS: 33.33/119.97 result: MeasureResult(costs=(0.006945141066666666,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.379587173461914, timestamp=1673586312.2758782) [('tile_f', [-1, 1, 16, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 64]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7903094
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 8, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6107449
+ No: 19 GFLOPS: 48.71/48.71 result: MeasureResult(costs=(0.004753035772727273,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.9338581562042236, timestamp=1673589262.4795296) [('tile_f', [-1, 8, 4, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7012178
+ No: 20 GFLOPS: 48.06/48.71 result: MeasureResult(costs=(0.0048172828571428575,), error_no=MeasureErrorNo.NO_ERROR, all_cost=10.59103798866272, timestamp=1673589263.2117553) [('tile_f', [-1, 8, 1, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10164323
@@ -1853,9 +2446,9 @@ and measure running time.
Finish loading 20 records
Best config:
- [('tile_f', [-1, 1, 8, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 64, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,795987
+ [('tile_f', [-1, 8, 4, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7012178
Finish loading 20 records
- Time cost of this operator: 0.002147
+ Time cost of this operator: 0.005091
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
index 6df7f951a0..f3585dbe1d 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
@@ -363,10 +363,10 @@ Timing the untuned program
########## Build without Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
- tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 311.1 98.71 (1, 2, 10, 10, 3) 2 1 [311.1]
- tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.088 0.98 (1, 6, 10, 10) 1 1 [3.088]
- tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.979 0.311 (1, 1, 10, 10, 3) 1 1 [0.979]
- Total_time - 315.167 - - - - -
+ tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 311.6 98.728 (1, 2, 10, 10, 3) 2 1 [311.6]
+ tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.044 0.964 (1, 6, 10, 10) 1 1 [3.044]
+ tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.971 0.308 (1, 1, 10, 10, 3) 1 1 [0.971]
+ Total_time - 315.615 - - - - -
@@ -431,10 +431,10 @@ Timing the tuned program
########## Build with Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
- tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 138.1 98.093 (1, 6, 10, 10, 1) 2 1 [138.1]
- tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.826 1.297 (1, 6, 10, 10) 1 1 [1.826]
- tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.858 0.61 (1, 3, 10, 10, 1) 1 1 [0.858]
- Total_time - 140.785 - - - - -
+ tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 102.3 97.398 (1, 6, 10, 10, 1) 2 1 [102.3]
+ tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.767 1.682 (1, 6, 10, 10) 1 1 [1.767]
+ tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.966 0.92 (1, 1, 10, 10, 3) 1 1 [0.966]
+ Total_time - 105.033 - - - - -
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt
index 4f97986787..c3c4c85596 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt
@@ -117,7 +117,7 @@ download a cat image and preprocess it to use as the model input.
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/ao/quantization/utils.py:281: UserWarning: must run observer before calling calculate_qparams. Returning default values.
"must run observer before calling calculate_qparams. " +
Downloading: "https://download.pytorch.org/models/quantized/mobilenet_v2_qnnpack_37f702c5.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2_qnnpack_37f702c5.pth
-
0%| | 0.00/3.42M [00:00<?, ?B/s]
100%|##########| 3.42M/3.42M [00:00<00:00, 45.4MB/s]
+
0%| | 0.00/3.42M [00:00<?, ?B/s]
100%|##########| 3.42M/3.42M [00:00<00:00, 39.5MB/s]
/workspace/python/tvm/relay/frontend/pytorch_utils.py:47: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
return LooseVersion(torch_ver) > ver
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/setuptools/_distutils/version.py:346: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
@@ -322,7 +322,7 @@ Look up prediction top 1 index in 1000 class synset.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 2.402 seconds)
+ **Total running time of the script:** ( 1 minutes 4.216 seconds)
.. _sphx_glr_download_how_to_work_with_microtvm_micro_pytorch.py:
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
index e55dd30bb7..c5e6f488fc 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
@@ -218,7 +218,7 @@ take about **2 minutes** to download the Stanford Cars, while COCO 2017 validati
.. code-block:: none
- '/tmp/tmpydjojyxm/images/random'
+ '/tmp/tmpznybj2eg/images/random'
@@ -309,7 +309,7 @@ objects to other stuff? We can display some examples from our datasets using ``m
.. image-sg:: /how_to/work_with_microtvm/images/sphx_glr_micro_train_001.png
- :alt: [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]
+ :alt: [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0]
:srcset: /how_to/work_with_microtvm/images/sphx_glr_micro_train_001.png
:class: sphx-glr-single-img
@@ -318,8 +318,8 @@ objects to other stuff? We can display some examples from our datasets using ``m
.. code-block:: none
- /tmp/tmpydjojyxm/images/target contains 8144 images
- /tmp/tmpydjojyxm/images/random contains 5000 images
+ /tmp/tmpznybj2eg/images/target contains 8144 images
+ /tmp/tmpznybj2eg/images/random contains 5000 images
@@ -494,13 +494,13 @@ the time on our validation set).
.. code-block:: none
Epoch 1/3
- 328/328 - 47s - loss: 0.2238 - accuracy: 0.9261 - val_loss: 0.1346 - val_accuracy: 0.9588 - 47s/epoch - 142ms/step
+ 328/328 - 47s - loss: 0.2242 - accuracy: 0.9216 - val_loss: 0.1633 - val_accuracy: 0.9520 - 47s/epoch - 145ms/step
Epoch 2/3
- 328/328 - 43s - loss: 0.0968 - accuracy: 0.9632 - val_loss: 0.1043 - val_accuracy: 0.9649 - 43s/epoch - 131ms/step
+ 328/328 - 44s - loss: 0.0945 - accuracy: 0.9651 - val_loss: 0.1446 - val_accuracy: 0.9551 - 44s/epoch - 133ms/step
Epoch 3/3
- 328/328 - 43s - loss: 0.0726 - accuracy: 0.9717 - val_loss: 0.1013 - val_accuracy: 0.9649 - 43s/epoch - 131ms/step
+ 328/328 - 43s - loss: 0.0677 - accuracy: 0.9756 - val_loss: 0.1315 - val_accuracy: 0.9569 - 43s/epoch - 132ms/step
- <keras.callbacks.History object at 0x7f78ca9f8710>
+ <keras.callbacks.History object at 0x7f70d4431410>
@@ -857,7 +857,7 @@ Arduino tutorial for how to do that `on GitHub <https://github.com/guberti/tvm-a
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 5 minutes 0.605 seconds)
+ **Total running time of the script:** ( 4 minutes 26.523 seconds)
.. _sphx_glr_download_how_to_work_with_microtvm_micro_train.py:
diff --git a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
index 8f970e5892..b10f727017 100644
--- a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
@@ -5,22 +5,22 @@
Computation times
=================
-**07:05.688** total execution time for **how_to_work_with_microtvm** files:
+**06:34.283** total execution time for **how_to_work_with_microtvm** files:
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``) | 05:00.605 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``) | 04:26.523 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_pytorch.py` (``micro_pytorch.py``) | 01:02.402 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_pytorch.py` (``micro_pytorch.py``) | 01:04.216 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``) | 00:51.076 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``) | 00:51.761 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py` (``micro_aot.py``) | 00:07.810 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py` (``micro_aot.py``) | 00:07.958 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``) | 00:03.794 | 0.0 MB |
-+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_ethosu.py` (``micro_ethosu.py``) | 00:00.000 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``) | 00:03.825 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tvmc.py` (``micro_tvmc.py``) | 00:00.000 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_ethosu.py` (``micro_ethosu.py``) | 00:00.000 | 0.0 MB |
++---------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_microtvm_micro_reference_vm.py` (``micro_reference_vm.py``) | 00:00.000 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
index e22841e07e..52df9d40f3 100644
--- a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
Computation times
=================
-**00:43.660** total execution time for **how_to_work_with_relay** files:
+**00:43.986** total execution time for **how_to_work_with_relay** files:
+----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_pipeline_executor.py` (``using_pipeline_executor.py``) | 00:32.199 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_pipeline_executor.py` (``using_pipeline_executor.py``) | 00:32.343 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``) | 00:09.707 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``) | 00:10.036 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``) | 00:01.747 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``) | 00:01.601 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_relay_using_relay_viz.py` (``using_relay_viz.py``) | 00:00.006 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
index f7aec291f5..28ef062a33 100644
--- a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
@@ -264,7 +264,7 @@ The following example customizes CUDA lowering rule for :code:`exp`.
.. code-block:: none
- <function my_cuda_math_rule at 0x7f78cad5a950>
+ <function my_cuda_math_rule at 0x7f6ef8921950>
diff --git a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
index 0b5bde6046..f211577f4f 100644
--- a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
@@ -5,22 +5,22 @@
Computation times
=================
-**00:04.892** total execution time for **how_to_work_with_schedules** files:
+**00:05.055** total execution time for **how_to_work_with_schedules** files:
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``) | 00:02.380 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``) | 00:02.452 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``) | 00:01.147 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``) | 00:01.233 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``) | 00:00.588 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``) | 00:00.592 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``) | 00:00.566 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``) | 00:00.112 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``) | 00:00.113 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``) | 00:00.049 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``) | 00:00.027 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``) | 00:00.028 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``) | 00:00.023 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
index 3eadb9a90c..f693910715 100644
--- a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
@@ -347,7 +347,7 @@ The importing needs to happen before the tensorized GEMV being executed.
B: Buffer(B_2: Pointer(float32), float32, [512, 64], []),
C: Buffer(C_2: Pointer(float32), float32, [1024, 512], [])}
buffer_map = {A_1: A, B_1: B, C_1: C} {
- attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmp63fa7_ry/input0.cc'\nsource_filename = \"/tmp/tmp63fa7_ry/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = alloca float*, align 8\n %8 = alloca float*, align 8\n %9 = alloca floa [...]
+ attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpjscc11gc/input0.cc'\nsource_filename = \"/tmp/tmpjscc11gc/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = alloca float*, align 8\n %8 = alloca float*, align 8\n %9 = alloca floa [...]
for (i, 0, 1024) {
for (j.outer: int32, 0, 32) {
@tir.call_extern("gemv_update", @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_2, ((i*512) + (j.outer*16)), 16, 2, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_2, (i*64), 64, 1, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_2, (j.outer*1024), 1024, 1, dtype=handle), 16, 64, 64, dtype=int32)
diff --git a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
index 48a75c079d..a71bf32a07 100644
--- a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:26.347** total execution time for **topic_vta_tutorials_autotvm** files:
+**00:26.745** total execution time for **topic_vta_tutorials_autotvm** files:
+---------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:26.341 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:26.739 | 0.0 MB |
+---------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``) | 00:00.006 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``) | 00:00.007 | 0.0 MB |
+---------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
index 8123aa28df..e62deb3cfd 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
@@ -293,7 +293,7 @@ The compilation steps are:
DeprecationWarning,
/workspace/vta/tutorials/frontend/deploy_classification.py:213: DeprecationWarning: legacy graph executor behavior of producing json / lib / params will be removed in the next release. Please see documents of tvm.contrib.graph_executor.GraphModule for the new recommended usage.
relay_prog, target=tvm.target.Target(target, host=env.target_host), params=params
- resnet18_v1 inference graph built in 27.78s!
+ resnet18_v1 inference graph built in 29.31s!
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
index aa753d92f5..2a079e1f51 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
@@ -337,7 +337,7 @@ The compilation steps are:
/workspace/python/tvm/relay/build_module.py:348: DeprecationWarning: Please use input parameter mod (tvm.IRModule) instead of deprecated parameter mod (tvm.relay.function.Function)
DeprecationWarning,
- yolov3-tiny inference graph built in 19.36s!
+ yolov3-tiny inference graph built in 19.70s!
diff --git a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
index 195de70a26..e8cc4deb9e 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**01:30.668** total execution time for **topic_vta_tutorials_frontend** files:
+**01:32.668** total execution time for **topic_vta_tutorials_frontend** files:
+------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``) | 00:46.101 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``) | 00:46.480 | 0.0 MB |
+------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:44.567 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:46.188 | 0.0 MB |
+------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
index e388af5ae6..6503a7b9be 100644
--- a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:03.152** total execution time for **topic_vta_tutorials_optimize** files:
+**00:03.168** total execution time for **topic_vta_tutorials_optimize** files:
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``) | 00:02.680 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``) | 00:02.695 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.472 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
index da95f01665..f17f59dc36 100644
--- a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:00.843** total execution time for **topic_vta_tutorials** files:
+**00:00.833** total execution time for **topic_vta_tutorials** files:
+---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.452 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.434 | 0.0 MB |
+---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.391 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.399 | 0.0 MB |
+---------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
index 1e029626da..3fdbcf8fa8 100644
--- a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
@@ -329,7 +329,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 96.889 ms
+ Execution time of this operator: 94.767 ms
@@ -447,7 +447,7 @@ operations.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 12.697 seconds)
+ **Total running time of the script:** ( 1 minutes 29.244 seconds)
.. _sphx_glr_download_tutorial_auto_scheduler_matmul_x86.py:
diff --git a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
index bcd77e8dcf..f85fc4b417 100644
--- a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
@@ -454,16 +454,16 @@ reduce variance, we take 5 measurements and average them.
waiting for device...
device available
Get devices for measurement successfully!
- No: 1 GFLOPS: 11.52/11.52 result: MeasureResult(costs=(0.0233082582,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.618699312210083, timestamp=1673584898.2973168) [('tile_y', [-1, 256]), ('tile_x', [-1, 32])],None,58
- No: 2 GFLOPS: 13.02/13.02 result: MeasureResult(costs=(0.0206117574,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6708076000213623, timestamp=1673584898.8869452) [('tile_y', [-1, 128]), ('tile_x', [-1, 128])],None,77
- No: 3 GFLOPS: 11.10/13.02 result: MeasureResult(costs=(0.0241779036,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6823415756225586, timestamp=1673584900.2792547) [('tile_y', [-1, 4]), ('tile_x', [-1, 128])],None,72
- No: 4 GFLOPS: 12.48/13.02 result: MeasureResult(costs=(0.021511234400000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.569068431854248, timestamp=1673584900.881405) [('tile_y', [-1, 256]), ('tile_x', [-1, 512])],None,98
- No: 5 GFLOPS: 3.53/13.02 result: MeasureResult(costs=(0.0760025172,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4577972888946533, timestamp=1673584902.46397) [('tile_y', [-1, 8]), ('tile_x', [-1, 8])],None,33
- No: 6 GFLOPS: 2.42/13.02 result: MeasureResult(costs=(0.11107341100000001,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.0046513080596924, timestamp=1673584905.2472649) [('tile_y', [-1, 2]), ('tile_x', [-1, 4])],None,21
- No: 7 GFLOPS: 12.87/13.02 result: MeasureResult(costs=(0.0208598232,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5794329643249512, timestamp=1673584906.581666) [('tile_y', [-1, 8]), ('tile_x', [-1, 512])],None,93
- No: 8 GFLOPS: 0.50/13.02 result: MeasureResult(costs=(0.5343837918000001,), error_no=MeasureErrorNo.NO_ERROR, all_cost=8.791759252548218, timestamp=1673584915.3846312) [('tile_y', [-1, 64]), ('tile_x', [-1, 1])],None,6
- No: 9 GFLOPS: 13.28/13.28 result: MeasureResult(costs=(0.020211948200000003,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6139936447143555, timestamp=1673584916.1124177) [('tile_y', [-1, 256]), ('tile_x', [-1, 128])],None,78
- No: 10 GFLOPS: 3.11/13.28 result: MeasureResult(costs=(0.08643492659999999,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.589315414428711, timestamp=1673584917.7466106) [('tile_y', [-1, 256]), ('tile_x', [-1, 8])],None,38
+ No: 1 GFLOPS: 1.60/1.60 result: MeasureResult(costs=(0.1681077812,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.923945426940918, timestamp=1673587821.3763597) [('tile_y', [-1, 8]), ('tile_x', [-1, 1])],None,3
+ No: 2 GFLOPS: 8.21/8.21 result: MeasureResult(costs=(0.0327139448,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7715928554534912, timestamp=1673587822.1536953) [('tile_y', [-1, 4]), ('tile_x', [-1, 32])],None,52
+ No: 3 GFLOPS: 13.66/13.66 result: MeasureResult(costs=(0.0196553876,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5808987617492676, timestamp=1673587823.505404) [('tile_y', [-1, 128]), ('tile_x', [-1, 64])],None,67
+ No: 4 GFLOPS: 9.18/13.66 result: MeasureResult(costs=(0.0292502722,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7435061931610107, timestamp=1673587825.0071871) [('tile_y', [-1, 16]), ('tile_x', [-1, 32])],None,54
+ No: 5 GFLOPS: 2.76/13.66 result: MeasureResult(costs=(0.0973814744,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.7957935333251953, timestamp=1673587827.7336202) [('tile_y', [-1, 512]), ('tile_x', [-1, 8])],None,39
+ No: 6 GFLOPS: 0.51/13.66 result: MeasureResult(costs=(0.5282505302,), error_no=MeasureErrorNo.NO_ERROR, all_cost=8.686627626419067, timestamp=1673587836.4408274) [('tile_y', [-1, 128]), ('tile_x', [-1, 1])],None,7
+ No: 7 GFLOPS: 11.63/13.66 result: MeasureResult(costs=(0.0230880318,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6502699851989746, timestamp=1673587837.0687687) [('tile_y', [-1, 16]), ('tile_x', [-1, 256])],None,84
+ No: 8 GFLOPS: 3.65/13.66 result: MeasureResult(costs=(0.0736199068,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4238529205322266, timestamp=1673587838.5028274) [('tile_y', [-1, 128]), ('tile_x', [-1, 16])],None,47
+ No: 9 GFLOPS: 0.90/13.66 result: MeasureResult(costs=(0.2992170852,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.99980092048645, timestamp=1673587843.6152205) [('tile_y', [-1, 128]), ('tile_x', [-1, 2])],None,17
+ No: 10 GFLOPS: 3.10/13.66 result: MeasureResult(costs=(0.0866249486,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.5909092426300049, timestamp=1673587845.2550483) [('tile_y', [-1, 128]), ('tile_x', [-1, 8])],None,37
diff --git a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
index 1d3d0ccda1..1e66b6ede8 100644
--- a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
@@ -311,7 +311,7 @@ standard deviation.
.. code-block:: none
- {'mean': 511.403784489994, 'median': 511.6111203500168, 'std': 2.3768068098193593}
+ {'mean': 514.6882055699962, 'median': 515.0596661999998, 'std': 2.204362758970251}
@@ -545,30 +545,28 @@ the tuning data to.
.. code-block:: none
-
[Task 1/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 1/25] Current/Best: 10.98/ 22.44 GFLOPS | Progress: (4/20) | 7.34 s
[Task 1/25] Current/Best: 9.12/ 22.44 GFLOPS | Progress: (8/20) | 12.37 s
[Task 1/25] Current/Best: 4.84/ 22.44 GFLOPS | Progress: (12/20) | 18.72 s
[Task 1/25] Current/Best: 22.22/ 22.44 GFLOPS | Progress: (16/20) | 20.82 s
[Task 1/25] Current/Best: 6.74/ 22.44 GFLOPS | Progress: (20/20) | 24.79 s Done.
-
[Task 2/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 2/25] Current/Best: 12.52/ 16.64 GFLOPS | Progress: (4/20) | 3.53 s
[Task 2/25] Current/Best: 5.51/ 16.64 GFLOPS | Progress: (8/20) | 5.21 s
[Task 2/25] Current/Best: 6.32/ 16.64 GFLOPS | Progress: (12/20) | 7.24 s
[Task 2/25] Current/Best: 3.83/ 18.13 GFLOPS | Progress: (16/20) | 8.89 s
[Task 2/25] Current/Best: 7.65/ 18.13 GFLOPS | Progress: (20/20) | 10.51 s Done.
-
[Task 3/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 3/25] Current/Best: 9.27/ 21.76 GFLOPS | Progress: (4/20) | 4.09 s
[Task 3/25] Current/Best: 12.45/ 22.46 GFLOPS | Progress: (8/20) | 6.57 s
[Task 3/25] Current/Best: 5.38/ 23.45 GFLOPS | Progress: (12/20) | 8.87 s
[Task 3/25] Current/Best: 7.07/ 23.45 GFLOPS | Progress: (16/20) | 12.49 s
[Task 3/25] Current/Best: 14.83/ 23.45 GFLOPS | Progress: (20/20) | 14.45 s Done.
-
[Task 4/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 4/25] Current/Best: 6.32/ 9.05 GFLOPS | Progress: (4/20) | 10.85 s
[Task 4/25] Current/Best: 16.62/ 16.62 GFLOPS | Progress: (8/20) | 12.73 s
[Task 4/25] Current/Best: 16.76/ 16.76 GFLOPS | Progress: (12/20) | 21.33 s
[Task 4/25] Current/Best: 12.50/ 16.76 GFLOPS | Progress: (16/20) | 23.49 s
[Task 4/25] Current/Best: 13.25/ 16.76 GFLOPS | Progress: (20/20) | 30.10 s Done.
-
[Task 5/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 5/25] Current/Best: 3.50/ 15.27 GFLOPS | Progress: (4/20) | 4.42 s
[Task 5/25] Current/Best: 16.14/ 18.51 GFLOPS | Progress: (8/20) | 6.42 s
[Task 5/25] Current/Best: 14.04/ 18.51 GFLOPS | Progress: (12/20) | 8.47 s
[Task 5/25] Current/Best: 6.86/ 18.51 GFLOPS | Progress: (16/20) | 10.71 s
[Task 5/25] Current/Best: 8.18/ 18.51 GFLOPS | Progress: (20/20) | 12.60 s Done.
-
[Task 6/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 6/25] Current/Best: 7.16/ 17.81 GFLOPS | Progress: (4/20) | 4.05 s
[Task 6/25] Current/Best: 11.87/ 17.81 GFLOPS | Progress: (8/20) | 7.37 s
[Task 6/25] Current/Best: 20.63/ 20.63 GFLOPS | Progress: (12/20) | 9.72 s
[Task 6/25] Current/Best: 11.44/ 20.63 GFLOPS | Progress: (16/20) | 13.61 s
[Task 6/25] Current/Best: 4.61/ 20.63 GFLOPS | Progress: (20/20) | 16.49 s Done.
-
[Task 7/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 7/25] Current/Best: 11.78/ 15.98 GFLOPS | Progress: (4/20) | 5.53 s
[Task 7/25] Current/Best: 6.34/ 15.98 GFLOPS | Progress: (8/20) | 8.28 s
[Task 7/25] Current/Best: 6.06/ 18.20 GFLOPS | Progress: (12/20) | 11.94 s
[Task 7/25] Current/Best: 9.86/ 18.20 GFLOPS | Progress: (16/20) | 14.46 s
[Task 7/25] Current/Best: 18.08/ 18.20 GFLOPS | Progress: (20/20) | 16.57 s Done.
-
[Task 8/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 8/25] Current/Best: 9.77/ 10.91 GFLOPS | Progress: (4/20) | 10.88 s
[Task 8/25] Current/Best: 18.65/ 18.65 GFLOPS | Progress: (8/20) | 14.25 s
[Task 8/25] Current/Best: 11.19/ 18.65 GFLOPS | Progress: (12/20) | 16.81 s
[Task 8/25] Current/Best: 7.48/ 18.65 GFLOPS | Progress: (16/20) | 28.98 s
[Task 8/25] Current/Best: 12.41/ 18.65 GFLOPS | Progress: (20/20) | 34.53 s
[Task 9/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 9/25] Current/Best: 5.19/ 9.69 GFLOPS | Progress: (4/20) | 8.08 s
[Task 9/25] Current/Best: 13.99/ 19.34 GFLOPS | Progress: (8/20) | 10.95 s
[Task 9/25] Current/Best: 8.61/ 20.65 GFLOPS | Progress: (12/20) | 12.77 s
[Task 9/25] Current/Best: 18.26/ 20.65 GFLOPS | Progress: (16/20) | 14.52 s
[Task 9/25] Current/Best: 18.14/ 22.80 GFLOPS | Progress: (20
/20) | 19.18 s Done.
-
[Task 10/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 10/25] Current/Best: 3.86/ 15.59 GFLOPS | Progress: (4/20) | 4.17 s
[Task 10/25] Current/Best: 11.23/ 15.59 GFLOPS | Progress: (8/20) | 6.25 s
[Task 10/25] Current/Best: 22.05/ 22.05 GFLOPS | Progress: (12/20) | 8.18 s
[Task 10/25] Current/Best: 16.14/ 22.05 GFLOPS | Progress: (16/20) | 12.50 s
[Task 10/25] Current/Best: 12.29/ 22.05 GFLOPS | Progress: (20/20) | 14.12 s Done.
-
[Task 11/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 11/25] Current/Best: 15.90/ 18.04 GFLOPS | Progress: (4/20) | 4.13 s
[Task 11/25] Current/Best: 7.89/ 18.04 GFLOPS | Progress: (8/20) | 6.66 s
[Task 11/25] Current/Best: 17.11/ 18.42 GFLOPS | Progress: (12/20) | 8.82 s
[Task 11/25] Current/Best: 7.01/ 18.42 GFLOPS | Progress: (16/20) | 11.72 s
[Task 11/25] Current/Best: 14.40/ 18.42 GFLOPS | Progress: (20/20) | 14.04 s Done.
-
[Task 12/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 12/25] Current/Best: 13.59/ 13.59 GFLOPS | Progress: (4/20) | 5.95 s
[Task 12/25] Current/Best: 18.65/ 18.65 GFLOPS | Progress: (8/20) | 8.20 s
[Task 12/25] Current/Best: 22.13/ 22.13 GFLOPS | Progress: (12/20) | 14.56 s
[Task 12/25] Current/Best: 15.95/ 22.13 GFLOPS | Progress: (16/20) | 17.40 s
[Task 12/25] Current/Best: 4.08/ 22.13 GFLOPS | Progress: (20/20) | 20.56 s Done.
-
[Task 13/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 13/25] Current/Best: 20.17/ 20.17 GFLOPS | Progress: (4/20) | 4.80 s
[Task 13/25] Current/Best: 18.18/ 20.33 GFLOPS | Progress: (8/20) | 7.32 s
[Task 13/25] Current/Best: 9.83/ 22.14 GFLOPS | Progress: (12/20) | 9.54 s
[Task 13/25] Current/Best: 13.52/ 22.14 GFLOPS | Progress: (16/20) | 12.94 s
[Task 13/25] Current/Best: 11.96/ 22.14 GFLOPS | Progress: (20/20) | 16.98 s Done.
-
[Task 14/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 14/25] Current/Best: 12.71/ 13.40 GFLOPS | Progress: (4/20) | 8.49 s
[Task 14/25] Current/Best: 8.45/ 14.55 GFLOPS | Progress: (8/20) | 16.17 s
[Task 14/25] Current/Best: 8.68/ 14.55 GFLOPS | Progress: (12/20) | 18.72 s
[Task 14/25] Current/Best: 4.66/ 14.55 GFLOPS | Progress: (16/20) | 26.43 s
[Task 14/25] Current/Best: 11.41/ 14.55 GFLOPS | Progress: (20/20) | 28.87 s
[Task 15/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 15/25] Current/Best: 9.18/ 17.91 GFLOPS | Progress: (4/20) | 4.01 s
[Task 15/25] Current/Best: 3.16/ 17.91 GFLOPS | Progress: (8/20) | 11.40 s
[Task 15/25] Current/Best: 17.68/ 17.91 GFLOPS | Progress: (12/20) | 13.27 s
[Task 15/25] Current/Best: 18.59/ 20.68 GFLOPS | Progress: (16/20) | 15.60 s
[Task 15/25] Current/Best: 18.40/ 20.68 GFLOPS | Progress: (20/
20) | 18.53 s Done.
-
[Task 16/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 16/25] Current/Best: 14.32/ 18.02 GFLOPS | Progress: (4/20) | 3.71 s
[Task 16/25] Current/Best: 10.47/ 18.02 GFLOPS | Progress: (8/20) | 5.55 s
[Task 16/25] Current/Best: 15.09/ 18.02 GFLOPS | Progress: (12/20) | 8.04 s
[Task 16/25] Current/Best: 13.55/ 18.02 GFLOPS | Progress: (16/20) | 10.33 s
[Task 16/25] Current/Best: 10.65/ 18.02 GFLOPS | Progress: (20/20) | 11.91 s Done.
-
[Task 17/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 17/25] Current/Best: 9.48/ 16.56 GFLOPS | Progress: (4/20) | 5.05 s
[Task 17/25] Current/Best: 10.07/ 16.56 GFLOPS | Progress: (8/20) | 7.65 s Done.
- Done.
-
[Task 17/25] Current/Best: 15.52/ 18.97 GFLOPS | Progress: (12/20) | 11.04 s
[Task 17/25] Current/Best: 8.88/ 19.47 GFLOPS | Progress: (16/20) | 14.05 s
[Task 17/25] Current/Best: 21.37/ 21.37 GFLOPS | Progress: (20/20) | 16.75 s Done.
-
[Task 18/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 18/25] Current/Best: 21.04/ 21.04 GFLOPS | Progress: (4/20) | 4.13 s
[Task 18/25] Current/Best: 11.44/ 21.04 GFLOPS | Progress: (8/20) | 8.24 s
[Task 18/25] Current/Best: 5.76/ 21.04 GFLOPS | Progress: (12/20) | 12.16 s
[Task 18/25] Current/Best: 9.65/ 21.04 GFLOPS | Progress: (16/20) | 17.39 s
[Task 18/25] Current/Best: 10.33/ 21.04 GFLOPS | Progress: (20/20) | 23.27 s Done.
-
[Task 19/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 19/25] Current/Best: 12.79/ 18.71 GFLOPS | Progress: (4/20) | 4.33 s
[Task 19/25] Current/Best: 15.75/ 18.71 GFLOPS | Progress: (8/20) | 6.63 s
[Task 19/25] Current/Best: 5.38/ 18.71 GFLOPS | Progress: (12/20) | 9.64 s
[Task 19/25] Current/Best: 18.76/ 23.77 GFLOPS | Progress: (16/20) | 12.81 s
[Task 19/25] Current/Best: 16.53/ 23.77 GFLOPS | Progress: (20/20) | 18.05 s Done.
-
[Task 20/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 20/25] Current/Best: 15.08/ 15.08 GFLOPS | Progress: (4/20) | 3.84 s
[Task 20/25] Current/Best: 20.41/ 20.41 GFLOPS | Progress: (8/20) | 7.21 s
[Task 20/25] Current/Best: 13.20/ 20.41 GFLOPS | Progress: (12/20) | 9.04 s
[Task 20/25] Current/Best: 4.81/ 20.41 GFLOPS | Progress: (16/20) | 13.56 s
[Task 20/25] Current/Best: 8.27/ 20.41 GFLOPS | Progress: (20/20) | 16.46 s
[Task 21/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 21/25] Current/Best: 6.67/ 7.36 GFLOPS | Progress: (4/20) | 5.53 s
[Task 21/25] Current/Best: 12.38/ 12.58 GFLOPS | Progress: (8/20) | 6.99 s
[Task 21/25] Current/Best: 14.83/ 14.83 GFLOPS | Progress: (12/20) | 11.90 s
[Task 21/25] Current/Best: 12.37/ 14.83 GFLOPS | Progress: (16/20) | 14.19 s Done.
-
[Task 21/25] Current/Best: 10.10/ 17.54 GFLOPS | Progress: (20/20) | 17.64 s
[Task 22/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 22/25] Current/Best: 15.20/ 18.49 GFLOPS | Progress: (4/20) | 4.39 s
[Task 22/25] Current/Best: 4.15/ 18.49 GFLOPS | Progress: (8/20) | 6.37 s
[Task 22/25] Current/Best: 10.90/ 18.49 GFLOPS | Progress: (12/20) | 8.07 s
[Task 22/25] Current/Best: 17.94/ 18.49 GFLOPS | Progress: (16/20) | 10.05 s
[Task 22/25] Current/Best: 4.58/ 18.49 GFLOPS | Progress: (20/20) | 12.20 s Done.
-
[Task 23/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 23/25] Current/Best: 9.50/ 22.46 GFLOPS | Progress: (4/20) | 5.12 s
[Task 23/25] Current/Best: 18.87/ 22.46 GFLOPS | Progress: (8/20) | 13.19 s
[Task 23/25] Current/Best: 8.18/ 22.46 GFLOPS | Progress: (12/20) | 16.88 s
[Task 23/25] Current/Best: 14.83/ 22.46 GFLOPS | Progress: (16/20) | 21.46 s
[Task 23/25] Current/Best: 2.38/ 22.46 GFLOPS | Progress: (20/20) | 25.82 s Done.
-
[Task 24/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 24/25] Current/Best: 2.87/ 3.14 GFLOPS | Progress: (4/20) | 7.47 s
[Task 24/25] Current/Best: 10.31/ 10.31 GFLOPS | Progress: (8/20) | 16.81 s
[Task 24/25] Current/Best: 3.29/ 10.31 GFLOPS | Progress: (12/20) | 27.46 s
[Task 24/25] Current/Best: 3.12/ 10.31 GFLOPS | Progress: (16/20) | 38.24 s
[Task 24/25] Current/Best: 4.25/ 10.31 GFLOPS | Progress: (20/20) | 44.49 s
[Task 25/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 25/25] Current/Best: 1.55/ 8.03 GFLOPS | Progress: (4/20) | 12.77 s Done.
-
[Task 25/25] Current/Best: 1.54/ 8.03 GFLOPS | Progress: (8/20) | 17.75 s
[Task 25/25] Current/Best: 5.28/ 8.03 GFLOPS | Progress: (12/20) | 20.57 s
[Task 25/25] Current/Best: 5.74/ 8.03 GFLOPS | Progress: (16/20) | 31.53 s
[Task 25/25] Current/Best: 3.02/ 8.03 GFLOPS | Progress: (20/20) | 42.50 s
+
[Task 1/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 1/25] Current/Best: 6.65/ 21.92 GFLOPS | Progress: (4/20) | 7.79 s
[Task 1/25] Current/Best: 9.92/ 22.16 GFLOPS | Progress: (8/20) | 10.84 s
[Task 1/25] Current/Best: 8.54/ 22.16 GFLOPS | Progress: (12/20) | 14.55 s
[Task 1/25] Current/Best: 5.65/ 22.16 GFLOPS | Progress: (16/20) | 18.37 s
[Task 1/25] Current/Best: 10.97/ 22.16 GFLOPS | Progress: (20/20) | 21.47 s Done.
+
[Task 2/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 2/25] Current/Best: 9.86/ 11.86 GFLOPS | Progress: (4/20) | 3.61 s
[Task 2/25] Current/Best: 11.53/ 11.86 GFLOPS | Progress: (8/20) | 5.51 s
[Task 2/25] Current/Best: 5.60/ 13.35 GFLOPS | Progress: (12/20) | 7.25 s
[Task 2/25] Current/Best: 13.32/ 21.92 GFLOPS | Progress: (16/20) | 8.73 s
[Task 2/25] Current/Best: 17.91/ 21.92 GFLOPS | Progress: (20/20) | 11.09 s Done.
+
[Task 3/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 3/25] Current/Best: 16.91/ 19.46 GFLOPS | Progress: (4/20) | 4.09 s
[Task 3/25] Current/Best: 5.74/ 23.42 GFLOPS | Progress: (8/20) | 6.60 s
[Task 3/25] Current/Best: 12.65/ 23.42 GFLOPS | Progress: (12/20) | 9.68 s
[Task 3/25] Current/Best: 17.62/ 23.42 GFLOPS | Progress: (16/20) | 12.21 s
[Task 3/25] Current/Best: 11.21/ 23.42 GFLOPS | Progress: (20/20) | 14.28 s Done.
+
[Task 4/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 4/25] Current/Best: 14.75/ 16.67 GFLOPS | Progress: (4/20) | 4.63 s
[Task 4/25] Current/Best: 18.59/ 18.59 GFLOPS | Progress: (8/20) | 15.71 s
[Task 4/25] Current/Best: 12.89/ 18.59 GFLOPS | Progress: (12/20) | 22.03 s
[Task 4/25] Current/Best: 13.24/ 21.66 GFLOPS | Progress: (16/20) | 24.55 s
[Task 4/25] Current/Best: 5.04/ 21.66 GFLOPS | Progress: (20/20) | 31.22 s Done.
+
[Task 5/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 5/25] Current/Best: 5.19/ 14.15 GFLOPS | Progress: (4/20) | 3.82 s
[Task 5/25] Current/Best: 19.06/ 19.06 GFLOPS | Progress: (8/20) | 5.69 s
[Task 5/25] Current/Best: 18.13/ 19.06 GFLOPS | Progress: (12/20) | 7.30 s
[Task 5/25] Current/Best: 8.80/ 19.06 GFLOPS | Progress: (16/20) | 9.43 s
[Task 5/25] Current/Best: 6.10/ 19.34 GFLOPS | Progress: (20/20) | 11.71 s Done.
+
[Task 6/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 6/25] Current/Best: 3.20/ 18.09 GFLOPS | Progress: (4/20) | 4.86 s
[Task 6/25] Current/Best: 14.52/ 20.10 GFLOPS | Progress: (8/20) | 7.40 s
[Task 6/25] Current/Best: 11.62/ 20.10 GFLOPS | Progress: (12/20) | 11.34 s
[Task 6/25] Current/Best: 17.79/ 20.10 GFLOPS | Progress: (16/20) | 16.03 s
[Task 6/25] Current/Best: 22.65/ 22.65 GFLOPS | Progress: (20/20) | 18.27 s Done.
+
[Task 7/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 7/25] Current/Best: 7.61/ 19.16 GFLOPS | Progress: (4/20) | 4.03 s
[Task 7/25] Current/Best: 13.26/ 19.16 GFLOPS | Progress: (8/20) | 6.27 s
[Task 7/25] Current/Best: 6.14/ 22.56 GFLOPS | Progress: (12/20) | 8.75 s
[Task 7/25] Current/Best: 11.62/ 22.56 GFLOPS | Progress: (16/20) | 11.33 s
[Task 7/25] Current/Best: 16.93/ 22.56 GFLOPS | Progress: (20/20) | 13.63 s Done.
+
[Task 8/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 8/25] Current/Best: 16.08/ 18.64 GFLOPS | Progress: (4/20) | 9.71 s
[Task 8/25] Current/Best: 7.90/ 23.09 GFLOPS | Progress: (8/20) | 12.25 s
[Task 8/25] Current/Best: 4.63/ 23.09 GFLOPS | Progress: (12/20) | 15.82 s
[Task 8/25] Current/Best: 15.66/ 23.09 GFLOPS | Progress: (16/20) | 18.77 s
[Task 8/25] Current/Best: 9.08/ 23.09 GFLOPS | Progress: (20/20) | 27.69 s Done.
+
[Task 9/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 9/25] Current/Best: 11.64/ 16.31 GFLOPS | Progress: (4/20) | 6.91 s
[Task 9/25] Current/Best: 21.12/ 21.12 GFLOPS | Progress: (8/20) | 8.46 s
[Task 9/25] Current/Best: 15.47/ 21.12 GFLOPS | Progress: (12/20) | 11.43 s
[Task 9/25] Current/Best: 6.55/ 21.12 GFLOPS | Progress: (16/20) | 15.67 s
[Task 9/25] Current/Best: 18.09/ 21.12 GFLOPS | Progress: (20/20) | 26.68 s
[Task 10/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 10/25] Current/Best: 6.92/ 12.05 GFLOPS | Progress: (4/20) | 4.70 s
[Task 10/25] Current/Best: 10.36/ 12.99 GFLOPS | Progress: (8/20) | 8.13 s
[Task 10/25] Current/Best: 14.83/ 15.21 GFLOPS | Progress: (12/20) | 10.34 s
[Task 10/25] Current/Best: 11.83/ 21.92 GFLOPS | Progress: (16/20) | 12.76 s
[Task 10/25] Current/Best: 4.29/ 21.92 GFLOPS | Progress: (20/20
) | 15.17 s Done.
+
[Task 11/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 11/25] Current/Best: 9.62/ 22.56 GFLOPS | Progress: (4/20) | 3.89 s
[Task 11/25] Current/Best: 9.42/ 22.63 GFLOPS | Progress: (8/20) | 6.27 s
[Task 11/25] Current/Best: 21.71/ 22.63 GFLOPS | Progress: (12/20) | 8.77 s
[Task 11/25] Current/Best: 19.16/ 22.63 GFLOPS | Progress: (16/20) | 10.94 s
[Task 11/25] Current/Best: 10.07/ 22.63 GFLOPS | Progress: (20/20) | 13.40 s Done.
+
[Task 12/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 12/25] Current/Best: 9.16/ 15.20 GFLOPS | Progress: (4/20) | 6.13 s
[Task 12/25] Current/Best: 18.71/ 20.26 GFLOPS | Progress: (8/20) | 10.34 s
[Task 12/25] Current/Best: 11.63/ 20.26 GFLOPS | Progress: (12/20) | 14.73 s
[Task 12/25] Current/Best: 10.43/ 20.34 GFLOPS | Progress: (16/20) | 20.22 s
[Task 12/25] Current/Best: 13.89/ 20.34 GFLOPS | Progress: (20/20) | 24.89 s Done.
+
[Task 13/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 13/25] Current/Best: 16.23/ 16.23 GFLOPS | Progress: (4/20) | 4.32 s
[Task 13/25] Current/Best: 11.21/ 16.23 GFLOPS | Progress: (8/20) | 7.48 s
[Task 13/25] Current/Best: 22.79/ 22.79 GFLOPS | Progress: (12/20) | 10.76 s
[Task 13/25] Current/Best: 11.42/ 22.79 GFLOPS | Progress: (16/20) | 14.23 s
[Task 13/25] Current/Best: 13.06/ 22.79 GFLOPS | Progress: (20/20) | 16.59 s Done.
+
[Task 14/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 14/25] Current/Best: 16.85/ 17.54 GFLOPS | Progress: (4/20) | 3.49 s
[Task 14/25] Current/Best: 20.97/ 20.97 GFLOPS | Progress: (8/20) | 7.44 s
[Task 14/25] Current/Best: 2.39/ 20.97 GFLOPS | Progress: (12/20) | 11.64 s
[Task 14/25] Current/Best: 14.74/ 20.97 GFLOPS | Progress: (16/20) | 15.69 s Done.
+
[Task 14/25] Current/Best: 6.42/ 20.97 GFLOPS | Progress: (20/20) | 19.76 s
[Task 15/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 15/25] Current/Best: 14.39/ 17.74 GFLOPS | Progress: (4/20) | 4.01 s
[Task 15/25] Current/Best: 16.36/ 20.38 GFLOPS | Progress: (8/20) | 6.24 s
[Task 15/25] Current/Best: 12.33/ 20.38 GFLOPS | Progress: (12/20) | 11.28 s
[Task 15/25] Current/Best: 11.78/ 20.38 GFLOPS | Progress: (16/20) | 16.12 s
[Task 15/25] Current/Best: 17.52/ 20.38 GFLOPS | Progress: (20/20) | 17.94 s
[Task 16/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 16/25] Current/Best: 11.66/ 21.07 GFLOPS | Progress: (4/20) | 5.14 s
[Task 16/25] Current/Best: 9.41/ 21.07 GFLOPS | Progress: (8/20) | 8.16 s
[Task 16/25] Current/Best: 18.10/ 21.07 GFLOPS | Progress: (12/20) | 9.93 s
[Task 16/25] Current/Best: 14.17/ 21.07 GFLOPS | Progress: (16/20)
| 12.08 s
[Task 16/25] Current/Best: 1.57/ 21.07 GFLOPS | Progress: (20/20) | 16.02 s Done.
+
[Task 17/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 17/25] Current/Best: 17.55/ 17.55 GFLOPS | Progress: (4/20) | 5.01 s
[Task 17/25] Current/Best: 18.68/ 18.68 GFLOPS | Progress: (8/20) | 8.00 s
[Task 17/25] Current/Best: 18.22/ 21.81 GFLOPS | Progress: (12/20) | 10.61 s
[Task 17/25] Current/Best: 13.61/ 21.81 GFLOPS | Progress: (16/20) | 12.98 s
[Task 17/25] Current/Best: 16.55/ 21.81 GFLOPS | Progress: (20/20) | 15.37 s Done.
+
[Task 18/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 18/25] Current/Best: 15.04/ 16.28 GFLOPS | Progress: (4/20) | 4.21 s
[Task 18/25] Current/Best: 13.67/ 16.28 GFLOPS | Progress: (8/20) | 6.33 s
[Task 18/25] Current/Best: 11.95/ 17.35 GFLOPS | Progress: (12/20) | 9.32 s
[Task 18/25] Current/Best: 5.09/ 17.35 GFLOPS | Progress: (16/20) | 15.63 s
[Task 18/25] Current/Best: 9.82/ 17.35 GFLOPS | Progress: (20/20) | 17.97 s Done.
+
[Task 19/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 19/25] Current/Best: 10.61/ 11.29 GFLOPS | Progress: (4/20) | 7.61 s
[Task 19/25] Current/Best: 8.78/ 11.29 GFLOPS | Progress: (8/20) | 13.52 s
[Task 19/25] Current/Best: 23.04/ 23.04 GFLOPS | Progress: (12/20) | 17.27 s
[Task 19/25] Current/Best: 8.46/ 23.04 GFLOPS | Progress: (16/20) | 19.81 s
[Task 19/25] Current/Best: 2.69/ 23.04 GFLOPS | Progress: (20/20) | 23.50 s Done.
+
[Task 20/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 20/25] Current/Best: 5.32/ 5.32 GFLOPS | Progress: (4/20) | 4.00 s
[Task 20/25] Current/Best: 5.20/ 17.73 GFLOPS | Progress: (8/20) | 8.45 s
[Task 20/25] Current/Best: 15.65/ 19.27 GFLOPS | Progress: (12/20) | 11.42 s
[Task 20/25] Current/Best: 5.71/ 19.27 GFLOPS | Progress: (16/20) | 13.58 s Done.
+
[Task 20/25] Current/Best: 11.10/ 19.27 GFLOPS | Progress: (20/20) | 16.61 s
[Task 21/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 21/25] Current/Best: 9.02/ 10.22 GFLOPS | Progress: (4/20) | 5.24 s
[Task 21/25] Current/Best: 7.50/ 10.22 GFLOPS | Progress: (8/20) | 7.86 s
[Task 21/25] Current/Best: 8.66/ 16.63 GFLOPS | Progress: (12/20) | 11.39 s
[Task 21/25] Current/Best: 8.55/ 17.61 GFLOPS | Progress: (16/20) | 13.00 s
[Task 21/25] Current/Best: 19.92/ 19.92 GFLOPS | Progress: (20/20) | 15.38 s
[Task 22/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 22/25] Current/Best: 15.84/ 15.84 GFLOPS | Progress: (4/20) | 3.47 s
[Task 22/25] Current/Best: 6.75/ 15.84 GFLOPS | Progress: (8/20) | 7.81 s
[Task 22/25] Current/Best: 20.25/ 20.25 GFLOPS | Progress: (12/20) | 10.89 s
[Task 22/25] Current/Best: 13.26/ 20.25 GFLOPS | Progress: (16/20
) | 15.09 s
[Task 22/25] Current/Best: 10.96/ 20.25 GFLOPS | Progress: (20/20) | 18.18 s Done.
+
[Task 23/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 23/25] Current/Best: 1.55/ 23.84 GFLOPS | Progress: (4/20) | 6.46 s
[Task 23/25] Current/Best: 12.86/ 23.84 GFLOPS | Progress: (8/20) | 13.78 s
[Task 23/25] Current/Best: 2.68/ 23.84 GFLOPS | Progress: (12/20) | 18.03 s
[Task 23/25] Current/Best: 10.26/ 23.84 GFLOPS | Progress: (16/20) | 20.60 s
[Task 23/25] Current/Best: 18.62/ 23.84 GFLOPS | Progress: (20/20) | 23.67 s Done.
+
[Task 24/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 24/25] Current/Best: 3.52/ 3.52 GFLOPS | Progress: (4/20) | 12.16 s
[Task 24/25] Current/Best: 9.75/ 9.75 GFLOPS | Progress: (8/20) | 15.02 s Done.
+
[Task 24/25] Current/Best: 9.06/ 10.19 GFLOPS | Progress: (12/20) | 25.65 s
[Task 24/25] Current/Best: 3.09/ 10.76 GFLOPS | Progress: (16/20) | 32.22 s
[Task 24/25] Current/Best: 5.89/ 10.76 GFLOPS | Progress: (20/20) | 42.58 s
[Task 25/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 25/25] Current/Best: 8.71/ 8.71 GFLOPS | Progress: (4/20) | 7.36 s
[Task 25/25] Current/Best: 6.68/ 9.04 GFLOPS | Progress: (8/20) | 9.88 s
[Task 25/25] Current/Best: 8.95/ 9.04 GFLOPS | Progress: (12/20) | 12.43 s
[Task 25/25] Current/Best: 1.55/ 9.04 GFLOPS | Progress: (16/20) | 23.40 s
[Task 25/25] Current/Best: 2.88/ 9.04 GFLOPS | Progress: (20/20) | 34.36 s
@@ -664,8 +662,8 @@ Verify that the optimized model runs and produces the same results:
.. code-block:: none
- class='n02123045 tabby, tabby cat' with probability=0.621102
- class='n02123159 tiger cat' with probability=0.356379
+ class='n02123045 tabby, tabby cat' with probability=0.621104
+ class='n02123159 tiger cat' with probability=0.356378
class='n02124075 Egyptian cat' with probability=0.019712
class='n02129604 tiger, Panthera tigris' with probability=0.001215
class='n04040759 radiator' with probability=0.000262
@@ -722,8 +720,8 @@ improvement in comparing the optimized model to the unoptimized model.
.. code-block:: none
- optimized: {'mean': 428.7723751500198, 'median': 427.57894520000264, 'std': 4.358784697100389}
- unoptimized: {'mean': 511.403784489994, 'median': 511.6111203500168, 'std': 2.3768068098193593}
+ optimized: {'mean': 395.48081421999996, 'median': 395.524960100056, 'std': 0.497322881618209}
+ unoptimized: {'mean': 514.6882055699962, 'median': 515.0596661999998, 'std': 2.204362758970251}
@@ -746,7 +744,7 @@ profiling/benchmarking.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 12 minutes 10.853 seconds)
+ **Total running time of the script:** ( 11 minutes 56.526 seconds)
.. _sphx_glr_download_tutorial_autotvm_relay_x86.py:
diff --git a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
index 8bdaf7e387..99a1b1197d 100644
--- a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
+++ b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
@@ -274,7 +274,7 @@ device and returns the measured cost. Network overhead is excluded.
.. code-block:: none
- 1.305e-07 secs/op
+ 1.51e-07 secs/op
diff --git a/docs/_sources/tutorial/intro_topi.rst.txt b/docs/_sources/tutorial/intro_topi.rst.txt
index ce4ea093a9..aa74c789ff 100644
--- a/docs/_sources/tutorial/intro_topi.rst.txt
+++ b/docs/_sources/tutorial/intro_topi.rst.txt
@@ -263,7 +263,7 @@ As you can see, scheduled stages of computation have been accumulated and we can
.. code-block:: none
- [stage(a, placeholder(a, 0x8c56520)), stage(b, placeholder(b, 0x194af2b0)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min [...]
+ [stage(a, placeholder(a, 0x72da850)), stage(b, placeholder(b, 0x60cad60)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min= [...]
diff --git a/docs/_sources/tutorial/sg_execution_times.rst.txt b/docs/_sources/tutorial/sg_execution_times.rst.txt
index d24c2a3a26..0ab8b0906e 100644
--- a/docs/_sources/tutorial/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorial/sg_execution_times.rst.txt
@@ -5,24 +5,24 @@
Computation times
=================
-**15:21.143** total execution time for **tutorial** files:
+**15:31.328** total execution time for **tutorial** files:
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``) | 12:10.853 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``) | 11:56.526 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 01:12.697 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 01:29.244 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``) | 00:58.100 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``) | 00:59.130 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``) | 00:33.539 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``) | 00:33.731 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``) | 00:24.389 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``) | 00:31.117 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``) | 00:00.819 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``) | 00:00.818 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``) | 00:00.590 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``) | 00:00.600 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.156 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.162 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorial_uma.py` (``uma.py``) | 00:00.000 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
index 62206a8d7c..ffa468f5a7 100644
--- a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
+++ b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
@@ -384,7 +384,7 @@ compile and run this new schedule with the parallel operation applied:
.. code-block:: none
- parallel: 0.000013
+ parallel: 0.000007
@@ -439,7 +439,7 @@ factor to be the number of threads on your CPU.
.. code-block:: none
- vector: 0.000045
+ vector: 0.000025
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [n: int32], [stride: int32], type="auto"),
@@ -490,10 +490,10 @@ We can now compare the different schedules
.. code-block:: none
Operator Timing Performance
- numpy 7.8052899971226e-06 1.0
- naive 6.6741999999999996e-06 0.8550867427681005
- parallel 1.2623599999999997e-05 1.6173133867740541
- vector 4.4663e-05 5.72214485515143
+ numpy 7.739580005363678e-06 1.0
+ naive 6.676e-06 0.8625791057619927
+ parallel 6.9759e-06 0.9013279784129845
+ vector 2.47214e-05 3.1941526520647883
@@ -914,7 +914,7 @@ matrix multiplication.
.. code-block:: none
- Numpy running time: 0.017866
+ Numpy running time: 0.018384
@@ -972,7 +972,7 @@ optimizations.
.. code-block:: none
- none: 3.199724
+ none: 3.255440
@@ -1074,7 +1074,7 @@ schedule.
.. code-block:: none
- blocking: 0.296646
+ blocking: 0.317528
@@ -1169,7 +1169,7 @@ already cache friendly from our previous optimizations.
.. code-block:: none
- vectorization: 0.333349
+ vectorization: 0.336595
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
@@ -1242,7 +1242,7 @@ more cache friendly.
.. code-block:: none
- loop permutation: 0.115069
+ loop permutation: 0.116478
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
@@ -1340,7 +1340,7 @@ optimized schedule.
.. code-block:: none
- array packing: 0.107997
+ array packing: 0.107964
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
@@ -1432,7 +1432,7 @@ to `C` when all the block results are ready.
.. code-block:: none
- block caching: 0.110341
+ block caching: 0.110722
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
@@ -1517,7 +1517,7 @@ of thread-level parallelization.
.. code-block:: none
- parallelization: 0.146212
+ parallelization: 0.143333
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
@@ -1597,13 +1597,13 @@ working, we can compare the results.
.. code-block:: none
Operator Timing Performance
- none 3.1997244635 1.0
- blocking 0.29664561580000004 0.09270973772395263
- vectorization 0.3333486351 0.10418041894000102
- loop permutation 0.11506930789999999 0.035962255254357776
- array packing 0.1079969512 0.033751953467227036
- block caching 0.1103413642 0.03448464561829919
- parallelization 0.14621170560000002 0.04569509258308673
+ none 3.2554399361999997 1.0
+ blocking 0.3175280135 0.09753766609825495
+ vectorization 0.33659461879999997 0.10339451054130003
+ loop permutation 0.116477927 0.03577947352208317
+ array packing 0.1079637838 0.033164114809632654
+ block caching 0.1107224816 0.0340115264818075
+ parallelization 0.1433327474 0.044028687430587035
diff --git a/docs/commit_hash b/docs/commit_hash
index bf1b7576d4..4e03a5ea32 100644
--- a/docs/commit_hash
+++ b/docs/commit_hash
@@ -1 +1 @@
-287597b45daafab435913b8fd6566b89923a743a
+f9759920e0f9fc2d01b86ed540e5528f0de896e9
diff --git a/docs/how_to/compile_models/from_darknet.html b/docs/how_to/compile_models/from_darknet.html
index 5559e25054..b6358a4d7d 100644
--- a/docs/how_to/compile_models/from_darknet.html
+++ b/docs/how_to/compile_models/from_darknet.html
@@ -585,7 +585,7 @@ class:['truck 0.9266'] left:471 top:83 right:689 bottom:169
class:['bicycle 0.9984'] left:111 top:113 right:577 bottom:447
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 8.780 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 9.985 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-darknet-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7716f96385bd5abb6e822041e285be54/from_darknet.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_darknet.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/from_keras.html b/docs/how_to/compile_models/from_keras.html
index 2393029917..d94f1dbdc4 100644
--- a/docs/how_to/compile_models/from_keras.html
+++ b/docs/how_to/compile_models/from_keras.html
@@ -506,7 +506,7 @@ Tensorflow is also required since itβs used as the default backend of keras.</
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Relay top-1 id: 285, class name: Egyptian cat
1/1 [==============================] - ETA: 0s
-1/1 [==============================] - 1s 911ms/step
+1/1 [==============================] - 1s 969ms/step
Keras top-1 id: 285, class name: Egyptian cat
</pre></div>
</div>
diff --git a/docs/how_to/compile_models/from_mxnet.html b/docs/how_to/compile_models/from_mxnet.html
index 69df9f7ed8..4b0dbe6476 100644
--- a/docs/how_to/compile_models/from_mxnet.html
+++ b/docs/how_to/compile_models/from_mxnet.html
@@ -439,7 +439,7 @@
<span class="nb">print</span><span class="p">(</span><span class="s2">"x"</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#tuple" title="builtins.tuple" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">x</span><span class="o">.</span><span class="n">shape</span></a><span class="p">)</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zipd6e61b3c-d97a-43be-82a8-fe50d3492aa1 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip87b1ba24-3b71-44a9-87e6-cb35d462db82 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
x (1, 3, 224, 224)
</pre></div>
</div>
diff --git a/docs/how_to/compile_models/from_oneflow.html b/docs/how_to/compile_models/from_oneflow.html
index 3997c7eeb0..5eb155d5b6 100644
--- a/docs/how_to/compile_models/from_oneflow.html
+++ b/docs/how_to/compile_models/from_oneflow.html
@@ -449,12 +449,11 @@ Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdo
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
0%| | 0.00/41.5M [00:00<?, ?B/s]
- 19%|#9 | 7.99M/41.5M [00:00<00:00, 66.0MB/s]
- 39%|###8 | 16.0M/41.5M [00:00<00:00, 71.7MB/s]
- 55%|#####5 | 22.9M/41.5M [00:00<00:00, 67.3MB/s]
- 71%|####### | 29.3M/41.5M [00:00<00:00, 62.1MB/s]
- 92%|#########2| 38.3M/41.5M [00:00<00:00, 67.9MB/s]
-100%|##########| 41.5M/41.5M [00:00<00:00, 67.6MB/s]
+ 19%|#9 | 7.99M/41.5M [00:00<00:00, 56.1MB/s]
+ 39%|###8 | 16.0M/41.5M [00:00<00:00, 61.1MB/s]
+ 58%|#####7 | 24.0M/41.5M [00:00<00:00, 52.6MB/s]
+ 77%|#######7 | 32.0M/41.5M [00:00<00:00, 57.4MB/s]
+100%|##########| 41.5M/41.5M [00:00<00:00, 64.2MB/s]
</pre></div>
</div>
</div>
diff --git a/docs/how_to/compile_models/from_pytorch.html b/docs/how_to/compile_models/from_pytorch.html
index 4a937b4608..315d36291f 100644
--- a/docs/how_to/compile_models/from_pytorch.html
+++ b/docs/how_to/compile_models/from_pytorch.html
@@ -432,11 +432,10 @@ be unstable.</p>
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
0%| | 0.00/44.7M [00:00<?, ?B/s]
- 18%|#7 | 7.99M/44.7M [00:00<00:00, 65.3MB/s]
- 38%|###7 | 16.9M/44.7M [00:00<00:00, 80.3MB/s]
- 68%|######7 | 30.3M/44.7M [00:00<00:00, 105MB/s]
- 91%|######### | 40.5M/44.7M [00:00<00:00, 94.5MB/s]
-100%|##########| 44.7M/44.7M [00:00<00:00, 97.0MB/s]
+ 24%|##3 | 10.6M/44.7M [00:00<00:00, 97.0MB/s]
+ 54%|#####3 | 24.0M/44.7M [00:00<00:00, 114MB/s]
+ 78%|#######8 | 34.8M/44.7M [00:00<00:00, 111MB/s]
+100%|##########| 44.7M/44.7M [00:00<00:00, 107MB/s]
</pre></div>
</div>
</div>
diff --git a/docs/how_to/compile_models/from_tensorflow.html b/docs/how_to/compile_models/from_tensorflow.html
index 3daafa322f..4eab7c3ddd 100644
--- a/docs/how_to/compile_models/from_tensorflow.html
+++ b/docs/how_to/compile_models/from_tensorflow.html
@@ -649,7 +649,7 @@ banana (score = 0.00022)
desk (score = 0.00019)
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 9.956 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 11.470 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-tensorflow-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7f1d3d1b878694c201c614c807cdebc8/from_tensorflow.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_tensorflow.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/sg_execution_times.html b/docs/how_to/compile_models/sg_execution_times.html
index ab92856e41..cfd93f3242 100644
--- a/docs/how_to/compile_models/sg_execution_times.html
+++ b/docs/how_to/compile_models/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-compile-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">ΒΆ</a></h1>
-<p><strong>05:35.607</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
+<p><strong>05:42.742</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 81%" />
@@ -349,43 +349,43 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="from_tensorflow.html#sphx-glr-how-to-compile-models-from-tensorflow-py"><span class="std std-ref">Compile Tensorflow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tensorflow.py</span></code>)</p></td>
-<td><p>01:09.956</p></td>
+<td><p>01:11.470</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_darknet.html#sphx-glr-how-to-compile-models-from-darknet-py"><span class="std std-ref">Compile YOLO-V2 and YOLO-V3 in DarkNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_darknet.py</span></code>)</p></td>
-<td><p>01:08.780</p></td>
+<td><p>01:09.985</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_paddle.html#sphx-glr-how-to-compile-models-from-paddle-py"><span class="std std-ref">Compile PaddlePaddle Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_paddle.py</span></code>)</p></td>
-<td><p>00:45.931</p></td>
+<td><p>00:47.277</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_oneflow.html#sphx-glr-how-to-compile-models-from-oneflow-py"><span class="std std-ref">Compile OneFlow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_oneflow.py</span></code>)</p></td>
-<td><p>00:30.721</p></td>
+<td><p>00:32.329</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></td>
-<td><p>00:27.368</p></td>
+<td><p>00:27.763</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_coreml.html#sphx-glr-how-to-compile-models-from-coreml-py"><span class="std std-ref">Compile CoreML Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_coreml.py</span></code>)</p></td>
-<td><p>00:25.862</p></td>
+<td><p>00:26.274</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_tflite.html#sphx-glr-how-to-compile-models-from-tflite-py"><span class="std std-ref">Compile TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tflite.py</span></code>)</p></td>
-<td><p>00:25.269</p></td>
+<td><p>00:25.260</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_pytorch.html#sphx-glr-how-to-compile-models-from-pytorch-py"><span class="std std-ref">Compile PyTorch Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_pytorch.py</span></code>)</p></td>
-<td><p>00:22.186</p></td>
+<td><p>00:22.601</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_keras.html#sphx-glr-how-to-compile-models-from-keras-py"><span class="std std-ref">Compile Keras Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_keras.py</span></code>)</p></td>
-<td><p>00:17.144</p></td>
+<td><p>00:17.300</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_onnx.html#sphx-glr-how-to-compile-models-from-onnx-py"><span class="std std-ref">Compile ONNX Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_onnx.py</span></code>)</p></td>
-<td><p>00:02.390</p></td>
+<td><p>00:02.482</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/deploy_models/deploy_model_on_adreno.html b/docs/how_to/deploy_models/deploy_model_on_adreno.html
index c0f087a55c..38ee5beef7 100644
--- a/docs/how_to/deploy_models/deploy_model_on_adreno.html
+++ b/docs/how_to/deploy_models/deploy_model_on_adreno.html
@@ -920,7 +920,7 @@ Top5 predictions:
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 2755.0304 2754.1602 2761.8323 2752.2916 2.6508
+ 2752.8046 2752.1136 2757.1940 2750.7929 2.0615
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-model-on-adreno-py">
diff --git a/docs/how_to/deploy_models/deploy_model_on_android.html b/docs/how_to/deploy_models/deploy_model_on_android.html
index 23dccf7a44..613c3e863c 100644
--- a/docs/how_to/deploy_models/deploy_model_on_android.html
+++ b/docs/how_to/deploy_models/deploy_model_on_android.html
@@ -662,7 +662,7 @@ to the remote android device.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 15.7371 15.7188 15.8776 15.6246 0.0824
+ 16.4263 16.4800 16.9027 15.8362 0.3722
</pre></div>
</div>
</div>
diff --git a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
index 52e6592ad9..f45e8c0756 100644
--- a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
+++ b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
@@ -454,26 +454,20 @@ be unstable.</p>
Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
0%| | 0.00/170M [00:00<?, ?B/s]
- 5%|4 | 8.17M/170M [00:00<00:01, 85.7MB/s]
- 10%|9 | 16.3M/170M [00:00<00:01, 84.8MB/s]
- 15%|#5 | 26.1M/170M [00:00<00:01, 92.8MB/s]
- 21%|## | 35.0M/170M [00:00<00:01, 71.8MB/s]
- 25%|##4 | 42.3M/170M [00:00<00:02, 65.6MB/s]
- 30%|##9 | 50.6M/170M [00:00<00:01, 71.6MB/s]
- 34%|###4 | 58.5M/170M [00:00<00:01, 74.8MB/s]
- 39%|###8 | 66.0M/170M [00:00<00:01, 70.9MB/s]
- 44%|####4 | 75.1M/170M [00:01<00:01, 77.9MB/s]
- 49%|####8 | 82.8M/170M [00:01<00:01, 77.4MB/s]
- 53%|#####3 | 90.3M/170M [00:01<00:01, 77.8MB/s]
- 58%|#####7 | 98.1M/170M [00:01<00:00, 78.8MB/s]
- 62%|######2 | 106M/170M [00:01<00:00, 69.1MB/s]
- 69%|######8 | 116M/170M [00:01<00:00, 80.8MB/s]
- 73%|#######3 | 124M/170M [00:01<00:00, 80.8MB/s]
- 80%|######## | 136M/170M [00:01<00:00, 76.3MB/s]
- 85%|########4 | 144M/170M [00:01<00:00, 75.3MB/s]
- 89%|########9 | 152M/170M [00:02<00:00, 72.6MB/s]
- 95%|#########5| 162M/170M [00:02<00:00, 70.8MB/s]
-100%|##########| 170M/170M [00:02<00:00, 76.5MB/s]
+ 9%|8 | 15.0M/170M [00:00<00:01, 156MB/s]
+ 18%|#7 | 29.9M/170M [00:00<00:01, 98.5MB/s]
+ 26%|##5 | 43.6M/170M [00:00<00:01, 114MB/s]
+ 33%|###2 | 55.6M/170M [00:00<00:01, 109MB/s]
+ 39%|###9 | 66.6M/170M [00:00<00:01, 98.7MB/s]
+ 45%|####5 | 76.5M/170M [00:00<00:00, 98.7MB/s]
+ 52%|#####1 | 88.0M/170M [00:00<00:00, 96.2MB/s]
+ 57%|#####7 | 97.4M/170M [00:01<00:00, 96.6MB/s]
+ 65%|######5 | 111M/170M [00:01<00:00, 109MB/s]
+ 71%|#######1 | 121M/170M [00:01<00:00, 88.2MB/s]
+ 80%|######## | 136M/170M [00:01<00:00, 94.5MB/s]
+ 88%|########8 | 150M/170M [00:01<00:00, 107MB/s]
+ 95%|#########4| 161M/170M [00:01<00:00, 98.5MB/s]
+100%|##########| 170M/170M [00:01<00:00, 103MB/s]
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/nn/functional.py:3897: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
for i in range(dim)
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/detection/anchor_utils.py:124: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode=& [...]
@@ -571,7 +565,7 @@ torchvision rcnn models.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Get 9 valid boxes
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 11.913 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 15.819 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-object-detection-pytorch-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7795da4b258c8feff986668b95ef57ad/deploy_object_detection_pytorch.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_object_detection_pytorch.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized.html b/docs/how_to/deploy_models/deploy_prequantized.html
index 1184ee02db..a5ca8b3b1f 100644
--- a/docs/how_to/deploy_models/deploy_prequantized.html
+++ b/docs/how_to/deploy_models/deploy_prequantized.html
@@ -495,8 +495,7 @@ training. Other models require a full post training calibration.</p>
Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
0%| | 0.00/13.6M [00:00<?, ?B/s]
- 59%|#####8 | 7.99M/13.6M [00:00<00:00, 60.6MB/s]
-100%|##########| 13.6M/13.6M [00:00<00:00, 82.4MB/s]
+100%|##########| 13.6M/13.6M [00:00<00:00, 203MB/s]
</pre></div>
</div>
</div>
@@ -587,7 +586,7 @@ output values are identical out of 1000 outputs from mobilenet v2.</p>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 90.2995 90.2060 92.0427 90.0098 0.2888
+ 90.3102 90.2674 92.8391 89.9773 0.2968
</pre></div>
</div>
<div class="admonition note">
@@ -626,7 +625,7 @@ This includes support for the VNNI 8 bit dot product instruction (CascadeLake or
<div class="section" id="deploy-a-quantized-tflite-model">
<h2>Deploy a quantized TFLite Model<a class="headerlink" href="#deploy-a-quantized-tflite-model" title="Permalink to this headline">ΒΆ</a></h2>
<p>TODO</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 5.862 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 6.547 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/fb8217c13f4351224c6cf3aacf1a87fc/deploy_prequantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized_tflite.html b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
index 2dc4f632ff..c303f32504 100644
--- a/docs/how_to/deploy_models/deploy_prequantized_tflite.html
+++ b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
@@ -580,7 +580,7 @@ TFLite Top-5 labels: [387 102 386 341 349]
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 120.1069 120.0560 124.1617 118.7299 0.5242
+ 121.1376 121.0263 129.5003 120.4150 0.9146
</pre></div>
</div>
<div class="admonition note">
@@ -608,7 +608,7 @@ network for ARM CPU</span></a>.</p></li>
</ul>
</div></blockquote>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 27.541 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 27.427 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-tflite-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/56691c7a27d45da61d112276334640d3/deploy_prequantized_tflite.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized_tflite.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_quantized.html b/docs/how_to/deploy_models/deploy_quantized.html
index a6e1b55e78..1fb88fe51f 100644
--- a/docs/how_to/deploy_models/deploy_quantized.html
+++ b/docs/how_to/deploy_models/deploy_quantized.html
@@ -521,7 +521,7 @@ for calibration. But the accuracy might be impacted.</p>
DeprecationWarning,
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 24.340 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 26.831 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-quantized-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7810ecf51bfc05f7d5e8a400ac3e815d/deploy_quantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_quantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
index 149817dc62..15552d0804 100644
--- a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
+++ b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
@@ -463,23 +463,22 @@ to your device.</p>
Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
0%| | 0/132723 [00:00<?, ?KB/s]
- 3%|2 | 3629/132723 [00:00<00:03, 36285.36KB/s]
- 7%|7 | 9309/132723 [00:00<00:02, 48349.68KB/s]
- 14%|#3 | 17934/132723 [00:00<00:01, 65651.45KB/s]
- 20%|## | 26629/132723 [00:00<00:01, 74056.30KB/s]
- 27%|##6 | 35236/132723 [00:00<00:01, 78385.63KB/s]
- 33%|###3 | 43816/132723 [00:00<00:01, 80902.97KB/s]
- 40%|###9 | 52512/132723 [00:00<00:00, 82881.68KB/s]
- 46%|####6 | 61183/132723 [00:00<00:00, 84099.05KB/s]
- 53%|#####2 | 69909/132723 [00:00<00:00, 85084.77KB/s]
- 59%|#####9 | 78513/132723 [00:01<00:00, 85377.54KB/s]
- 66%|######5 | 87252/132723 [00:01<00:00, 85987.55KB/s]
- 72%|#######2 | 95988/132723 [00:01<00:00, 86399.54KB/s]
- 79%|#######8 | 104732/132723 [00:01<00:00, 86711.41KB/s]
- 86%|########5 | 113481/132723 [00:01<00:00, 86944.14KB/s]
- 92%|#########2| 122282/132723 [00:01<00:00, 87263.73KB/s]
- 99%|#########8| 131009/132723 [00:01<00:00, 84488.17KB/s]
-100%|##########| 132723/132723 [00:01<00:00, 81301.66KB/s]
+ 4%|4 | 5724/132723 [00:00<00:02, 57232.55KB/s]
+ 11%|# | 14554/132723 [00:00<00:01, 75500.87KB/s]
+ 18%|#7 | 23420/132723 [00:00<00:01, 81506.20KB/s]
+ 24%|##4 | 32324/132723 [00:00<00:01, 84476.40KB/s]
+ 31%|###1 | 41247/132723 [00:00<00:01, 86187.23KB/s]
+ 38%|###7 | 50149/132723 [00:00<00:00, 87147.37KB/s]
+ 44%|####4 | 59012/132723 [00:00<00:00, 87629.41KB/s]
+ 51%|#####1 | 67897/132723 [00:00<00:00, 88014.44KB/s]
+ 58%|#####7 | 76831/132723 [00:00<00:00, 88424.08KB/s]
+ 65%|######4 | 85753/132723 [00:01<00:00, 88665.17KB/s]
+ 71%|#######1 | 94620/132723 [00:01<00:00, 88640.48KB/s]
+ 78%|#######7 | 103501/132723 [00:01<00:00, 88690.33KB/s]
+ 85%|########4 | 112371/132723 [00:01<00:00, 88673.56KB/s]
+ 91%|#########1| 121300/132723 [00:01<00:00, 88855.76KB/s]
+ 98%|#########8| 130262/132723 [00:01<00:00, 89082.90KB/s]
+100%|##########| 132723/132723 [00:01<00:00, 86750.30KB/s]
</pre></div>
</div>
<p>Create TVM runtime and do inference
@@ -518,7 +517,7 @@ Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from h
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 4.912 seconds)</p>
+<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 7.842 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-ssd-gluoncv-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/cccb17d28e5e8b2e94ea8cd5ec59f6ed/deploy_ssd_gluoncv.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_ssd_gluoncv.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/sg_execution_times.html b/docs/how_to/deploy_models/sg_execution_times.html
index dfeb3990da..6f5eb5f9af 100644
--- a/docs/how_to/deploy_models/sg_execution_times.html
+++ b/docs/how_to/deploy_models/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-deploy-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">ΒΆ</a></h1>
-<p><strong>13:33.904</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
+<p><strong>13:45.448</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 86%" />
@@ -349,39 +349,39 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_object_detection_pytorch.html#sphx-glr-how-to-deploy-models-deploy-object-detection-pytorch-py"><span class="std std-ref">Compile PyTorch Object Detection Models</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_object_detection_pytorch.py</span></code>)</p></td>
-<td><p>03:11.913</p></td>
+<td><p>03:15.819</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_ssd_gluoncv.html#sphx-glr-how-to-deploy-models-deploy-ssd-gluoncv-py"><span class="std std-ref">Deploy Single Shot Multibox Detector(SSD) model</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_ssd_gluoncv.py</span></code>)</p></td>
-<td><p>03:04.912</p></td>
+<td><p>03:07.842</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized_tflite.html#sphx-glr-how-to-deploy-models-deploy-prequantized-tflite-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite)</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized_tflite.py</span></code>)</p></td>
-<td><p>02:27.541</p></td>
+<td><p>02:27.427</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_quantized.html#sphx-glr-how-to-deploy-models-deploy-quantized-py"><span class="std std-ref">Deploy a Quantized Model on Cuda</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_quantized.py</span></code>)</p></td>
-<td><p>01:24.340</p></td>
+<td><p>01:26.831</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized.html#sphx-glr-how-to-deploy-models-deploy-prequantized-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized.py</span></code>)</p></td>
-<td><p>01:05.862</p></td>
+<td><p>01:06.547</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_adreno.html#sphx-glr-how-to-deploy-models-deploy-model-on-adreno-py"><span class="std std-ref">Deploy the Pretrained Model on Adreno</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_adreno.py</span></code>)</p></td>
-<td><p>00:53.764</p></td>
+<td><p>00:54.025</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_model_on_android.html#sphx-glr-how-to-deploy-models-deploy-model-on-android-py"><span class="std std-ref">Deploy the Pretrained Model on Android</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_android.py</span></code>)</p></td>
-<td><p>00:35.025</p></td>
+<td><p>00:35.851</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_nano.html#sphx-glr-how-to-deploy-models-deploy-model-on-nano-py"><span class="std std-ref">Deploy the Pretrained Model on Jetson Nano</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_nano.py</span></code>)</p></td>
-<td><p>00:25.473</p></td>
+<td><p>00:25.722</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_model_on_rasp.html#sphx-glr-how-to-deploy-models-deploy-model-on-rasp-py"><span class="std std-ref">Deploy the Pretrained Model on Raspberry Pi</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_rasp.py</span></code>)</p></td>
-<td><p>00:25.068</p></td>
+<td><p>00:25.379</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_sparse.html#sphx-glr-how-to-deploy-models-deploy-sparse-py"><span class="std std-ref">Deploy a Hugging Face Pruned Model on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_sparse.py</span></code>)</p></td>
diff --git a/docs/how_to/extend_tvm/bring_your_own_datatypes.html b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
index bec068108b..24387bece1 100644
--- a/docs/how_to/extend_tvm/bring_your_own_datatypes.html
+++ b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
@@ -619,7 +619,7 @@ In this alpha state of the Bring Your Own Datatypes framework, we have not imple
<span class="n">module</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#dict" title="builtins.dict" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">params</span></a> <span class="o">=</span> <span class="n">get_mobilenet</span><span class="p">()</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip08804c12-a636-4348-8548-412315d09f61 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zipc704b22f-922e-4cf7-b41b-561c0f17a593 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
</pre></div>
</div>
<p>It’s easy to execute MobileNet with native TVM:</p>
diff --git a/docs/how_to/extend_tvm/sg_execution_times.html b/docs/how_to/extend_tvm/sg_execution_times.html
index 4cfcc997b6..ea88801f93 100644
--- a/docs/how_to/extend_tvm/sg_execution_times.html
+++ b/docs/how_to/extend_tvm/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-extend-tvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:47.421</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
+<p><strong>00:47.417</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 84%" />
@@ -349,15 +349,15 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="bring_your_own_datatypes.html#sphx-glr-how-to-extend-tvm-bring-your-own-datatypes-py"><span class="std std-ref">Bring Your Own Datatypes to TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">bring_your_own_datatypes.py</span></code>)</p></td>
-<td><p>00:44.004</p></td>
+<td><p>00:43.952</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="use_pass_instrument.html#sphx-glr-how-to-extend-tvm-use-pass-instrument-py"><span class="std std-ref">How to Use TVM Pass Instrument</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_instrument.py</span></code>)</p></td>
-<td><p>00:02.383</p></td>
+<td><p>00:02.422</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="use_pass_infra.html#sphx-glr-how-to-extend-tvm-use-pass-infra-py"><span class="std std-ref">How to Use TVM Pass Infra</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_infra.py</span></code>)</p></td>
-<td><p>00:01.026</p></td>
+<td><p>00:01.037</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="low_level_custom_pass.html#sphx-glr-how-to-extend-tvm-low-level-custom-pass-py"><span class="std std-ref">Writing a Customized Pass</span></a> (<code class="docutils literal notranslate"><span class="pre">low_level_custom_pass.py</span></code>)</p></td>
diff --git a/docs/how_to/extend_tvm/use_pass_instrument.html b/docs/how_to/extend_tvm/use_pass_instrument.html
index 4cd21bcf33..ed5b291481 100644
--- a/docs/how_to/extend_tvm/use_pass_instrument.html
+++ b/docs/how_to/extend_tvm/use_pass_instrument.html
@@ -526,10 +526,10 @@ profile the execution time of each passes.</p>
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 7280us [7280us] (46.78%; 46.78%)
-FoldScaleAxis: 8283us [7us] (53.22%; 53.22%)
- FoldConstant: 8276us [1674us] (53.18%; 99.92%)
- InferType: 6602us [6602us] (42.42%; 79.77%)
+InferType: 7179us [7179us] (45.74%; 45.74%)
+FoldScaleAxis: 8517us [6us] (54.26%; 54.26%)
+ FoldConstant: 8511us [1685us] (54.22%; 99.93%)
+ InferType: 6826us [6826us] (43.49%; 80.21%)
</pre></div>
</div>
</div>
@@ -551,10 +551,10 @@ Refer to following sections and <a class="reference internal" href="../../refere
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 6620us [6620us] (44.98%; 44.98%)
-FoldScaleAxis: 8098us [5us] (55.02%; 55.02%)
- FoldConstant: 8093us [1649us] (54.99%; 99.94%)
- InferType: 6444us [6444us] (43.78%; 79.62%)
+InferType: 6620us [6620us] (45.06%; 45.06%)
+FoldScaleAxis: 8073us [5us] (54.94%; 54.94%)
+ FoldConstant: 8069us [1675us] (54.91%; 99.94%)
+ InferType: 6393us [6393us] (43.51%; 79.24%)
</pre></div>
</div>
<p>Register empty list to clear existing instruments.</p>
diff --git a/docs/how_to/optimize_operators/opt_conv_cuda.html b/docs/how_to/optimize_operators/opt_conv_cuda.html
index 92cf920bd5..db66f01a53 100644
--- a/docs/how_to/optimize_operators/opt_conv_cuda.html
+++ b/docs/how_to/optimize_operators/opt_conv_cuda.html
@@ -575,7 +575,7 @@ latency of convolution.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Convolution: </span><span class="si">%f</span><span class="s2"> ms"</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">b</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">*</span> <span cl [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 54.173694 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 54.149120 ms
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-optimize-operators-opt-conv-cuda-py">
diff --git a/docs/how_to/optimize_operators/opt_conv_tensorcore.html b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
index f522bff9ea..b6451b4ece 100644
--- a/docs/how_to/optimize_operators/opt_conv_tensorcore.html
+++ b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
@@ -915,7 +915,7 @@ be able to run on our build server</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"conv2d with tensor core: </span><span class="si">%f</span><span class="s2"> ms"</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">* [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 8.290288 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 13.364224 ms
</pre></div>
</div>
</div>
diff --git a/docs/how_to/optimize_operators/opt_gemm.html b/docs/how_to/optimize_operators/opt_gemm.html
index 88f5a23eb2..fc1ed396c1 100644
--- a/docs/how_to/optimize_operators/opt_gemm.html
+++ b/docs/how_to/optimize_operators/opt_gemm.html
@@ -472,8 +472,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
<span class="nb">print</span><span class="p">(</span><span class="s2">"Baseline: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018250
-Baseline: 3.234689
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018687
+Baseline: 3.208006
</pre></div>
</div>
<p>In TVM, we can always inspect lower level IR to debug or optimize our schedule.
@@ -532,7 +532,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt1: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.297403
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.299340
</pre></div>
</div>
<p>Here is the generated IR after blocking.</p>
@@ -598,7 +598,7 @@ vastly.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt2: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.329462
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.338976
</pre></div>
</div>
<p>Here is the generated IR after vectorization.</p>
@@ -658,7 +658,7 @@ the access pattern for A matrix is more cache friendly.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt3: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.114911
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.114522
</pre></div>
</div>
<p>Here is the generated IR after loop permutation.</p>
@@ -740,7 +740,7 @@ flattening.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt4: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.109820
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.107576
</pre></div>
</div>
<p>Here is the generated IR after array packing.</p>
@@ -825,7 +825,7 @@ write to C when all the block results are ready.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt5: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.111050
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.110665
</pre></div>
</div>
<p>Here is the generated IR after blocking.</p>
@@ -914,7 +914,7 @@ write to C when all the block results are ready.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt6: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">opt6_time</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.147391
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.146049
</pre></div>
</div>
<p>Here is the generated IR after parallelization.</p>
diff --git a/docs/how_to/optimize_operators/sg_execution_times.html b/docs/how_to/optimize_operators/sg_execution_times.html
index 6a26c6b647..30dcd9758d 100644
--- a/docs/how_to/optimize_operators/sg_execution_times.html
+++ b/docs/how_to/optimize_operators/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-optimize-operators-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:34.364</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
+<p><strong>00:34.447</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -349,15 +349,15 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="opt_gemm.html#sphx-glr-how-to-optimize-operators-opt-gemm-py"><span class="std std-ref">How to optimize GEMM on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_gemm.py</span></code>)</p></td>
-<td><p>00:31.642</p></td>
+<td><p>00:31.647</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="opt_conv_tensorcore.html#sphx-glr-how-to-optimize-operators-opt-conv-tensorcore-py"><span class="std std-ref">How to optimize convolution using TensorCores</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_tensorcore.py</span></code>)</p></td>
-<td><p>00:01.564</p></td>
+<td><p>00:01.633</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="opt_conv_cuda.html#sphx-glr-how-to-optimize-operators-opt-conv-cuda-py"><span class="std std-ref">How to optimize convolution on GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_cuda.py</span></code>)</p></td>
-<td><p>00:01.158</p></td>
+<td><p>00:01.167</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
index 66609fbbfc..c47d81d489 100644
--- a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
+++ b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-tune-with-autoscheduler-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>08:53.171</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
+<p><strong>09:04.969</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 85%" />
@@ -349,27 +349,27 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_layer_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py"><span class="std std-ref">Auto-scheduling a Convolution Layer for GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_layer_cuda.py</span></code>)</p></td>
-<td><p>05:30.470</p></td>
+<td><p>05:37.098</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_network_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-x86-py"><span class="std std-ref">Auto-scheduling a Neural Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_x86.py</span></code>)</p></td>
-<td><p>01:31.128</p></td>
+<td><p>01:32.166</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_network_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-cuda-py"><span class="std std-ref">Auto-scheduling a Neural Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_cuda.py</span></code>)</p></td>
-<td><p>01:01.205</p></td>
+<td><p>01:01.802</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_sparse_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-sparse-x86-py"><span class="std std-ref">Auto-scheduling Sparse Matrix Multiplication on CPU with Custom Sketch Rule</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_sparse_x86.py</span></code>)</p></td>
-<td><p>00:27.534</p></td>
+<td><p>00:30.348</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_network_arm.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-arm-py"><span class="std std-ref">Auto-scheduling a Neural Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_arm.py</span></code>)</p></td>
-<td><p>00:11.890</p></td>
+<td><p>00:12.257</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_network_mali.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-mali-py"><span class="std std-ref">Auto-scheduling a Neural Network for mali GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_mali.py</span></code>)</p></td>
-<td><p>00:10.945</p></td>
+<td><p>00:11.297</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
index 151f618a1f..7d8ff5d1ed 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
@@ -504,1097 +504,484 @@ cooperative fetching, unrolling and operator fusion.</p>
bias: Buffer(bias_2: Pointer(float32), float32, [1, 512, 1, 1], []),
compute: Buffer(compute_2: Pointer(float32), float32, [1, 512, 7, 7], [])}
buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute} {
- attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 16;
- allocate(conv2d_nchw: Pointer(local float32), float32, [7]), storage_scope = local;
- allocate(pad_temp.shared: Pointer(shared float32), float32, [1296]), storage_scope = shared;
- allocate(kernel.shared: Pointer(shared float32), float32, [4608]), storage_scope = shared;
- attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224 {
- conv2d_nchw_1: Buffer(conv2d_nchw, float32, [1], [], scope="local", align=4)[0] = 0f32
+ attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 28;
+ allocate(conv2d_nchw: Pointer(local float32), float32, [14]), storage_scope = local;
+ allocate(pad_temp.shared: Pointer(shared float32), float32, [72]), storage_scope = shared;
+ allocate(kernel.shared: Pointer(shared float32), float32, [3072]), storage_scope = shared;
+ attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
+ conv2d_nchw_1: Buffer(conv2d_nchw, float32, [14], [], scope="local", align=32)[0] = 0f32
conv2d_nchw_1[1] = 0f32
conv2d_nchw_1[2] = 0f32
conv2d_nchw_1[3] = 0f32
conv2d_nchw_1[4] = 0f32
conv2d_nchw_1[5] = 0f32
conv2d_nchw_1[6] = 0f32
- for (rc.outer.outer: int32, 0, 32) {
- let cse_var_2: int32 = (rc.outer.outer*784)
- let cse_var_1: int32 = (rc.outer.outer*144)
- {
- attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- pad_temp.shared_1: Buffer(pad_temp.shared, float32, [1296], [], scope="shared")[threadIdx.x_1] = @tir.if_then_else(((((9 <= floormod(threadIdx.x_1, 81)) && (floormod(threadIdx.x_1, 81) < 72)) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data_3: Buffer(data_2, float32, [25088], [])[((((cse_var_2 + (floordiv(threadIdx.x_1, 81)*49)) + (floordiv(floormod(threadIdx.x_1, 81), 9)*7)) + floormod(threadIdx.x_1, 9 [...]
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- pad_temp.shared_1[(threadIdx.x_1 + 224)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 62), 81)) && (floormod((threadIdx.x_1 + 62), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 8), 9))) && (floormod((threadIdx.x_1 + 8), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 224), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 62), 81), 9)*7)) + floormod((threadIdx.x_1 + 8), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- pad_temp.shared_1[(threadIdx.x_1 + 448)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 43), 81)) && (floormod((threadIdx.x_1 + 43), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 7), 9))) && (floormod((threadIdx.x_1 + 7), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 448), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 43), 81), 9)*7)) + floormod((threadIdx.x_1 + 7), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- pad_temp.shared_1[(threadIdx.x_1 + 672)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 24), 81)) && (floormod((threadIdx.x_1 + 24), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 6), 9))) && (floormod((threadIdx.x_1 + 6), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 672), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 24), 81), 9)*7)) + floormod((threadIdx.x_1 + 6), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- pad_temp.shared_1[(threadIdx.x_1 + 896)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 5), 81)) && (floormod((threadIdx.x_1 + 5), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 5), 9))) && (floormod((threadIdx.x_1 + 5), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 896), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 5), 81), 9)*7)) + floormod((threadIdx.x_1 + 5), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- if @tir.likely((threadIdx.x_1 < 176), dtype=bool) {
- pad_temp.shared_1[(threadIdx.x_1 + 1120)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 67), 81)) && (floormod((threadIdx.x_1 + 67), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 4), 9))) && (floormod((threadIdx.x_1 + 4), 9) < 8)), data_3[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1120), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 67), 81), 9)*7)) + floormod((threadIdx.x_1 + 4), 9)) - 8)], 0f32, dtype=float32)
+ conv2d_nchw_1[7] = 0f32
+ conv2d_nchw_1[8] = 0f32
+ conv2d_nchw_1[9] = 0f32
+ conv2d_nchw_1[10] = 0f32
+ conv2d_nchw_1[11] = 0f32
+ conv2d_nchw_1[12] = 0f32
+ conv2d_nchw_1[13] = 0f32
+ for (rc.outer.outer: int32, 0, 64) {
+ for (ry.outer.outer: int32, 0, 3) {
+ let cse_var_2: int32 = (rc.outer.outer*72)
+ let cse_var_1: int32 = (ry.outer.outer*3)
+ {
+ attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
+ if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
+ pad_temp.shared_1: Buffer(pad_temp.shared, float32, [72], [], scope="shared")[(threadIdx.x_1*4)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1*4), 9))) && (floormod((threadIdx.x_1*4), 9) < 8)), data_3: Buffer(data_2, float32, [25088], [])[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1*4), 9)*49)) + (ry.outer.out [...]
+ }
+ if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
+ pad_temp.shared_1[((threadIdx.x_1*4) + 1)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 1), 9))) && (floormod(((threadIdx.x_1*4) + 1), 9) < 8)), data_3[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 1), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 1), 9)) - 8)], [...]
+ }
+ if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
+ pad_temp.shared_1[((threadIdx.x_1*4) + 2)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 2), 9))) && (floormod(((threadIdx.x_1*4) + 2), 9) < 8)), data_3[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 2), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 2), 9)) - 8)], [...]
+ }
+ if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
+ pad_temp.shared_1[((threadIdx.x_1*4) + 3)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 3), 9))) && (floormod(((threadIdx.x_1*4) + 3), 9) < 8)), data_3[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 3), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 3), 9)) - 8)], [...]
+ }
+ }
+ attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1: Buffer(kernel.shared, float32, [3072], [], scope="shared")[threadIdx.x_2] = kernel_3: Buffer(kernel_2, float32, [2359296], [])[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 64)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 64), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 128)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 128), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 192)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 36864)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 256)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 256), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 320)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 320), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 384)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 73728)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 448)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 448), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 512)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 512), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 576)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 110592)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 640)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 640), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 704)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 704), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 768)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 147456)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 832)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 832), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 896)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 896), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 960)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 184320)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1024)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1024), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1088)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1088), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1152)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 221184)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1216)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1216), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1280)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1280), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 258048)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1408)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1408), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1472)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1472), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1536)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 294912)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1600)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1600), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1664)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1664), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1728)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 331776)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1792)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1792), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1856)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1856), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1920)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 368640)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 1984)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1984), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2048)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2048), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2112)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 405504)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2176)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2176), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2240)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2240), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2304)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 442368)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2368)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2368), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2432)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2432), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2496)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 479232)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2560)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2560), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2624)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2624), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2688)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 516096)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2752)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2752), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2816)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2816), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2880)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 552960)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 2944)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2944), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+ kernel.shared_1[(threadIdx.x_2 + 3008)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 3008), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[0]*kernel.shared_1[(threadIdx.x*48)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[1]*kernel.shared_1[(threadIdx.x*48)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[2]*kernel.shared_1[(threadIdx.x*48)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[3]*kernel.shared_1[(threadIdx.x*48)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[4]*kernel.shared_1[(threadIdx.x*48)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[5]*kernel.shared_1[(threadIdx.x*48)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[6]*kernel.shared_1[(threadIdx.x*48)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[0]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+ conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+ conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+ conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+ conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+ conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+ conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+ conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 47)]))
}
- attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1: Buffer(kernel.shared, float32, [4608], [], scope="shared")[threadIdx.x_2] = kernel_3: Buffer(kernel_2, float32, [2359296], [])[((((blockIdx.x*147456) + (floordiv(threadIdx.x_2, 144)*4608)) + cse_var_1) + floormod(threadIdx.x_2, 144))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 224)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 224), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 8), 9), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 448)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 448), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 7), 9), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 672)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 672), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 96), 144), 9)*9)) + (floormod((floordiv(threadIdx.x_2, 3) + 2), 3)*3)) + floormod(threadIdx.x_2, 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 896)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 896), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 5), 9), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 1120)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1120), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 112), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 4), 9), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1344), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 48), 144), 9)*9)) + (floormod((floordiv(threadIdx.x_2, 3) + 1), 3)*3)) + floormod(threadIdx.x_2, 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 1568)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1568), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 128), 144), 9)*9)) + floormod((threadIdx.x_2 + 2), 9))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 1792)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1792), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 144), 9)*9)) + floormod((threadIdx.x_2 + 1), 9))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 2016)] = kernel_3[(((((blockIdx.x*147456) + (floordiv(threadIdx.x_2, 144)*4608)) + cse_var_1) + floormod(threadIdx.x_2, 144)) + 64512)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 2240)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2240), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 8), 9), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 2464)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2464), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 7), 9), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 2688)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2688), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 96), 144), 9)*9)) + (floormod((floordiv(threadIdx.x_2, 3) + 2), 3)*3)) + floormod(threadIdx.x_2, 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 2912)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2912), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 5), 9), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 3136)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3136), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 112), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 4), 9), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 3360)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3360), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 48), 144), 9)*9)) + (floormod((floordiv(threadIdx.x_2, 3) + 1), 3)*3)) + floormod(threadIdx.x_2, 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 3584)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3584), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 128), 144), 9)*9)) + floormod((threadIdx.x_2 + 2), 9))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 3808)] = kernel_3[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 3808), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 144), 9)*9)) + floormod((threadIdx.x_2 + 1), 9))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 4032)] = kernel_3[(((((blockIdx.x*147456) + (floordiv(threadIdx.x_2, 144)*4608)) + cse_var_1) + floormod(threadIdx.x_2, 144)) + 129024)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- kernel.shared_1[(threadIdx.x_2 + 4256)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 4256), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 8), 9), 3)*3)) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
- if @tir.likely((threadIdx.x_2 < 128), dtype=bool) {
- kernel.shared_1[(threadIdx.x_2 + 4480)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 4480), 144)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 144), 9)*9)) + (floordiv(floormod((threadIdx.x_2 + 7), 9), 3)*3)) + floormod((threadIdx.x_2 + 1), 3))]
- }
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 7)*9)]*kernel.shared_1[(floordiv(threadIdx.x, 7)*144)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1)]*kernel.shared_1[(floordiv(threadIdx.x, 7)*144)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 2)]*kernel.shared_1[(floordiv(threadIdx.x, 7)*144)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 3)]*kernel.shared_1[(floordiv(threadIdx.x, 7)*144)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 4)]*kernel.shared_1[(floordiv(threadIdx.x, 7)*144)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 5)]*kernel.shared_1[(floordiv(threadIdx.x, 7)*144)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 6)]*kernel.shared_1[(floordiv(threadIdx.x, 7)*144)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 81)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 9)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 82)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 9)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 83)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 9)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 84)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 9)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 85)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 9)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 86)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 9)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 87)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 9)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 162)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 18)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 163)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 18)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 164)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 18)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 165)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 18)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 166)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 18)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 167)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 18)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 168)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 18)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 243)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 27)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 244)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 27)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 245)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 27)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 246)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 27)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 247)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 27)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 248)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 27)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 249)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 27)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 324)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 36)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 325)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 36)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 326)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 36)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 327)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 36)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 328)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 36)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 329)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 36)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 330)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 36)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 405)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 45)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 406)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 45)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 407)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 45)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 408)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 45)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 409)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 45)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 410)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 45)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 411)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 45)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 486)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 54)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 487)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 54)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 488)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 54)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 489)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 54)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 490)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 54)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 491)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 54)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 492)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 54)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 567)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 63)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 568)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 63)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 569)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 63)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 570)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 63)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 571)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 63)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 572)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 63)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 573)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 63)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 648)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 72)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 649)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 72)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 650)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 72)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 651)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 72)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 652)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 72)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 653)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 72)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 654)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 72)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 729)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 81)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 730)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 81)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 731)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 81)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 732)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 81)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 733)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 81)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 734)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 81)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 735)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 81)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 810)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 90)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 811)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 90)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 812)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 90)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 813)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 90)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 814)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 90)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 815)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 90)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 816)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 90)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 891)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 99)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 892)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 99)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 893)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 99)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 894)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 99)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 895)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 99)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 896)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 99)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 897)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 99)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 972)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 108)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 973)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 108)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 974)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 108)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 975)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 108)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 976)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 108)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 977)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 108)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 978)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 108)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1053)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 117)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1054)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 117)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1055)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 117)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1056)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 117)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1057)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 117)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1058)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 117)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1059)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 117)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1134)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 126)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1135)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 126)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1136)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 126)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1137)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 126)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1138)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 126)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1139)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 126)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1140)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 126)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1215)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 135)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1216)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 135)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1217)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 135)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1218)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 135)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1219)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 135)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1220)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 135)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1221)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 135)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 1)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 2)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 1)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 3)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 1)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 4)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 1)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 5)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 1)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 6)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 1)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 7)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 1)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 82)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 10)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 83)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 10)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 84)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 10)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 85)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 10)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 86)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 10)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 87)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 10)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 88)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 10)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 163)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 19)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 164)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 19)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 165)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 19)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 166)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 19)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 167)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 19)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 168)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 19)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 169)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 19)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 244)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 28)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 245)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 28)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 246)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 28)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 247)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 28)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 248)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 28)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 249)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 28)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 250)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 28)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 325)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 37)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 326)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 37)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 327)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 37)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 328)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 37)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 329)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 37)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 330)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 37)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 331)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 37)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 406)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 46)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 407)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 46)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 408)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 46)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 409)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 46)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 410)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 46)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 411)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 46)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 412)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 46)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 487)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 55)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 488)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 55)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 489)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 55)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 490)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 55)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 491)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 55)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 492)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 55)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 493)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 55)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 568)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 64)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 569)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 64)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 570)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 64)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 571)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 64)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 572)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 64)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 573)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 64)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 574)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 64)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 649)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 73)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 650)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 73)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 651)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 73)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 652)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 73)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 653)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 73)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 654)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 73)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 655)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 73)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 730)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 82)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 731)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 82)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 732)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 82)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 733)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 82)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 734)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 82)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 735)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 82)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 736)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 82)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 811)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 91)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 812)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 91)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 813)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 91)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 814)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 91)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 815)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 91)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 816)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 91)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 817)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 91)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 892)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 100)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 893)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 100)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 894)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 100)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 895)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 100)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 896)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 100)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 897)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 100)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 898)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 100)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 973)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 109)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 974)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 109)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 975)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 109)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 976)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 109)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 977)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 109)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 978)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 109)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 979)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 109)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1054)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 118)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1055)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 118)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1056)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 118)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1057)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 118)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1058)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 118)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1059)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 118)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1060)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 118)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1135)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 127)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1136)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 127)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1137)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 127)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1138)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 127)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1139)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 127)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1140)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 127)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1141)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 127)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1216)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 136)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1217)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 136)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1218)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 136)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1219)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 136)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1220)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 136)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1221)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 136)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1222)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 136)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 2)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 2)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 3)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 2)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 4)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 2)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 5)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 2)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 6)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 2)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 7)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 2)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 8)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 2)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 83)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 11)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 84)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 11)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 85)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 11)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 86)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 11)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 87)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 11)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 88)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 11)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 89)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 11)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 164)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 20)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 165)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 20)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 166)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 20)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 167)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 20)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 168)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 20)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 169)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 20)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 170)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 20)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 245)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 29)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 246)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 29)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 247)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 29)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 248)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 29)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 249)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 29)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 250)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 29)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 251)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 29)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 326)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 38)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 327)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 38)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 328)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 38)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 329)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 38)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 330)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 38)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 331)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 38)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 332)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 38)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 407)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 47)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 408)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 47)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 409)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 47)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 410)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 47)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 411)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 47)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 412)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 47)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 413)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 47)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 488)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 56)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 489)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 56)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 490)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 56)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 491)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 56)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 492)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 56)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 493)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 56)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 494)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 56)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 569)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 65)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 570)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 65)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 571)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 65)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 572)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 65)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 573)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 65)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 574)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 65)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 575)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 65)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 650)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 74)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 651)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 74)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 652)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 74)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 653)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 74)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 654)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 74)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 655)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 74)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 656)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 74)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 731)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 83)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 732)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 83)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 733)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 83)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 734)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 83)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 735)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 83)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 736)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 83)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 737)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 83)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 812)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 92)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 813)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 92)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 814)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 92)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 815)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 92)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 816)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 92)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 817)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 92)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 818)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 92)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 893)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 101)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 894)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 101)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 895)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 101)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 896)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 101)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 897)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 101)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 898)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 101)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 899)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 101)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 974)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 110)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 975)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 110)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 976)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 110)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 977)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 110)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 978)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 110)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 979)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 110)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 980)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 110)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1055)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 119)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1056)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 119)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1057)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 119)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1058)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 119)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1059)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 119)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1060)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 119)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1061)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 119)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1136)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 128)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1137)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 128)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1138)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 128)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1139)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 128)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1140)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 128)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1141)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 128)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1142)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 128)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1217)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 137)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1218)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 137)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1219)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 137)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1220)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 137)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1221)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 137)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1222)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 137)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1223)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 137)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 9)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 3)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 10)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 3)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 11)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 3)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 12)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 3)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 13)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 3)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 14)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 3)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 15)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 3)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 90)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 12)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 91)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 12)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 92)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 12)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 93)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 12)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 94)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 12)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 95)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 12)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 96)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 12)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 171)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 21)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 172)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 21)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 173)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 21)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 174)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 21)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 175)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 21)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 176)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 21)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 177)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 21)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 252)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 30)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 253)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 30)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 254)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 30)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 255)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 30)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 256)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 30)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 257)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 30)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 258)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 30)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 333)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 39)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 334)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 39)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 335)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 39)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 336)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 39)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 337)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 39)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 338)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 39)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 339)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 39)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 414)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 48)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 415)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 48)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 416)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 48)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 417)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 48)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 418)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 48)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 419)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 48)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 420)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 48)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 495)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 57)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 496)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 57)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 497)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 57)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 498)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 57)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 499)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 57)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 500)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 57)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 501)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 57)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 576)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 66)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 577)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 66)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 578)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 66)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 579)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 66)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 580)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 66)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 581)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 66)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 582)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 66)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 657)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 75)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 658)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 75)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 659)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 75)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 660)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 75)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 661)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 75)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 662)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 75)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 663)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 75)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 738)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 84)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 739)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 84)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 740)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 84)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 741)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 84)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 742)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 84)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 743)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 84)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 744)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 84)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 819)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 93)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 820)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 93)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 821)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 93)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 822)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 93)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 823)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 93)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 824)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 93)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 825)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 93)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 900)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 102)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 901)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 102)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 902)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 102)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 903)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 102)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 904)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 102)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 905)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 102)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 906)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 102)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 981)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 111)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 982)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 111)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 983)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 111)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 984)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 111)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 985)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 111)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 986)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 111)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 987)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 111)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1062)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 120)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1063)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 120)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1064)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 120)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1065)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 120)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1066)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 120)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1067)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 120)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1068)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 120)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1143)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 129)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1144)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 129)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1145)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 129)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1146)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 129)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1147)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 129)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1148)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 129)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1149)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 129)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1224)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 138)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1225)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 138)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1226)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 138)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1227)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 138)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1228)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 138)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1229)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 138)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1230)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 138)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 10)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 4)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 11)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 4)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 12)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 4)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 13)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 4)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 14)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 4)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 15)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 4)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 16)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 4)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 91)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 13)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 92)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 13)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 93)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 13)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 94)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 13)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 95)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 13)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 96)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 13)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 97)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 13)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 172)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 22)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 173)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 22)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 174)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 22)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 175)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 22)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 176)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 22)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 177)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 22)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 178)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 22)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 253)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 31)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 254)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 31)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 255)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 31)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 256)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 31)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 257)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 31)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 258)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 31)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 259)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 31)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 334)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 40)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 335)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 40)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 336)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 40)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 337)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 40)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 338)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 40)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 339)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 40)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 340)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 40)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 415)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 49)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 416)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 49)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 417)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 49)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 418)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 49)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 419)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 49)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 420)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 49)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 421)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 49)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 496)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 58)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 497)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 58)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 498)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 58)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 499)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 58)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 500)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 58)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 501)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 58)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 502)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 58)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 577)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 67)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 578)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 67)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 579)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 67)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 580)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 67)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 581)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 67)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 582)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 67)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 583)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 67)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 658)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 76)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 659)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 76)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 660)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 76)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 661)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 76)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 662)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 76)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 663)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 76)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 664)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 76)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 739)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 85)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 740)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 85)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 741)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 85)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 742)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 85)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 743)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 85)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 744)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 85)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 745)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 85)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 820)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 94)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 821)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 94)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 822)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 94)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 823)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 94)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 824)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 94)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 825)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 94)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 826)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 94)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 901)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 103)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 902)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 103)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 903)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 103)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 904)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 103)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 905)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 103)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 906)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 103)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 907)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 103)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 982)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 112)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 983)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 112)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 984)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 112)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 985)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 112)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 986)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 112)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 987)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 112)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 988)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 112)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1063)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 121)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1064)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 121)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1065)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 121)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1066)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 121)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1067)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 121)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1068)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 121)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1069)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 121)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1144)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 130)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1145)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 130)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1146)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 130)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1147)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 130)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1148)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 130)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1149)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 130)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1150)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 130)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1225)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 139)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1226)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 139)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1227)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 139)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1228)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 139)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1229)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 139)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1230)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 139)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1231)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 139)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 11)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 5)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 12)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 5)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 13)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 5)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 14)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 5)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 15)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 5)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 16)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 5)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 17)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 5)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 92)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 14)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 93)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 14)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 94)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 14)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 95)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 14)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 96)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 14)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 97)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 14)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 98)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 14)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 173)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 23)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 174)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 23)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 175)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 23)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 176)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 23)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 177)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 23)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 178)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 23)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 179)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 23)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 254)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 32)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 255)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 32)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 256)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 32)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 257)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 32)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 258)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 32)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 259)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 32)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 260)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 32)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 335)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 41)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 336)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 41)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 337)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 41)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 338)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 41)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 339)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 41)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 340)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 41)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 341)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 41)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 416)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 50)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 417)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 50)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 418)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 50)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 419)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 50)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 420)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 50)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 421)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 50)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 422)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 50)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 497)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 59)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 498)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 59)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 499)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 59)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 500)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 59)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 501)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 59)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 502)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 59)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 503)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 59)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 578)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 68)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 579)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 68)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 580)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 68)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 581)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 68)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 582)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 68)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 583)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 68)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 584)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 68)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 659)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 77)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 660)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 77)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 661)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 77)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 662)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 77)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 663)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 77)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 664)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 77)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 665)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 77)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 740)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 86)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 741)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 86)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 742)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 86)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 743)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 86)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 744)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 86)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 745)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 86)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 746)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 86)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 821)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 95)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 822)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 95)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 823)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 95)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 824)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 95)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 825)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 95)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 826)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 95)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 827)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 95)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 902)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 104)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 903)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 104)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 904)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 104)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 905)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 104)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 906)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 104)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 907)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 104)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 908)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 104)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 983)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 113)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 984)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 113)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 985)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 113)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 986)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 113)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 987)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 113)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 988)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 113)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 989)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 113)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1064)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 122)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1065)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 122)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1066)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 122)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1067)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 122)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1068)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 122)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1069)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 122)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1070)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 122)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1145)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 131)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1146)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 131)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1147)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 131)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1148)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 131)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1149)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 131)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1150)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 131)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1151)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 131)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1226)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 140)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1227)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 140)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1228)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 140)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1229)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 140)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1230)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 140)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1231)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 140)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1232)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 140)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 18)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 6)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 19)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 6)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 20)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 6)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 21)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 6)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 22)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 6)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 23)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 6)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 24)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 6)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 99)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 15)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 100)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 15)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 101)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 15)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 102)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 15)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 103)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 15)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 104)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 15)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 105)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 15)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 180)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 24)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 181)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 24)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 182)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 24)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 183)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 24)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 184)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 24)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 185)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 24)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 186)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 24)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 261)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 33)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 262)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 33)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 263)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 33)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 264)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 33)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 265)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 33)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 266)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 33)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 267)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 33)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 342)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 42)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 343)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 42)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 344)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 42)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 345)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 42)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 346)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 42)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 347)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 42)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 348)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 42)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 423)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 51)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 424)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 51)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 425)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 51)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 426)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 51)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 427)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 51)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 428)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 51)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 429)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 51)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 504)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 60)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 505)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 60)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 506)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 60)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 507)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 60)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 508)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 60)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 509)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 60)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 510)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 60)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 585)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 69)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 586)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 69)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 587)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 69)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 588)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 69)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 589)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 69)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 590)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 69)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 591)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 69)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 666)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 78)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 667)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 78)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 668)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 78)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 669)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 78)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 670)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 78)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 671)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 78)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 672)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 78)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 747)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 87)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 748)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 87)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 749)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 87)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 750)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 87)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 751)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 87)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 752)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 87)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 753)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 87)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 828)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 96)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 829)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 96)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 830)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 96)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 831)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 96)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 832)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 96)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 833)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 96)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 834)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 96)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 909)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 105)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 910)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 105)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 911)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 105)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 912)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 105)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 913)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 105)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 914)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 105)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 915)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 105)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 990)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 114)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 991)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 114)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 992)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 114)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 993)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 114)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 994)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 114)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 995)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 114)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 996)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 114)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1071)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 123)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1072)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 123)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1073)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 123)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1074)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 123)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1075)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 123)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1076)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 123)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1077)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 123)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1152)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 132)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1153)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 132)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1154)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 132)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1155)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 132)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1156)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 132)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1157)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 132)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1158)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 132)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1233)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 141)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1234)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 141)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1235)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 141)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1236)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 141)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1237)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 141)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1238)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 141)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1239)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 141)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 19)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 7)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 20)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 7)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 21)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 7)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 22)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 7)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 23)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 7)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 24)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 7)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 25)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 7)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 100)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 16)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 101)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 16)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 102)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 16)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 103)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 16)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 104)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 16)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 105)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 16)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 106)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 16)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 181)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 25)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 182)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 25)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 183)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 25)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 184)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 25)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 185)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 25)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 186)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 25)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 187)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 25)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 262)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 34)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 263)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 34)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 264)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 34)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 265)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 34)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 266)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 34)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 267)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 34)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 268)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 34)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 343)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 43)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 344)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 43)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 345)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 43)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 346)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 43)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 347)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 43)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 348)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 43)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 349)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 43)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 424)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 52)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 425)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 52)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 426)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 52)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 427)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 52)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 428)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 52)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 429)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 52)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 430)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 52)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 505)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 61)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 506)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 61)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 507)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 61)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 508)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 61)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 509)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 61)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 510)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 61)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 511)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 61)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 586)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 70)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 587)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 70)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 588)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 70)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 589)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 70)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 590)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 70)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 591)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 70)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 592)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 70)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 667)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 79)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 668)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 79)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 669)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 79)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 670)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 79)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 671)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 79)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 672)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 79)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 673)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 79)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 748)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 88)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 749)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 88)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 750)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 88)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 751)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 88)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 752)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 88)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 753)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 88)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 754)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 88)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 829)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 97)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 830)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 97)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 831)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 97)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 832)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 97)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 833)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 97)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 834)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 97)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 835)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 97)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 910)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 106)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 911)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 106)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 912)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 106)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 913)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 106)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 914)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 106)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 915)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 106)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 916)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 106)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 991)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 115)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 992)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 115)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 993)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 115)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 994)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 115)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 995)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 115)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 996)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 115)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 997)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 115)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1072)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 124)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1073)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 124)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1074)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 124)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1075)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 124)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1076)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 124)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1077)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 124)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1078)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 124)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1153)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 133)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1154)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 133)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1155)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 133)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1156)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 133)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1157)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 133)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1158)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 133)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1159)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 133)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1234)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 142)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1235)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 142)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1236)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 142)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1237)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 142)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1238)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 142)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1239)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 142)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1240)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 142)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 20)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 8)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 21)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 8)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 22)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 8)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 23)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 8)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 24)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 8)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 25)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 8)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 26)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 8)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 101)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 17)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 102)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 17)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 103)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 17)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 104)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 17)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 105)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 17)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 106)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 17)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 107)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 17)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 182)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 26)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 183)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 26)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 184)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 26)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 185)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 26)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 186)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 26)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 187)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 26)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 188)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 26)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 263)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 35)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 264)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 35)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 265)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 35)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 266)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 35)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 267)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 35)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 268)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 35)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 269)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 35)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 344)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 44)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 345)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 44)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 346)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 44)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 347)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 44)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 348)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 44)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 349)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 44)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 350)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 44)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 425)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 53)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 426)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 53)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 427)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 53)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 428)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 53)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 429)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 53)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 430)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 53)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 431)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 53)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 506)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 62)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 507)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 62)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 508)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 62)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 509)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 62)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 510)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 62)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 511)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 62)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 512)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 62)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 587)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 71)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 588)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 71)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 589)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 71)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 590)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 71)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 591)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 71)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 592)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 71)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 593)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 71)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 668)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 80)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 669)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 80)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 670)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 80)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 671)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 80)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 672)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 80)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 673)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 80)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 674)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 80)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 749)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 89)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 750)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 89)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 751)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 89)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 752)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 89)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 753)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 89)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 754)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 89)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 755)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 89)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 830)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 98)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 831)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 98)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 832)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 98)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 833)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 98)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 834)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 98)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 835)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 98)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 836)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 98)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 911)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 107)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 912)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 107)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 913)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 107)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 914)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 107)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 915)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 107)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 916)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 107)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 917)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 107)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 992)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 116)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 993)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 116)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 994)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 116)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 995)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 116)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 996)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 116)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 997)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 116)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 998)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 116)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1073)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 125)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1074)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 125)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1075)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 125)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1076)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 125)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1077)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 125)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1078)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 125)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1079)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 125)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1154)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 134)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1155)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 134)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1156)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 134)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1157)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 134)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1158)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 134)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1159)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 134)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1160)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 134)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1235)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 143)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1236)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 143)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1237)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 143)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1238)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 143)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1239)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 143)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1240)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 143)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[((floormod(threadIdx.x, 7)*9) + 1241)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*144) + 143)]))
}
}
- compute_3: Buffer(compute_2, float32, [25088], [])[((blockIdx.x*1568) + (threadIdx.x*7))] = max((conv2d_nchw_1[0] + bias_3: Buffer(bias_2, float32, [512], [])[((blockIdx.x*32) + floordiv(threadIdx.x, 7))]), 0f32)
- compute_3[(((blockIdx.x*1568) + (threadIdx.x*7)) + 1)] = max((conv2d_nchw_1[1] + bias_3[((blockIdx.x*32) + floordiv(threadIdx.x, 7))]), 0f32)
- compute_3[(((blockIdx.x*1568) + (threadIdx.x*7)) + 2)] = max((conv2d_nchw_1[2] + bias_3[((blockIdx.x*32) + floordiv(threadIdx.x, 7))]), 0f32)
- compute_3[(((blockIdx.x*1568) + (threadIdx.x*7)) + 3)] = max((conv2d_nchw_1[3] + bias_3[((blockIdx.x*32) + floordiv(threadIdx.x, 7))]), 0f32)
- compute_3[(((blockIdx.x*1568) + (threadIdx.x*7)) + 4)] = max((conv2d_nchw_1[4] + bias_3[((blockIdx.x*32) + floordiv(threadIdx.x, 7))]), 0f32)
- compute_3[(((blockIdx.x*1568) + (threadIdx.x*7)) + 5)] = max((conv2d_nchw_1[5] + bias_3[((blockIdx.x*32) + floordiv(threadIdx.x, 7))]), 0f32)
- compute_3[(((blockIdx.x*1568) + (threadIdx.x*7)) + 6)] = max((conv2d_nchw_1[6] + bias_3[((blockIdx.x*32) + floordiv(threadIdx.x, 7))]), 0f32)
+ for (i1.inner: int32, 0, 2) {
+ for (i3.inner: int32, 0, 7) {
+ compute_3: Buffer(compute_2, float32, [25088], [])[(((((floordiv(blockIdx.x, 7)*6272) + (threadIdx.x*98)) + (i1.inner*49)) + (floormod(blockIdx.x, 7)*7)) + i3.inner)] = max((conv2d_nchw_1[((i1.inner*7) + i3.inner)] + bias_3: Buffer(bias_2, float32, [512], [])[(((floordiv(blockIdx.x, 7)*128) + (threadIdx.x*2)) + i1.inner)]), 0f32)
+ }
+ }
}
}
</pre></div>
@@ -1630,7 +1017,7 @@ cooperative fetching, unrolling and operator fusion.</p>
<span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.225 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.362 ms
</pre></div>
</div>
</div>
@@ -1660,36 +1047,36 @@ conv2d_nchw_nn_o_o_i, conv2d_nchw_nn_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o
conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1)
conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
-conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
-conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=32)
+conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2)
+conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=64)
conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
-conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7)
+conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
-conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
+conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
-conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=7)
-conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=16)
-conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=1)
+conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
+conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
+conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=4)
conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
-conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=3)
+conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=3)
s[conv2d_nchw].reorder(conv2d_nchw_nn_o_o_o_o, conv2d_nchw_ff_o_o_o_o, conv2d_nchw_yy_o_o_o_o, conv2d_nchw_xx_o_o_o_o, conv2d_nchw_nn_o_o_o_i, conv2d_nchw_ff_o_o_o_i, conv2d_nchw_yy_o_o_o_i, conv2d_nchw_xx_o_o_o_i, conv2d_nchw_nn_o_o_i, conv2d_nchw_ff_o_o_i, conv2d_nchw_yy_o_o_i, conv2d_nchw_xx_o_o_i, conv2d_nchw_rc_o_o, conv2d_nchw_ry_o_o, conv2d_nchw_rx_o_o, conv2d_nchw_rc_o_i, conv2d_nchw_ry_o_i, conv2d_nchw_rx_o_i, conv2d_nchw_nn_o_i, conv2d_nchw_ff_o_i, conv2d_nchw_yy_o_i, conv2d_nc [...]
compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
-compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=1)
-compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=32)
+compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
+compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=64)
compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
-compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7)
+compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
-compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
+compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
-compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=7)
+compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
kernel_shared = s.cache_read(kernel, "shared", [conv2d_nchw])
@@ -1708,14 +1095,14 @@ s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=224)
+kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=224)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
-s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 1024)
+s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 512)
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
CUDA source code:
@@ -1733,10 +1120,10 @@ CUDA source code:
#define int64_t long long
#define uint64_t unsigned long long
#endif
-extern "C" __global__ void __launch_bounds__(224) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
- float conv2d_nchw[7];
- __shared__ float pad_temp_shared[1296];
- __shared__ float kernel_shared[4608];
+extern "C" __global__ void __launch_bounds__(64) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+ float conv2d_nchw[14];
+ __shared__ float pad_temp_shared[72];
+ __shared__ float kernel_shared[3072];
conv2d_nchw[0] = 0.000000e+00f;
conv2d_nchw[1] = 0.000000e+00f;
conv2d_nchw[2] = 0.000000e+00f;
@@ -1744,1056 +1131,420 @@ extern "C" __global__ void __launch_bounds__(224) default_function_ker
conv2d_nchw[4] = 0.000000e+00f;
conv2d_nchw[5] = 0.000000e+00f;
conv2d_nchw[6] = 0.000000e+00f;
- for (int rc_outer_outer = 0; rc_outer_outer < 32; ++rc_outer_outer) {
- __syncthreads();
- pad_temp_shared[((int)threadIdx.x)] = (((((9 <= (((int)threadIdx.x) % 81)) && ((((int)threadIdx.x) % 81) < 72)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((int)threadIdx.x) / 81) * 49)) + (((((int)threadIdx.x) % 81) / 9) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 224)] = (((((9 <= ((((int)threadIdx.x) + 62) % 81)) && (((((int)threadIdx.x) + 62) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 224) / 81) * 49)) + ((((((int)threadIdx.x) + 62) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 448)] = (((((9 <= ((((int)threadIdx.x) + 43) % 81)) && (((((int)threadIdx.x) + 43) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 448) / 81) * 49)) + ((((((int)threadIdx.x) + 43) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 672)] = (((((9 <= ((((int)threadIdx.x) + 24) % 81)) && (((((int)threadIdx.x) + 24) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 672) / 81) * 49)) + ((((((int)threadIdx.x) + 24) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 896)] = (((((9 <= ((((int)threadIdx.x) + 5) % 81)) && (((((int)threadIdx.x) + 5) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 5) % 9))) && (((((int)threadIdx.x) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 896) / 81) * 49)) + ((((((int)threadIdx.x) + 5) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
- if (((int)threadIdx.x) < 176) {
- pad_temp_shared[(((int)threadIdx.x) + 1120)] = (((((9 <= ((((int)threadIdx.x) + 67) % 81)) && (((((int)threadIdx.x) + 67) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1120) / 81) * 49)) + ((((((int)threadIdx.x) + 67) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
+ conv2d_nchw[7] = 0.000000e+00f;
+ conv2d_nchw[8] = 0.000000e+00f;
+ conv2d_nchw[9] = 0.000000e+00f;
+ conv2d_nchw[10] = 0.000000e+00f;
+ conv2d_nchw[11] = 0.000000e+00f;
+ conv2d_nchw[12] = 0.000000e+00f;
+ conv2d_nchw[13] = 0.000000e+00f;
+ for (int rc_outer_outer = 0; rc_outer_outer < 64; ++rc_outer_outer) {
+ for (int ry_outer_outer = 0; ry_outer_outer < 3; ++ry_outer_outer) {
+ __syncthreads();
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[(((int)threadIdx.x) * 4)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) * 4) % 9))) && (((((int)threadIdx.x) * 4) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) * 4) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) * 4) % 9)) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 4) + 1)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 1) % 9))) && ((((((int)threadIdx.x) * 4) + 1) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 1) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 1) % 9)) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 4) + 2)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 2) % 9))) && ((((((int)threadIdx.x) * 4) + 2) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 2) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 2) % 9)) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 4) + 3)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 3) % 9))) && ((((((int)threadIdx.x) * 4) + 3) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 3) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 3) % 9)) - 8)] : 0.000000e+00f);
+ }
+ kernel_shared[((int)threadIdx.x)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 64)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 64) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 128)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 128) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 192)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 36864)];
+ kernel_shared[(((int)threadIdx.x) + 256)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 256) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 320)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 320) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 384)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 73728)];
+ kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 448) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 512)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 512) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 576)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 110592)];
+ kernel_shared[(((int)threadIdx.x) + 640)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 640) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 704)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 704) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 768)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 147456)];
+ kernel_shared[(((int)threadIdx.x) + 832)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 832) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 896) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 960)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 184320)];
+ kernel_shared[(((int)threadIdx.x) + 1024)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1024) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1088)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1088) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1152)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 221184)];
+ kernel_shared[(((int)threadIdx.x) + 1216)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1216) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1280)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1280) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 258048)];
+ kernel_shared[(((int)threadIdx.x) + 1408)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1408) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1472)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1472) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1536)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 294912)];
+ kernel_shared[(((int)threadIdx.x) + 1600)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1600) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1664)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1664) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1728)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 331776)];
+ kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1792) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1856)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1856) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 1920)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 368640)];
+ kernel_shared[(((int)threadIdx.x) + 1984)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1984) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2048)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2048) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2112)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 405504)];
+ kernel_shared[(((int)threadIdx.x) + 2176)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2176) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2240) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2304)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 442368)];
+ kernel_shared[(((int)threadIdx.x) + 2368)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2368) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2432)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2432) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2496)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 479232)];
+ kernel_shared[(((int)threadIdx.x) + 2560)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2560) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2624)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2624) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 516096)];
+ kernel_shared[(((int)threadIdx.x) + 2752)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2752) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2816)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2816) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 2880)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 552960)];
+ kernel_shared[(((int)threadIdx.x) + 2944)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2944) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ kernel_shared[(((int)threadIdx.x) + 3008)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3008) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+ __syncthreads();
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[0] * kernel_shared[(((int)threadIdx.x) * 48)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[1] * kernel_shared[(((int)threadIdx.x) * 48)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[2] * kernel_shared[(((int)threadIdx.x) * 48)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[3] * kernel_shared[(((int)threadIdx.x) * 48)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[4] * kernel_shared[(((int)threadIdx.x) * 48)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[5] * kernel_shared[(((int)threadIdx.x) * 48)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[6] * kernel_shared[(((int)threadIdx.x) * 48)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[0] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
}
- kernel_shared[((int)threadIdx.x)] = kernel[((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 144) * 4608)) + (rc_outer_outer * 144)) + (((int)threadIdx.x) % 144))];
- kernel_shared[(((int)threadIdx.x) + 224)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 224) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 80) % 144) / 9) * 9)) + ((((((int)threadIdx.x) + 8) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 448)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 448) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 16) % 144) / 9) * 9)) + ((((((int)threadIdx.x) + 7) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 672)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 672) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 96) % 144) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 2) % 3) * 3)) + (((int)threadIdx.x) % 3))];
- kernel_shared[(((int)threadIdx.x) + 896)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 896) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 32) % 144) / 9) * 9)) + ((((((int)threadIdx.x) + 5) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1120)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1120) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 112) % 144) / 9) * 9)) + ((((((int)threadIdx.x) + 4) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1344) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 48) % 144) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 1) % 3) * 3)) + (((int)threadIdx.x) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1568)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1568) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 128) % 144) / 9) * 9)) + ((((int)threadIdx.x) + 2) % 9))];
- kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1792) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 64) % 144) / 9) * 9)) + ((((int)threadIdx.x) + 1) % 9))];
- kernel_shared[(((int)threadIdx.x) + 2016)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 144) * 4608)) + (rc_outer_outer * 144)) + (((int)threadIdx.x) % 144)) + 64512)];
- kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2240) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 80) % 144) / 9) * 9)) + ((((((int)threadIdx.x) + 8) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2464)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2464) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 16) % 144) / 9) * 9)) + ((((((int)threadIdx.x) + 7) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2688) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 96) % 144) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 2) % 3) * 3)) + (((int)threadIdx.x) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2912)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2912) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 32) % 144) / 9) * 9)) + ((((((int)threadIdx.x) + 5) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 3136)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3136) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 112) % 144) / 9) * 9)) + ((((((int)threadIdx.x) + 4) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 3360)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3360) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 48) % 144) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 1) % 3) * 3)) + (((int)threadIdx.x) % 3))];
- kernel_shared[(((int)threadIdx.x) + 3584)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3584) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 128) % 144) / 9) * 9)) + ((((int)threadIdx.x) + 2) % 9))];
- kernel_shared[(((int)threadIdx.x) + 3808)] = kernel[(((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 3808) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 64) % 144) / 9) * 9)) + ((((int)threadIdx.x) + 1) % 9))];
- kernel_shared[(((int)threadIdx.x) + 4032)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 144) * 4608)) + (rc_outer_outer * 144)) + (((int)threadIdx.x) % 144)) + 129024)];
- kernel_shared[(((int)threadIdx.x) + 4256)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 4256) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 80) % 144) / 9) * 9)) + ((((((int)threadIdx.x) + 8) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- if (((int)threadIdx.x) < 128) {
- kernel_shared[(((int)threadIdx.x) + 4480)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 4480) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) / 9) * 9)) + ((((((int)threadIdx.x) + 7) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+ }
+ for (int i1_inner = 0; i1_inner < 2; ++i1_inner) {
+ for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
+ compute[((((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 98)) + (i1_inner * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[((i1_inner * 7) + i3_inner)] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) * 2)) + i1_inner)]), 0.000000e+00f);
}
- __syncthreads();
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) * 9)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 2)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 3)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 4)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 5)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 6)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 81)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 82)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 83)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 84)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 85)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 86)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 87)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 162)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 163)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 164)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 165)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 166)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 167)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 168)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 243)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 244)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 245)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 246)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 247)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 248)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 249)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 324)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 325)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 326)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 327)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 328)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 329)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 330)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 405)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 406)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 407)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 408)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 409)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 410)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 411)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 486)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 487)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 488)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 489)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 490)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 491)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 492)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 567)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 568)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 569)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 570)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 571)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 572)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 573)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 648)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 649)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 650)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 651)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 652)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 653)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 654)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 729)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 730)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 731)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 732)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 733)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 734)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 735)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 810)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 811)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 812)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 813)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 814)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 815)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 816)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 891)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 892)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 893)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 894)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 895)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 896)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 897)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 972)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 973)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 974)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 975)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 976)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 977)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 978)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1053)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1054)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1055)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1056)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1057)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1058)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1059)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1134)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1135)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1138)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1139)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1140)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1215)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1216)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1217)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1218)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1219)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1220)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1221)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 4)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 5)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 6)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 7)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 82)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 83)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 84)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 85)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 86)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 87)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 88)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 163)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 164)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 165)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 166)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 167)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 168)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 169)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 244)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 245)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 246)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 247)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 248)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 249)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 250)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 325)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 326)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 327)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 328)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 329)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 330)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 331)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 406)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 407)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 408)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 409)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 410)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 411)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 412)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 487)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 488)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 489)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 490)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 491)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 492)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 493)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 568)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 569)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 570)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 571)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 572)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 573)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 574)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 649)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 650)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 651)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 652)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 653)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 654)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 655)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 730)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 731)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 732)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 733)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 734)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 735)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 736)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 811)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 812)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 813)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 814)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 815)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 816)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 817)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 892)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 893)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 894)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 895)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 896)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 897)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 898)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 973)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 974)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 975)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 976)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 977)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 978)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 979)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1054)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1055)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1056)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1057)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1058)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1059)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1060)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1135)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1138)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1139)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1140)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1141)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1216)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1217)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1218)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1219)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1220)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1221)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1222)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 4)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 5)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 6)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 7)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 8)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 83)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 84)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 85)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 86)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 87)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 88)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 89)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 164)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 165)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 166)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 167)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 168)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 169)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 170)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 245)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 246)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 247)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 248)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 249)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 250)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 251)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 326)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 327)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 328)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 329)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 330)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 331)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 332)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 407)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 408)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 409)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 410)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 411)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 412)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 413)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 488)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 489)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 490)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 491)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 492)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 493)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 494)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 569)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 570)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 571)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 572)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 573)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 574)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 575)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 650)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 651)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 652)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 653)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 654)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 655)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 656)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 731)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 732)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 733)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 734)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 735)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 736)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 737)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 812)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 813)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 814)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 815)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 816)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 817)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 818)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 893)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 894)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 895)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 896)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 897)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 898)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 899)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 974)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 975)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 976)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 977)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 978)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 979)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 980)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1055)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1056)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1057)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1058)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1059)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1060)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1061)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1138)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1139)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1140)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1141)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1142)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1217)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1218)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1219)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1220)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1221)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1222)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1223)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 9)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 10)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 11)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 12)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 13)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 14)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 15)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 90)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 91)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 92)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 93)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 94)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 95)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 96)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 171)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 172)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 173)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 174)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 175)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 176)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 177)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 252)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 253)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 254)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 255)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 256)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 257)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 258)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 333)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 334)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 335)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 336)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 337)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 338)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 339)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 414)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 415)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 416)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 417)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 418)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 419)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 420)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 495)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 496)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 497)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 498)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 499)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 500)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 501)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 576)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 577)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 578)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 579)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 580)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 581)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 582)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 657)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 658)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 659)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 660)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 661)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 662)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 663)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 738)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 739)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 740)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 741)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 742)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 743)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 744)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 819)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 820)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 821)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 822)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 823)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 824)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 825)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 900)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 901)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 902)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 903)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 904)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 905)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 906)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 981)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 982)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 983)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 984)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 985)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 986)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 987)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1062)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1063)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1064)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1065)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1066)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1067)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1068)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1143)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1144)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1145)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1146)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1147)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1148)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1149)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1224)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1225)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1226)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1227)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1228)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1229)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1230)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 10)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 11)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 12)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 13)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 14)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 15)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 16)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 91)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 92)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 93)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 94)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 95)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 96)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 97)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 172)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 173)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 174)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 175)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 176)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 177)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 178)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 253)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 254)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 255)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 256)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 257)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 258)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 259)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 334)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 335)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 336)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 337)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 338)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 339)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 340)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 415)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 416)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 417)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 418)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 419)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 420)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 421)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 496)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 497)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 498)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 499)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 500)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 501)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 502)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 577)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 578)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 579)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 580)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 581)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 582)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 583)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 658)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 659)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 660)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 661)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 662)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 663)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 664)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 739)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 740)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 741)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 742)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 743)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 744)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 745)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 820)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 821)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 822)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 823)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 824)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 825)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 826)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 901)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 902)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 903)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 904)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 905)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 906)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 907)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 982)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 983)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 984)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 985)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 986)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 987)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 988)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1063)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1064)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1065)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1066)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1067)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1068)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1069)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1144)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1145)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1146)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1147)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1148)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1149)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1150)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1225)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1226)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1227)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1228)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1229)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1230)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1231)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 11)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 12)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 13)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 14)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 15)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 16)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 17)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 92)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 93)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 94)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 95)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 96)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 97)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 98)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 173)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 174)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 175)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 176)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 177)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 178)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 179)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 254)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 255)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 256)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 257)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 258)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 259)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 260)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 335)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 336)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 337)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 338)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 339)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 340)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 341)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 416)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 417)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 418)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 419)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 420)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 421)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 422)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 497)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 498)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 499)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 500)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 501)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 502)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 503)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 578)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 579)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 580)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 581)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 582)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 583)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 584)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 659)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 660)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 661)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 662)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 663)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 664)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 665)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 740)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 741)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 742)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 743)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 744)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 745)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 746)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 821)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 822)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 823)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 824)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 825)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 826)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 827)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 902)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 903)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 904)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 905)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 906)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 907)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 908)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 983)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 984)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 985)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 986)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 987)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 988)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 989)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1064)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1065)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1066)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1067)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1068)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1069)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1070)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1145)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1146)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1147)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1148)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1149)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1150)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1151)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1226)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1227)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1228)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1229)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1230)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1231)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1232)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 18)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 19)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 20)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 21)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 22)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 23)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 24)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 99)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 100)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 101)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 102)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 103)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 104)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 105)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 180)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 181)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 182)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 183)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 184)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 185)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 186)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 261)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 262)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 263)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 264)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 265)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 266)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 267)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 342)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 343)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 344)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 345)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 346)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 347)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 348)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 423)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 424)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 425)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 426)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 427)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 428)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 429)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 504)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 505)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 506)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 507)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 508)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 509)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 510)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 585)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 586)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 587)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 588)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 589)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 590)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 591)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 666)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 667)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 668)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 669)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 670)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 671)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 672)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 747)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 748)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 749)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 750)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 751)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 752)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 753)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 828)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 829)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 830)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 831)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 832)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 833)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 834)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 909)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 910)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 911)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 912)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 913)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 914)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 915)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 990)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 991)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 992)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 993)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 994)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 995)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 996)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1071)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1072)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1073)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1074)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1075)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1076)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1077)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1152)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1153)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1154)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1155)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1156)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1157)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1158)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1233)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1234)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1235)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1236)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1237)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1238)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1239)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 19)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 20)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 21)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 22)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 23)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 24)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 25)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 100)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 101)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 102)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 103)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 104)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 105)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 106)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 181)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 182)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 183)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 184)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 185)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 186)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 187)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 262)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 263)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 264)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 265)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 266)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 267)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 268)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 343)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 344)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 345)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 346)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 347)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 348)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 349)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 424)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 425)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 426)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 427)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 428)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 429)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 430)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 505)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 506)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 507)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 508)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 509)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 510)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 511)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 586)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 587)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 588)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 589)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 590)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 591)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 592)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 667)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 668)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 669)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 670)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 671)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 672)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 673)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 748)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 749)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 750)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 751)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 752)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 753)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 754)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 829)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 830)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 831)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 832)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 833)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 834)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 835)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 910)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 911)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 912)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 913)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 914)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 915)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 916)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 991)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 992)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 993)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 994)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 995)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 996)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 997)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1072)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1073)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1074)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1075)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1076)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1077)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1078)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1153)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1154)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1155)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1156)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1157)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1158)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1159)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1234)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1235)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1236)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1237)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1238)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1239)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1240)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 20)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 21)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 22)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 23)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 24)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 25)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 26)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 101)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 102)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 103)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 104)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 105)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 106)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 107)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 182)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 183)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 184)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 185)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 186)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 187)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 188)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 263)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 264)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 265)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 266)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 267)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 268)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 269)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 344)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 345)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 346)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 347)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 348)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 349)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 350)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 425)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 426)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 427)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 428)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 429)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 430)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 431)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 506)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 507)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 508)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 509)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 510)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 511)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 512)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 587)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 588)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 589)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 590)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 591)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 592)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 593)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 668)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 669)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 670)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 671)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 672)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 673)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 674)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 749)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 750)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 751)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 752)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 753)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 754)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 755)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 830)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 831)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 832)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 833)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 834)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 835)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 836)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 911)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 912)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 913)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 914)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 915)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 916)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 917)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 992)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 993)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 994)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 995)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 996)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 997)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 998)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1073)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1074)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1075)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1076)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1077)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1078)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1079)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1154)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1155)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1156)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1157)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1158)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1159)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1160)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1235)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1236)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1237)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1238)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1239)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1240)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 9) + 1241)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
}
- compute[((((int)blockIdx.x) * 1568) + (((int)threadIdx.x) * 7))] = max((conv2d_nchw[0] + bias[((((int)blockIdx.x) * 32) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
- compute[(((((int)blockIdx.x) * 1568) + (((int)threadIdx.x) * 7)) + 1)] = max((conv2d_nchw[1] + bias[((((int)blockIdx.x) * 32) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
- compute[(((((int)blockIdx.x) * 1568) + (((int)threadIdx.x) * 7)) + 2)] = max((conv2d_nchw[2] + bias[((((int)blockIdx.x) * 32) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
- compute[(((((int)blockIdx.x) * 1568) + (((int)threadIdx.x) * 7)) + 3)] = max((conv2d_nchw[3] + bias[((((int)blockIdx.x) * 32) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
- compute[(((((int)blockIdx.x) * 1568) + (((int)threadIdx.x) * 7)) + 4)] = max((conv2d_nchw[4] + bias[((((int)blockIdx.x) * 32) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
- compute[(((((int)blockIdx.x) * 1568) + (((int)threadIdx.x) * 7)) + 5)] = max((conv2d_nchw[5] + bias[((((int)blockIdx.x) * 32) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
- compute[(((((int)blockIdx.x) * 1568) + (((int)threadIdx.x) * 7)) + 6)] = max((conv2d_nchw[6] + bias[((((int)blockIdx.x) * 32) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
}
</pre></div>
</div>
@@ -2829,7 +1580,7 @@ In the example below we resume the status and do more 5 trials.</p>
Get devices for measurement successfully!
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 5 minutes 30.470 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 5 minutes 37.098 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/e3e540f3b477c0c52d8eb73e674e8ffd/tune_conv2d_layer_cuda.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_conv2d_layer_cuda.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
index 5690298adf..ca41c53bb7 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
@@ -916,7 +916,7 @@ so we can read the log file and load the best schedules.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 7.8515 7.8508 7.8602 7.8435 0.0068
+ 7.8957 7.8972 7.8982 7.8916 0.0029
</pre></div>
</div>
</div>
@@ -938,7 +938,7 @@ to learn how to use the RPC Tracker and RPC Server.
To use the RPC Tracker in auto-scheduler, replace the runner in <code class="code docutils literal notranslate"><span class="pre">TuningOptions</span></code>
with <a class="reference internal" href="../../reference/api/python/auto_scheduler.html#tvm.auto_scheduler.RPCRunner" title="tvm.auto_scheduler.RPCRunner"><code class="xref any py py-class docutils literal notranslate"><span class="pre">auto_scheduler.RPCRunner</span></code></a>.</p></li>
</ol>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 1.205 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 1.802 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-network-cuda-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/eafe360d52540634c9eea0fa89e804bd/tune_network_cuda.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_network_cuda.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
index e17b6e04bc..01d1a691b7 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
@@ -935,7 +935,7 @@ so we can read the log file and load the best schedules.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 745.1719 745.3651 745.7341 744.4164 0.5550
+ 748.8855 748.8227 749.0615 748.7722 0.1262
</pre></div>
</div>
</div>
@@ -957,7 +957,7 @@ to learn how to use the RPC Tracker and RPC Server.
To use the RPC Tracker in auto-scheduler, replace the runner in <code class="code docutils literal notranslate"><span class="pre">TuningOptions</span></code>
with <a class="reference internal" href="../../reference/api/python/auto_scheduler.html#tvm.auto_scheduler.RPCRunner" title="tvm.auto_scheduler.RPCRunner"><code class="xref any py py-class docutils literal notranslate"><span class="pre">auto_scheduler.RPCRunner</span></code></a>.</p></li>
</ol>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 31.128 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 32.166 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-network-x86-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/e416b94ca1090b0897c0f6e0df95b911/tune_network_x86.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_network_x86.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
index bcb0a13976..40aad8730d 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
@@ -633,103 +633,31 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [128, 512], []),
compute: Buffer(compute_2: Pointer(float32), float32, [128, 512], [])}
buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute} {
- for (i0.outer.i1.outer.fused: int32, 0, 256) "parallel" {
- allocate(compute_3: Pointer(global float32), float32, [256]), storage_scope = global {
- for (i.inner.init: int32, 0, 16) {
- let cse_var_1: int32 = (i.inner.init*16)
- {
- compute_4: Buffer(compute_3, float32, [256], [])[cse_var_1] = 0f32
- compute_4[(cse_var_1 + 1)] = 0f32
- compute_4[(cse_var_1 + 2)] = 0f32
- compute_4[(cse_var_1 + 3)] = 0f32
- compute_4[(cse_var_1 + 4)] = 0f32
- compute_4[(cse_var_1 + 5)] = 0f32
- compute_4[(cse_var_1 + 6)] = 0f32
- compute_4[(cse_var_1 + 7)] = 0f32
- compute_4[(cse_var_1 + 8)] = 0f32
- compute_4[(cse_var_1 + 9)] = 0f32
- compute_4[(cse_var_1 + 10)] = 0f32
- compute_4[(cse_var_1 + 11)] = 0f32
- compute_4[(cse_var_1 + 12)] = 0f32
- compute_4[(cse_var_1 + 13)] = 0f32
- compute_4[(cse_var_1 + 14)] = 0f32
- compute_4[(cse_var_1 + 15)] = 0f32
- }
- }
- for (elem_idx: int32, 0, let cse_var_2: int32 = floormod(i0.outer.i1.outer.fused, 32) in (placeholder_15: Buffer(placeholder_13, int32, [33], [])[(cse_var_2 + 1)] - placeholder_15[cse_var_2])) {
- for (i.inner: int32, 0, 16) {
- let cse_var_3: int32 = floormod(i0.outer.i1.outer.fused, 32)
- {
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_4: int32 = (i.inner*16)
- compute_4[cse_var_4] = (compute_4[cse_var_4] + (placeholder_16: Buffer(placeholder_11, float32, [78656], [])[((placeholder_15[cse_var_3]*16) + (elem_idx*16))]*max(placeholder_17: Buffer(placeholder_10, float32, [32768], [])[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18: Buffer(placeholder_12, int32, [4916], [])[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_5: int32 = ((i.inner*16) + 1)
- compute_4[cse_var_5] = (compute_4[cse_var_5] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 1)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_6: int32 = ((i.inner*16) + 2)
- compute_4[cse_var_6] = (compute_4[cse_var_6] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 2)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_7: int32 = ((i.inner*16) + 3)
- compute_4[cse_var_7] = (compute_4[cse_var_7] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 3)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_8: int32 = ((i.inner*16) + 4)
- compute_4[cse_var_8] = (compute_4[cse_var_8] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 4)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_9: int32 = ((i.inner*16) + 5)
- compute_4[cse_var_9] = (compute_4[cse_var_9] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 5)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_10: int32 = ((i.inner*16) + 6)
- compute_4[cse_var_10] = (compute_4[cse_var_10] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 6)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_11: int32 = ((i.inner*16) + 7)
- compute_4[cse_var_11] = (compute_4[cse_var_11] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 7)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
+ for (i0.outer.i1.outer.fused: int32, 0, 64) "parallel" {
+ allocate(compute_3: Pointer(global float32), float32, [1024]), storage_scope = global {
+ for (i.outer.inner: int32, 0, 4) {
+ for (nb_j.inner: int32, 0, 2) {
+ for (i.inner.init: int32, 0, 8) {
+ for (j.init: int32, 0, 16) {
+ compute_4: Buffer(compute_3, float32, [1024], [])[((((i.outer.inner*256) + (i.inner.init*32)) + (nb_j.inner*16)) + j.init)] = 0f32
}
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_12: int32 = ((i.inner*16) + 8)
- compute_4[cse_var_12] = (compute_4[cse_var_12] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 8)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_13: int32 = ((i.inner*16) + 9)
- compute_4[cse_var_13] = (compute_4[cse_var_13] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 9)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_14: int32 = ((i.inner*16) + 10)
- compute_4[cse_var_14] = (compute_4[cse_var_14] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 10)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_15: int32 = ((i.inner*16) + 11)
- compute_4[cse_var_15] = (compute_4[cse_var_15] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 11)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_16: int32 = ((i.inner*16) + 12)
- compute_4[cse_var_16] = (compute_4[cse_var_16] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 12)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_17: int32 = ((i.inner*16) + 13)
- compute_4[cse_var_17] = (compute_4[cse_var_17] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 13)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_18: int32 = ((i.inner*16) + 14)
- compute_4[cse_var_18] = (compute_4[cse_var_18] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 14)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_15[(cse_var_3 + 1)] - placeholder_15[cse_var_3])), dtype=bool) {
- let cse_var_19: int32 = ((i.inner*16) + 15)
- compute_4[cse_var_19] = (compute_4[cse_var_19] + (placeholder_16[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + 15)]*max(placeholder_17[(((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i.inner*256)) + placeholder_18[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
+ }
+ for (elem_idx: int32, 0, let cse_var_1: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner) in (placeholder_15: Buffer(placeholder_13, int32, [33], [])[(cse_var_1 + 1)] - placeholder_15[cse_var_1])) {
+ for (i.inner: int32, 0, 8) {
+ for (j: int32, 0, 16) {
+ let cse_var_3: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)
+ let cse_var_2: int32 = ((((i.outer.inner*256) + (i.inner*32)) + (nb_j.inner*16)) + j)
+ compute_4[cse_var_2] = (compute_4[cse_var_2] + (placeholder_16: Buffer(placeholder_11, float32, [78656], [])[(((placeholder_15[cse_var_3]*16) + (elem_idx*16)) + j)]*max(placeholder_17: Buffer(placeholder_10, float32, [32768], [])[((((floordiv(i0.outer.i1.outer.fused, 16)*8192) + (i.outer.inner*2048)) + (i.inner*256)) + placeholder_18: Buffer(placeholder_12, int32, [4916], [])[(placeholder_15[cse_var_3] + elem_idx)])], 0f32)))
+ }
}
}
}
}
- for (i0.inner: int32, 0, 16) {
- let cse_var_20: int32 = (((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 32)*16))
- compute_5: Buffer(compute_2, float32, [65536], [])[ramp(cse_var_20, 1, 16)] = max((compute_4[ramp((i0.inner*16), 1, 16)] + placeholder_19: Buffer(placeholder_14, float32, [65536], [])[ramp(cse_var_20, 1, 16)]), broadcast(0f32, 16))
+ for (i0.inner: int32, 0, 32) {
+ for (i1.inner: int32, 0, 32) {
+ let cse_var_4: int32 = ((((floordiv(i0.outer.i1.outer.fused, 16)*16384) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32)) + i1.inner)
+ compute_5: Buffer(compute_2, float32, [65536], [])[cse_var_4] = max((compute_4[((i0.inner*32) + i1.inner)] + placeholder_19: Buffer(placeholder_14, float32, [65536], [])[cse_var_4]), 0f32)
+ }
}
}
}
@@ -767,7 +695,7 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
<span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.939 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.608 ms
</pre></div>
</div>
<div class="admonition note">
diff --git a/docs/how_to/tune_with_autotvm/sg_execution_times.html b/docs/how_to/tune_with_autotvm/sg_execution_times.html
index bbcfefee7f..09dc13e0b2 100644
--- a/docs/how_to/tune_with_autotvm/sg_execution_times.html
+++ b/docs/how_to/tune_with_autotvm/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-tune-with-autotvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:51.902</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
+<p><strong>00:42.433</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 84%" />
@@ -349,11 +349,11 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-conv2d-cuda-py"><span class="std std-ref">Tuning High Performance Convolution on NVIDIA GPUs</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_cuda.py</span></code>)</p></td>
-<td><p>00:51.868</p></td>
+<td><p>00:42.398</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_relay_x86.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-x86-py"><span class="std std-ref">Auto-tuning a Convolutional Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_x86.py</span></code>)</p></td>
-<td><p>00:00.020</p></td>
+<td><p>00:00.022</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_relay_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-cuda-py"><span class="std std-ref">Auto-tuning a Convolutional Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_cuda.py</span></code>)</p></td>
@@ -361,7 +361,7 @@
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_relay_arm.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-arm-py"><span class="std std-ref">Auto-tuning a Convolutional Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_arm.py</span></code>)</p></td>
-<td><p>00:00.004</p></td>
+<td><p>00:00.005</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_relay_mobile_gpu.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-mobile-gpu-py"><span class="std std-ref">Auto-tuning a Convolutional Network for Mobile GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_mobile_gpu.py</span></code>)</p></td>
diff --git a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
index e1e23a0d37..726f3020e7 100644
--- a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
+++ b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
@@ -690,9 +690,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 4, 16]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9369538
-No: 2 GFLOPS: 119.97/119.97 result: MeasureResult(costs=(0.0019296020655737705,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.6506681442260742, timestamp=1673586277.1947305) [('tile_f', [-1, 1, 8, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 64, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,795987
-No: 3 GFLOPS: 0.00/119.97 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 2, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 8, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9788789
+No: 2 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -814,8 +813,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 256]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1156339
-No: 4 GFLOPS: 0.00/119.97 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 8, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 64]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7714645
+No: 3 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -937,8 +936,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 1, 16]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 32, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,212907
-No: 5 GFLOPS: 0.00/119.97 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 2, 64]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 64]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5588645
+No: 4 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1060,9 +1059,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 64, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6059911
-No: 6 GFLOPS: 46.72/119.97 result: MeasureResult(costs=(0.004955297272727273,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.4855399131774902, timestamp=1673586282.7600145) [('tile_f', [-1, 1, 8, 16]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8392739
-No: 7 GFLOPS: 0.00/119.97 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 32, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 128, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5253730
+No: 5 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1184,8 +1182,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 1, 16]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3310726
-No: 8 GFLOPS: 0.00/119.97 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 2, 32]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 16, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3374552
+No: 6 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1307,8 +1305,131 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 32, 16]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7024784
-No: 9 GFLOPS: 0.00/119.97 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 16, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 256, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,996417
+No: 7 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+ func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
... 3942 lines suppressed ...