You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by tq...@apache.org on 2022/09/22 22:52:00 UTC
[tvm-site] branch asf-site updated: deploying docs (apache/tvm@4e783a6087fd236c588cde30e0ac99daa15afe61)
This is an automated email from the ASF dual-hosted git repository.
tqchen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/tvm-site.git
The following commit(s) were added to refs/heads/asf-site by this push:
new d6655c670a deploying docs (apache/tvm@4e783a6087fd236c588cde30e0ac99daa15afe61)
d6655c670a is described below
commit d6655c670a8b4994a82b103ed4cdec79bb5d79b0
Author: tvm-bot <95...@users.noreply.github.com>
AuthorDate: Thu Sep 22 22:51:52 2022 +0000
deploying docs (apache/tvm@4e783a6087fd236c588cde30e0ac99daa15afe61)
---
docs/_images/sphx_glr_micro_train_001.png | Bin 329141 -> 324216 bytes
docs/_images/sphx_glr_micro_train_thumb.png | Bin 22792 -> 23634 bytes
.../how_to/compile_models/from_darknet.rst.txt | 2 +-
.../how_to/compile_models/from_keras.rst.txt | 2 +-
.../how_to/compile_models/from_mxnet.rst.txt | 2 +-
.../how_to/compile_models/from_oneflow.rst.txt | 2 +-
.../how_to/compile_models/from_pytorch.rst.txt | 2 +-
.../how_to/compile_models/from_tensorflow.rst.txt | 2 +-
.../compile_models/sg_execution_times.rst.txt | 22 +-
.../deploy_models/deploy_model_on_android.rst.txt | 2 +-
.../deploy_object_detection_pytorch.rst.txt | 4 +-
.../deploy_models/deploy_prequantized.rst.txt | 6 +-
.../deploy_prequantized_tflite.rst.txt | 4 +-
.../how_to/deploy_models/deploy_quantized.rst.txt | 2 +-
.../deploy_models/deploy_ssd_gluoncv.rst.txt | 4 +-
.../deploy_models/sg_execution_times.rst.txt | 20 +-
.../extend_tvm/bring_your_own_datatypes.rst.txt | 2 +-
.../how_to/extend_tvm/sg_execution_times.rst.txt | 10 +-
.../how_to/extend_tvm/use_pass_instrument.rst.txt | 16 +-
.../optimize_operators/opt_conv_cuda.rst.txt | 2 +-
.../optimize_operators/opt_conv_tensorcore.rst.txt | 2 +-
.../how_to/optimize_operators/opt_gemm.rst.txt | 16 +-
.../optimize_operators/sg_execution_times.rst.txt | 8 +-
.../sg_execution_times.rst.txt | 14 +-
.../tune_conv2d_layer_cuda.rst.txt | 1410 +++++++-----------
.../tune_network_cuda.rst.txt | 2 +-
.../tune_network_x86.rst.txt | 4 +-
.../tune_sparse_x86.rst.txt | 161 +-
.../tune_with_autotvm/sg_execution_times.rst.txt | 10 +-
.../tune_with_autotvm/tune_conv2d_cuda.rst.txt | 192 ++-
.../work_with_microtvm/micro_autotune.rst.txt | 16 +-
.../how_to/work_with_microtvm/micro_train.rst.txt | 18 +-
.../work_with_microtvm/sg_execution_times.rst.txt | 10 +-
.../work_with_relay/sg_execution_times.rst.txt | 8 +-
.../how_to/work_with_schedules/intrin_math.rst.txt | 2 +-
.../work_with_schedules/sg_execution_times.rst.txt | 12 +-
.../how_to/work_with_schedules/tensorize.rst.txt | 2 +-
.../tutorials/autotvm/sg_execution_times.rst.txt | 4 +-
.../frontend/deploy_classification.rst.txt | 2 +-
.../tutorials/frontend/deploy_detection.rst.txt | 2 +-
.../tutorials/frontend/sg_execution_times.rst.txt | 6 +-
.../tutorials/optimize/sg_execution_times.rst.txt | 6 +-
.../topic/vta/tutorials/sg_execution_times.rst.txt | 6 +-
.../tutorial/auto_scheduler_matmul_x86.rst.txt | 2 +-
docs/_sources/tutorial/autotvm_matmul_x86.rst.txt | 20 +-
docs/_sources/tutorial/autotvm_relay_x86.rst.txt | 56 +-
.../tutorial/cross_compilation_and_rpc.rst.txt | 2 +-
docs/_sources/tutorial/intro_topi.rst.txt | 2 +-
docs/_sources/tutorial/sg_execution_times.rst.txt | 24 +-
.../tutorial/tensor_expr_get_started.rst.txt | 45 +-
docs/commit_hash | 2 +-
docs/genindex.html | 4 +
docs/how_to/compile_models/from_darknet.html | 2 +-
docs/how_to/compile_models/from_keras.html | 2 +-
docs/how_to/compile_models/from_mxnet.html | 2 +-
docs/how_to/compile_models/from_oneflow.html | 13 +-
docs/how_to/compile_models/from_pytorch.html | 6 +-
docs/how_to/compile_models/from_tensorflow.html | 2 +-
docs/how_to/compile_models/sg_execution_times.html | 26 +-
.../deploy_models/deploy_model_on_android.html | 2 +-
.../deploy_object_detection_pytorch.html | 23 +-
docs/how_to/deploy_models/deploy_prequantized.html | 8 +-
.../deploy_models/deploy_prequantized_tflite.html | 4 +-
docs/how_to/deploy_models/deploy_quantized.html | 2 +-
docs/how_to/deploy_models/deploy_ssd_gluoncv.html | 38 +-
docs/how_to/deploy_models/sg_execution_times.html | 20 +-
.../extend_tvm/bring_your_own_datatypes.html | 2 +-
docs/how_to/extend_tvm/sg_execution_times.html | 10 +-
docs/how_to/extend_tvm/use_pass_instrument.html | 16 +-
docs/how_to/optimize_operators/opt_conv_cuda.html | 2 +-
.../optimize_operators/opt_conv_tensorcore.html | 2 +-
docs/how_to/optimize_operators/opt_gemm.html | 16 +-
.../optimize_operators/sg_execution_times.html | 8 +-
.../sg_execution_times.html | 14 +-
.../tune_conv2d_layer_cuda.html | 1410 +++++++-----------
.../tune_with_autoscheduler/tune_network_cuda.html | 2 +-
.../tune_with_autoscheduler/tune_network_x86.html | 4 +-
.../tune_with_autoscheduler/tune_sparse_x86.html | 161 +-
.../tune_with_autotvm/sg_execution_times.html | 10 +-
.../how_to/tune_with_autotvm/tune_conv2d_cuda.html | 192 ++-
docs/how_to/work_with_microtvm/micro_autotune.html | 16 +-
docs/how_to/work_with_microtvm/micro_train.html | 16 +-
.../work_with_microtvm/sg_execution_times.html | 10 +-
.../how_to/work_with_relay/sg_execution_times.html | 8 +-
docs/how_to/work_with_schedules/intrin_math.html | 2 +-
.../work_with_schedules/sg_execution_times.html | 12 +-
docs/how_to/work_with_schedules/tensorize.html | 2 +-
docs/install/nnpack.html | 12 +-
docs/objects.inv | Bin 23538 -> 23545 bytes
docs/reference/api/doxygen/analyzer_8h.html | 2 +-
.../api/doxygen/analyzer_8h__dep__incl.svg | 802 +++++-----
docs/reference/api/doxygen/array_8h__dep__incl.svg | 384 ++---
.../doxygen/attr__registry__map_8h__dep__incl.svg | 204 +--
docs/reference/api/doxygen/bound_8h.html | 2 +-
docs/reference/api/doxygen/bound_8h__dep__incl.svg | 814 +++++------
docs/reference/api/doxygen/buffer_8h.html | 2 +-
.../reference/api/doxygen/buffer_8h__dep__incl.svg | 899 ++++++------
.../api/doxygen/c__runtime__api_8h__dep__incl.svg | 568 ++++----
.../api/doxygen/data__type_8h__dep__incl.svg | 664 ++++-----
.../api/doxygen/diagnostic_8h__dep__incl.svg | 48 +-
docs/reference/api/doxygen/dir_000016_000032.html | 2 +-
docs/reference/api/doxygen/dir_000038_000032.html | 2 +-
.../dir_3a038e7bfa2370c6aee2a5aecd5d3ef1.html | 3 +
.../dir_3a038e7bfa2370c6aee2a5aecd5d3ef1_dep.svg | 4 +-
.../dir_54983dd6d74c59f67ee9e8e5a50aafc4_dep.svg | 4 +-
.../dir_8e4e25e66b8623d88c5b5dd2040bca97_dep.svg | 4 +-
.../dir_ac57496531ccbad72f774fa62e6de987_dep.svg | 4 +-
.../dir_b4c7d8e826c599ba55146c099a14beb5_dep.svg | 4 +-
.../api/doxygen/env__func_8h__dep__incl.svg | 104 +-
docs/reference/api/doxygen/files.html | 9 +-
.../api/doxygen/functor_8h__dep__incl.svg | 464 +++---
.../api/doxygen/index__map_8h__dep__incl.svg | 128 +-
docs/reference/api/doxygen/int__set_8h.html | 2 +-
.../api/doxygen/int__set_8h__dep__incl.svg | 1016 +++++++------
.../api/doxygen/ir_2adt_8h__dep__incl.svg | 180 +--
.../api/doxygen/ir_2attrs_8h__dep__incl.svg | 236 +--
.../api/doxygen/ir_2expr_8h__dep__incl.svg | 540 +++----
.../api/doxygen/ir_2function_8h__dep__incl.svg | 248 ++--
.../api/doxygen/ir_2module_8h__dep__incl.svg | 180 +--
.../reference/api/doxygen/ir_2op_8h__dep__incl.svg | 44 +-
.../api/doxygen/ir_2span_8h__dep__incl.svg | 552 +++----
.../api/doxygen/ir_2type_8h__dep__incl.svg | 472 +++---
docs/reference/api/doxygen/layer__norm_8h.html | 113 ++
.../reference/api/doxygen/layer__norm_8h__incl.svg | 1531 ++++++++++++++++++++
.../api/doxygen/layer__norm_8h_source.html | 100 ++
docs/reference/api/doxygen/map_8h__dep__incl.svg | 464 +++---
.../api/doxygen/namespacemembers_func_l.html | 11 +-
.../api/doxygen/namespacemembers_func_m.html | 13 +-
.../api/doxygen/namespacemembers_func_p.html | 6 +-
.../api/doxygen/namespacemembers_func_s.html | 6 +-
docs/reference/api/doxygen/namespacemembers_l.html | 17 +-
docs/reference/api/doxygen/namespacemembers_m.html | 13 +-
docs/reference/api/doxygen/namespacemembers_p.html | 6 +-
docs/reference/api/doxygen/namespacemembers_s.html | 8 +-
.../api/doxygen/namespacetvm_1_1topi.html | 566 ++++----
.../api/doxygen/namespacetvm_1_1topi_1_1nn.html | 84 ++
.../api/doxygen/ndarray_8h__dep__incl.svg | 556 +++----
docs/reference/api/doxygen/node_8h__dep__incl.svg | 516 +++----
.../reference/api/doxygen/object_8h__dep__incl.svg | 748 +++++-----
.../api/doxygen/object__path_8h__dep__incl.svg | 512 +++----
docs/reference/api/doxygen/operation_8h.html | 2 +-
.../api/doxygen/operation_8h__dep__incl.svg | 718 ++++-----
.../api/doxygen/optional_8h__dep__incl.svg | 620 ++++----
.../api/doxygen/packed__func_8h__dep__incl.svg | 356 ++---
docs/reference/api/doxygen/reduction_8h.html | 3 +
.../reference/api/doxygen/reduction_8h_source.html | 4 +-
.../api/doxygen/reflection_8h__dep__incl.svg | 688 ++++-----
.../api/doxygen/registry_8h__dep__incl.svg | 208 +--
.../api/doxygen/repr__printer_8h__dep__incl.svg | 508 +++----
.../runtime_2container_2adt_8h__dep__incl.svg | 180 +--
.../runtime_2container_2base_8h__dep__incl.svg | 740 +++++-----
.../api/doxygen/runtime_2memory_8h__dep__incl.svg | 580 ++++----
.../api/doxygen/runtime_2module_8h__dep__incl.svg | 336 ++---
docs/reference/api/doxygen/search/all_11.js | 2 +-
docs/reference/api/doxygen/search/all_14.js | 6 +-
docs/reference/api/doxygen/search/all_18.js | 2 +-
docs/reference/api/doxygen/search/all_d.js | 2 +
docs/reference/api/doxygen/search/all_e.js | 1 +
docs/reference/api/doxygen/search/files_8.js | 1 +
docs/reference/api/doxygen/search/functions_10.js | 2 +-
docs/reference/api/doxygen/search/functions_13.js | 4 +-
docs/reference/api/doxygen/search/functions_17.js | 2 +-
docs/reference/api/doxygen/search/functions_c.js | 1 +
docs/reference/api/doxygen/search/functions_d.js | 1 +
.../api/doxygen/serializer_8h__dep__incl.svg | 544 +++----
.../api/doxygen/shape__tuple_8h__dep__incl.svg | 420 +++---
.../api/doxygen/source__map_8h__dep__incl.svg | 256 ++--
docs/reference/api/doxygen/stmt_8h__dep__incl.svg | 340 ++---
.../reference/api/doxygen/string_8h__dep__incl.svg | 468 +++---
.../doxygen/structural__equal_8h__dep__incl.svg | 500 +++----
.../api/doxygen/structural__hash_8h__dep__incl.svg | 500 +++----
docs/reference/api/doxygen/tags_8h.html | 2 +-
docs/reference/api/doxygen/tags_8h__dep__incl.svg | 340 ++---
docs/reference/api/doxygen/te_2schedule_8h.html | 2 +-
.../api/doxygen/te_2schedule_8h__dep__incl.svg | 812 +++++------
docs/reference/api/doxygen/tensor_8h.html | 2 +-
.../reference/api/doxygen/tensor_8h__dep__incl.svg | 886 ++++++-----
docs/reference/api/doxygen/tensor__intrin_8h.html | 2 +-
.../api/doxygen/tensor__intrin_8h__dep__incl.svg | 750 +++++-----
.../api/doxygen/tir_2expr_8h__dep__incl.svg | 396 ++---
docs/reference/api/doxygen/tir_2op_8h.html | 2 +-
.../api/doxygen/tir_2op_8h__dep__incl.svg | 910 ++++++------
.../api/doxygen/type__relation_8h__dep__incl.svg | 92 +-
docs/reference/api/doxygen/var_8h__dep__incl.svg | 352 ++---
docs/reference/api/doxygen/with_8h__dep__incl.svg | 320 ++--
docs/reference/api/python/auto_scheduler.html | 4 +-
docs/reference/api/python/topi.html | 26 +
.../api/typedoc/classes/bytestreamreader.html | 12 +-
.../api/typedoc/classes/cachedcallstack.html | 34 +-
docs/reference/api/typedoc/classes/dldatatype.html | 12 +-
docs/reference/api/typedoc/classes/dldevice.html | 10 +-
.../reference/api/typedoc/classes/environment.html | 12 +-
docs/reference/api/typedoc/classes/ffilibrary.html | 20 +-
.../api/typedoc/classes/graphexecutor.html | 16 +-
docs/reference/api/typedoc/classes/instance.html | 40 +-
docs/reference/api/typedoc/classes/memory.html | 34 +-
docs/reference/api/typedoc/classes/module.html | 10 +-
docs/reference/api/typedoc/classes/ndarray.html | 22 +-
.../api/typedoc/classes/packedfunccell.html | 6 +-
docs/reference/api/typedoc/classes/rpcserver.html | 14 +-
docs/reference/api/typedoc/classes/scalar.html | 6 +-
.../api/typedoc/classes/webgpucontext.html | 12 +-
docs/reference/api/typedoc/enums/argtypecode.html | 30 +-
.../api/typedoc/enums/aynccallbackcode.html | 4 +-
.../api/typedoc/enums/dldatatypecode.html | 8 +-
.../api/typedoc/enums/rpcserverstate.html | 12 +-
docs/reference/api/typedoc/enums/sizeof.html | 18 +-
docs/reference/api/typedoc/index.html | 112 +-
.../api/typedoc/interfaces/disposable.html | 2 +-
.../api/typedoc/interfaces/functioninfo.html | 6 +-
.../api/typedoc/interfaces/libraryprovider.html | 4 +-
docs/searchindex.js | 2 +-
.../vta/tutorials/autotvm/sg_execution_times.html | 4 +-
.../tutorials/frontend/deploy_classification.html | 2 +-
.../vta/tutorials/frontend/deploy_detection.html | 2 +-
.../vta/tutorials/frontend/sg_execution_times.html | 6 +-
.../vta/tutorials/optimize/sg_execution_times.html | 6 +-
docs/topic/vta/tutorials/sg_execution_times.html | 6 +-
docs/tutorial/auto_scheduler_matmul_x86.html | 2 +-
docs/tutorial/autotvm_matmul_x86.html | 20 +-
docs/tutorial/autotvm_relay_x86.html | 270 ++--
docs/tutorial/cross_compilation_and_rpc.html | 2 +-
docs/tutorial/intro_topi.html | 2 +-
docs/tutorial/sg_execution_times.html | 28 +-
docs/tutorial/tensor_expr_get_started.html | 41 +-
225 files changed, 16576 insertions(+), 15280 deletions(-)
diff --git a/docs/_images/sphx_glr_micro_train_001.png b/docs/_images/sphx_glr_micro_train_001.png
index 9c86215278..58230570fb 100644
Binary files a/docs/_images/sphx_glr_micro_train_001.png and b/docs/_images/sphx_glr_micro_train_001.png differ
diff --git a/docs/_images/sphx_glr_micro_train_thumb.png b/docs/_images/sphx_glr_micro_train_thumb.png
index 839a4f5975..c7f45c5bc4 100644
Binary files a/docs/_images/sphx_glr_micro_train_thumb.png and b/docs/_images/sphx_glr_micro_train_thumb.png differ
diff --git a/docs/_sources/how_to/compile_models/from_darknet.rst.txt b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
index e8b0332999..2b5422a6fe 100644
--- a/docs/_sources/how_to/compile_models/from_darknet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
@@ -315,7 +315,7 @@ The process is no different from other examples.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 2.659 seconds)
+ **Total running time of the script:** ( 1 minutes 2.345 seconds)
.. _sphx_glr_download_how_to_compile_models_from_darknet.py:
diff --git a/docs/_sources/how_to/compile_models/from_keras.rst.txt b/docs/_sources/how_to/compile_models/from_keras.rst.txt
index d3007248dd..509c185f80 100644
--- a/docs/_sources/how_to/compile_models/from_keras.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_keras.rst.txt
@@ -228,7 +228,7 @@ Look up prediction top 1 index in 1000 class synset.
.. code-block:: none
Relay top-1 id: 285, class name: Egyptian cat
-
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 1s 939ms/step
+
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 1s 952ms/step
Keras top-1 id: 285, class name: Egyptian cat
diff --git a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
index 62cb71a7c9..81176c4da9 100644
--- a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
@@ -115,7 +115,7 @@ In this section, we download a pretrained imagenet model and classify an image.
.. code-block:: none
- Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zipad65b5d5-f351-40df-b354-b0c5e6ea4e50 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+ Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zipb4fe9518-3411-4a64-8110-b3cf781ae214 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
x (1, 3, 224, 224)
diff --git a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
index 6b38d8fbc2..a37a075804 100644
--- a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
@@ -116,7 +116,7 @@ Load a pretrained OneFlow model and save model
.. code-block:: none
Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
-
0%| | 0.00/41.5M [00:00<?, ?B/s]
15%|#5 | 6.33M/41.5M [00:00<00:00, 60.4MB/s]
29%|##9 | 12.1M/41.5M [00:00<00:00, 47.3MB/s]
40%|#### | 16.7M/41.5M [00:00<00:00, 32.8MB/s]
58%|#####7 | 24.0M/41.5M [00:00<00:00, 39.5MB/s]
77%|#######7 | 32.0M/41.5M [00:00<00:00, 45.6MB/s]
96%|#########6| 40.0M/41.5M [00:00<00:00, 48.1MB/s]
100%|##########| 41.5M/41.5M [00:00<00:00, 46.4MB/s]
+
0%| | 0.00/41.5M [00:00<?, ?B/s]
19%|#9 | 7.99M/41.5M [00:00<00:00, 66.9MB/s]
39%|###8 | 16.0M/41.5M [00:00<00:00, 71.1MB/s]
58%|#####7 | 24.0M/41.5M [00:00<00:00, 68.0MB/s]
77%|#######7 | 32.0M/41.5M [00:00<00:00, 73.2MB/s]
94%|#########4| 39.1M/41.5M [00:00<00:00, 52.7MB/s]
100%|##########| 41.5M/41.5M [00:00<00:00, 59.7MB/s]
diff --git a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
index f6c583bc43..d2f69bc687 100644
--- a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
@@ -94,7 +94,7 @@ Load a pretrained PyTorch model
.. code-block:: none
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
-
0%| | 0.00/44.7M [00:00<?, ?B/s]
44%|####3 | 19.6M/44.7M [00:00<00:00, 205MB/s]
93%|#########3| 41.6M/44.7M [00:00<00:00, 220MB/s]
100%|##########| 44.7M/44.7M [00:00<00:00, 215MB/s]
+
0%| | 0.00/44.7M [00:00<?, ?B/s]
22%|##1 | 9.76M/44.7M [00:00<00:00, 102MB/s]
46%|####6 | 20.6M/44.7M [00:00<00:00, 109MB/s]
100%|##########| 44.7M/44.7M [00:00<00:00, 157MB/s]
diff --git a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
index 6731381ac3..de44ff9918 100644
--- a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
@@ -416,7 +416,7 @@ Run the corresponding model on tensorflow
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 3.207 seconds)
+ **Total running time of the script:** ( 1 minutes 3.087 seconds)
.. _sphx_glr_download_how_to_compile_models_from_tensorflow.py:
diff --git a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
index b129f128f9..c2c2d7c62a 100644
--- a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
@@ -5,26 +5,26 @@
Computation times
=================
-**05:02.275** total execution time for **how_to_compile_models** files:
+**05:02.529** total execution time for **how_to_compile_models** files:
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:03.207 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:03.087 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``) | 01:02.659 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``) | 01:02.345 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``) | 00:38.688 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``) | 00:38.976 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``) | 00:28.892 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``) | 00:27.497 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``) | 00:25.105 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``) | 00:26.280 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``) | 00:24.955 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``) | 00:24.429 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``) | 00:21.186 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``) | 00:21.363 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``) | 00:18.982 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``) | 00:19.564 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``) | 00:16.211 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``) | 00:16.439 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``) | 00:02.390 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``) | 00:02.550 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
index 0d02f4996c..c90b45682c 100644
--- a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
@@ -434,7 +434,7 @@ Execute on TVM
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 15.5880 15.5711 15.8581 15.4948 0.1032
+ 15.6156 15.5715 15.9768 15.5318 0.1261
diff --git a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
index d6db5b2205..7cd7c546f9 100644
--- a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
@@ -123,7 +123,7 @@ Load pre-trained maskrcnn from torchvision and do tracing
.. code-block:: none
Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
-
0%| | 0.00/170M [00:00<?, ?B/s]
3%|2 | 4.25M/170M [00:00<00:03, 44.6MB/s]
5%|5 | 8.97M/170M [00:00<00:03, 47.4MB/s]
17%|#6 | 28.3M/170M [00:00<00:01, 118MB/s]
32%|###1 | 54.1M/170M [00:00<00:00, 178MB/s]
42%|####1 | 71.1M/170M [00:00<00:00, 173MB/s]
52%|#####1 | 87.7M/170M [00:00<00:00, 167MB/s]
61%|######1 | 104M/170M [00:00<00:00, 164MB/s]
76%|#######5 | 129M/170M [00:00<00:00, 195MB/s]
87%|########7 | 148M/170M [00:00<00:00, 198MB/s]
99%|#########8| 167M/170M [00:01<00:00, 180MB/s]
100%|##########| 170M/170M [00:01<00:00, 167MB/s]
+
0%| | 0.00/170M [00:00<?, ?B/s]
2%|2 | 4.24M/170M [00:00<00:03, 44.5MB/s]
5%|4 | 8.48M/170M [00:00<00:04, 41.6MB/s]
17%|#7 | 29.4M/170M [00:00<00:01, 120MB/s]
31%|###1 | 53.0M/170M [00:00<00:00, 169MB/s]
44%|####4 | 75.3M/170M [00:00<00:00, 190MB/s]
55%|#####5 | 93.7M/170M [00:00<00:00, 167MB/s]
66%|######5 | 111M/170M [00:00<00:00, 173MB/s]
78%|#######7 | 132M/170M [00:00<00:00, 185MB/s]
89%|########9 | 152M/170M [00:00<00:00, 193MB/s]
100%|##########| 170M/170M [00:01<00:00, 171MB/s]
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:3878: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
for i in range(dim)
/usr/local/lib/python3.7/dist-packages/torchvision/models/detection/anchor_utils.py:127: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
@@ -288,7 +288,7 @@ Get boxes with score larger than 0.9
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 2 minutes 56.393 seconds)
+ **Total running time of the script:** ( 2 minutes 54.438 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_object_detection_pytorch.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
index b9bdcaf706..0b62abe741 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
@@ -232,7 +232,7 @@ training. Other models require a full post training calibration.
.. code-block:: none
Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
-
0%| | 0.00/13.6M [00:00<?, ?B/s]
26%|##6 | 3.54M/13.6M [00:00<00:00, 37.0MB/s]
52%|#####2 | 7.08M/13.6M [00:00<00:00, 35.3MB/s]
100%|##########| 13.6M/13.6M [00:00<00:00, 58.6MB/s]
+
0%| | 0.00/13.6M [00:00<?, ?B/s]
100%|##########| 13.6M/13.6M [00:00<00:00, 156MB/s]
@@ -405,7 +405,7 @@ Here we give an example of how to measure performance of TVM compiled models.
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 90.2916 90.1593 96.2862 89.9157 0.6391
+ 90.1920 90.1001 92.8598 89.9293 0.3587
@@ -454,7 +454,7 @@ TODO
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 7.576 seconds)
+ **Total running time of the script:** ( 1 minutes 7.664 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_prequantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
index 3d9a295ff4..dd0176f589 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
@@ -432,7 +432,7 @@ Here we give an example of how to measure performance of TVM compiled models.
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 120.0710 119.9796 124.3624 118.7161 0.8110
+ 119.2496 119.4333 122.3497 117.1914 1.0613
@@ -469,7 +469,7 @@ Here we give an example of how to measure performance of TVM compiled models.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 52.732 seconds)
+ **Total running time of the script:** ( 1 minutes 52.227 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_prequantized_tflite.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
index b21fbcf0be..6417836cd3 100644
--- a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
@@ -253,7 +253,7 @@ We create a Relay VM to build and execute the model.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 25.819 seconds)
+ **Total running time of the script:** ( 1 minutes 22.168 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_quantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
index d9e3bb4488..b0f1acd260 100644
--- a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
@@ -158,7 +158,7 @@ Convert and compile model for CPU.
data: None
input_sym_arg_type = in_param.infer_type()[0]
Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
-
0%| | 0/132723 [00:00<?, ?KB/s]
4%|3 | 4699/132723 [00:00<00:02, 46984.78KB/s]
10%|9 | 12752/132723 [00:00<00:01, 66712.47KB/s]
16%|#5 | 20944/132723 [00:00<00:01, 73654.74KB/s]
22%|##1 | 29143/132723 [00:00<00:01, 76942.53KB/s]
28%|##8 | 37304/132723 [00:00<00:01, 78623.23KB/s]
34%|###4 | 45445/132723 [00:00<00:01, 79568.58KB/s]
40%|#### | 53637/132723 [00:00<00:00, 80334.48KB/s]
47%|####6 | 61831/132723 [00:00<00:00, 80842.53KB/s]
53%|#####2 | 69978/132723 [00:00<00:00, 81037.13KB/s]
59%|#####8 | 78082/132723 [00:01<00:00, 78412.88KB/s]
65%|######4 | 86209/132723 [00:01<00:00, 79260.15KB/s]
71%|#######1 | 94425/132723 [00:01<00:00, 80126.85KB/s]
77%|#######7 | 102561/132723 [00:01<00:00, 80489.18KB/s]
83%|########3 | 110757/132723 [00:01<00:00, 80928.15KB/s]
90%|########9 | 118856/132723 [00:01<00:00, 79754.76KB/s]
96%|########
#5| 126840/132723 [00:01<00:00, 79420.54KB/s]
100%|##########| 132723/132723 [00:01<00:00, 78475.19KB/s]
+
0%| | 0/132723 [00:00<?, ?KB/s]
1%|1 | 1853/132723 [00:00<00:07, 18523.44KB/s]
4%|4 | 5643/132723 [00:00<00:04, 29916.54KB/s]
10%|9 | 12820/132723 [00:00<00:02, 49023.87KB/s]
15%|#5 | 20439/132723 [00:00<00:01, 59744.91KB/s]
21%|##1 | 28091/132723 [00:00<00:01, 65787.26KB/s]
27%|##6 | 35715/132723 [00:00<00:01, 69339.71KB/s]
33%|###2 | 43376/132723 [00:00<00:01, 71713.82KB/s]
38%|###8 | 50981/132723 [00:00<00:01, 73091.49KB/s]
44%|####4 | 58665/132723 [00:00<00:00, 74257.33KB/s]
50%|##### | 66402/132723 [00:01<00:00, 75214.32KB/s]
56%|#####5 | 74108/132723 [00:01<00:00, 75778.10KB/s]
62%|######1 | 81818/132723 [00:01<00:00, 76175.14KB/s]
67%|######7 | 89505/132723 [00:01<00:00, 76384.64KB/s]
73%|#######3 | 97144/132723 [00:01<00:00, 76378.58KB/s]
79%|#######8 | 104782/132723 [00:01<00:00, 75872.05KB/s]
85%|########4 |
112370/132723 [00:01<00:00, 75751.69KB/s]
90%|######### | 119946/132723 [00:01<00:00, 75375.03KB/s]
96%|#########6| 127485/132723 [00:01<00:00, 75175.26KB/s]
100%|##########| 132723/132723 [00:01<00:00, 70741.71KB/s]
@@ -234,7 +234,7 @@ Display result
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 2 minutes 32.893 seconds)
+ **Total running time of the script:** ( 2 minutes 33.980 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_ssd_gluoncv.py:
diff --git a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
index bf59a24bbe..81d2d1c997 100644
--- a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
@@ -5,24 +5,24 @@
Computation times
=================
-**11:09.444** total execution time for **how_to_deploy_models** files:
+**11:03.720** total execution time for **how_to_deploy_models** files:
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 02:56.393 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 02:54.438 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``) | 02:32.893 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``) | 02:33.980 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``) | 01:52.732 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``) | 01:52.227 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``) | 01:25.819 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``) | 01:22.168 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``) | 01:07.576 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``) | 01:07.664 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``) | 00:29.309 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``) | 00:29.555 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_nano.py` (``deploy_model_on_nano.py``) | 00:22.628 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_nano.py` (``deploy_model_on_nano.py``) | 00:22.025 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``) | 00:22.088 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``) | 00:21.658 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_sparse.py` (``deploy_sparse.py``) | 00:00.006 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_sparse.py` (``deploy_sparse.py``) | 00:00.007 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
index f4ff4a5eaf..f1b4877015 100644
--- a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
@@ -472,7 +472,7 @@ First let us define two helper functions to get the mobilenet model and a cat im
.. code-block:: none
- Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip43250d6e-f713-42db-bbf0-2168e938c07c from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+ Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip18a0ccaa-7530-4369-9b75-790d8fb0e3ef from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
diff --git a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
index a75a249a92..f2c3ed10b8 100644
--- a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
Computation times
=================
-**00:40.258** total execution time for **how_to_extend_tvm** files:
+**00:40.753** total execution time for **how_to_extend_tvm** files:
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:37.192 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:37.678 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``) | 00:02.156 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``) | 00:02.153 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``) | 00:00.902 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``) | 00:00.915 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``) | 00:00.007 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``) | 00:00.008 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
index ab56ff6b5d..eb0bbb7a19 100644
--- a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
@@ -216,10 +216,10 @@ profile the execution time of each passes.
.. code-block:: none
Printing results of timing profile...
- InferType: 6774us [6774us] (45.94%; 45.94%)
- FoldScaleAxis: 7972us [5us] (54.06%; 54.06%)
- FoldConstant: 7968us [1673us] (54.03%; 99.94%)
- InferType: 6295us [6295us] (42.69%; 79.00%)
+ InferType: 6751us [6751us] (45.94%; 45.94%)
+ FoldScaleAxis: 7945us [5us] (54.06%; 54.06%)
+ FoldConstant: 7940us [1626us] (54.03%; 99.94%)
+ InferType: 6313us [6313us] (42.96%; 79.52%)
@@ -258,10 +258,10 @@ Refer to following sections and :py:func:`tvm.instrument.pass_instrument` for th
.. code-block:: none
Printing results of timing profile...
- InferType: 6278us [6278us] (44.45%; 44.45%)
- FoldScaleAxis: 7846us [4us] (55.55%; 55.55%)
- FoldConstant: 7841us [1647us] (55.52%; 99.95%)
- InferType: 6195us [6195us] (43.86%; 79.00%)
+ InferType: 6342us [6342us] (44.68%; 44.68%)
+ FoldScaleAxis: 7852us [5us] (55.32%; 55.32%)
+ FoldConstant: 7847us [1632us] (55.29%; 99.94%)
+ InferType: 6215us [6215us] (43.79%; 79.20%)
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
index 0601e676f1..5630d3bf18 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
@@ -340,7 +340,7 @@ latency of convolution.
.. code-block:: none
- Convolution: 39.244670 ms
+ Convolution: 54.208480 ms
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
index 43af2e41a8..d9d79889aa 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
@@ -671,7 +671,7 @@ be able to run on our build server
.. code-block:: none
- conv2d with tensor core: 13.376755 ms
+ conv2d with tensor core: 6.682477 ms
diff --git a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
index 03064ab594..d70848b16f 100644
--- a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
@@ -143,8 +143,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
.. code-block:: none
- Numpy running time: 0.018204
- Baseline: 3.337629
+ Numpy running time: 0.017839
+ Baseline: 3.487118
@@ -239,7 +239,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
.. code-block:: none
- Opt1: 0.294708
+ Opt1: 0.293766
@@ -342,7 +342,7 @@ In this tutorial, we chose to vectorize the inner loop row data since it is cach
.. code-block:: none
- Opt2: 0.326730
+ Opt2: 0.330934
@@ -438,7 +438,7 @@ the access pattern for A matrix is more cache friendly.
.. code-block:: none
- Opt3: 0.115941
+ Opt3: 0.113946
@@ -563,7 +563,7 @@ flattening.
.. code-block:: none
- Opt4: 0.109459
+ Opt4: 0.109250
@@ -685,7 +685,7 @@ write to C when all the block results are ready.
.. code-block:: none
- Opt5: 0.113573
+ Opt5: 0.111532
@@ -810,7 +810,7 @@ Furthermore, we can also utilize multi-core processors to do the thread-level pa
.. code-block:: none
- Opt6: 0.147124
+ Opt6: 0.147087
diff --git a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
index 7076f7e677..29894fb4b8 100644
--- a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
Computation times
=================
-**00:34.390** total execution time for **how_to_optimize_operators** files:
+**00:34.556** total execution time for **how_to_optimize_operators** files:
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``) | 00:31.811 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``) | 00:32.220 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.419 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.290 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``) | 00:01.160 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``) | 00:01.046 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
index 62a43d387f..817325eb64 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
@@ -5,18 +5,18 @@
Computation times
=================
-**06:13.305** total execution time for **how_to_tune_with_autoscheduler** files:
+**06:26.086** total execution time for **how_to_tune_with_autoscheduler** files:
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 03:20.140 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 03:21.096 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``) | 01:21.526 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``) | 01:22.136 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``) | 00:55.852 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``) | 00:56.053 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``) | 00:18.698 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``) | 00:29.591 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``) | 00:08.639 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``) | 00:08.701 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``) | 00:08.450 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``) | 00:08.509 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
index e6bb5d543f..de225f00fe 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
@@ -240,483 +240,316 @@ cooperative fetching, unrolling and operator fusion.
compute: Buffer(compute_2: Pointer(float32), float32, [25088], [])}
buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute}
preflattened_buffer_map = {data_1: data_3: Buffer(data_2, float32, [1, 512, 7, 7], []), kernel_1: kernel_3: Buffer(kernel_2, float32, [512, 512, 3, 3], []), bias_1: bias_3: Buffer(bias_2, float32, [1, 512, 1, 1], []), compute_1: compute_3: Buffer(compute_2, float32, [1, 512, 7, 7], [])} {
- attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 28;
- allocate(conv2d_nchw: Pointer(local float32), float32, [14]), storage_scope = local;
- allocate(pad_temp.shared: Pointer(shared float32), float32, [72]), storage_scope = shared;
- allocate(kernel.shared: Pointer(shared float32), float32, [3072]), storage_scope = shared;
- attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
- conv2d_nchw_1: Buffer(conv2d_nchw, float32, [14], [], scope="local", align=32)[0] = 0f32
+ attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 64;
+ allocate(conv2d_nchw: Pointer(local float32), float32, [7]), storage_scope = local;
+ allocate(pad_temp.shared: Pointer(shared float32), float32, [4032]), storage_scope = shared;
+ allocate(kernel.shared: Pointer(shared float32), float32, [1536]), storage_scope = shared;
+ attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ conv2d_nchw_1: Buffer(conv2d_nchw, float32, [7], [], scope="local", align=16)[0] = 0f32
conv2d_nchw_1[1] = 0f32
conv2d_nchw_1[2] = 0f32
conv2d_nchw_1[3] = 0f32
conv2d_nchw_1[4] = 0f32
conv2d_nchw_1[5] = 0f32
conv2d_nchw_1[6] = 0f32
- conv2d_nchw_1[7] = 0f32
- conv2d_nchw_1[8] = 0f32
- conv2d_nchw_1[9] = 0f32
- conv2d_nchw_1[10] = 0f32
- conv2d_nchw_1[11] = 0f32
- conv2d_nchw_1[12] = 0f32
- conv2d_nchw_1[13] = 0f32
- for (rc.outer.outer: int32, 0, 64) {
- for (ry.outer.outer: int32, 0, 3) {
- let cse_var_2: int32 = (rc.outer.outer*72)
- let cse_var_1: int32 = (ry.outer.outer*3)
+ for (rc.outer.outer: int32, 0, 8) {
+ for (rx.outer.outer: int32, 0, 3) {
+ let cse_var_2: int32 = (rc.outer.outer*3136)
+ let cse_var_1: int32 = (rc.outer.outer*576)
{
- attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1: Buffer(pad_temp.shared, float32, [72], [], scope="shared")[(threadIdx.x_1*4)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1*4), 9))) && (floormod((threadIdx.x_1*4), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1*4), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1*4), 9)) - 8)], 0f3 [...]
- }
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1[((threadIdx.x_1*4) + 1)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 1), 9))) && (floormod(((threadIdx.x_1*4) + 1), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 1), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 1), 9)) - 8)], 0f32, dtype=float32)
- }
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1[((threadIdx.x_1*4) + 2)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 2), 9))) && (floormod(((threadIdx.x_1*4) + 2), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 2), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 2), 9)) - 8)], 0f32, dtype=float32)
- }
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1[((threadIdx.x_1*4) + 3)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 3), 9))) && (floormod(((threadIdx.x_1*4) + 3), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 3), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 3), 9)) - 8)], 0f32, dtype=float32)
- }
+ attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1: Buffer(pad_temp.shared, float32, [4032], [], scope="shared")[(threadIdx.x_1*2)] = @tir.if_then_else(((((7 <= floormod((threadIdx.x_1*2), 63)) && (floormod((threadIdx.x_1*2), 63) < 56)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1*2), 63)*49)) + rx.outer.outer) + floormod((threadIdx.x_1*2), 63)) - 8)], 0f32, dtype=float32)
+ pad_temp.shared_1[((threadIdx.x_1*2) + 1)] = @tir.if_then_else(((((7 <= floormod(((threadIdx.x_1*2) + 1), 63)) && (floormod(((threadIdx.x_1*2) + 1), 63) < 56)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 1), 63)*49)) + rx.outer.outer) + floormod(((threadIdx.x_1*2) + 1), 63)) - 8)], 0f32, dtype=float32)
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 112), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 112) [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 113), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 + [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 224), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 224) [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 225), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 + [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 336), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 336) [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 337), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 + [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 448), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 448) [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 449), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 + [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 560), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 560) [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 561), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 + [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 672), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 672) [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 673), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 + [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 784), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 784) [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 785), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 + [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 896), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 896) [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 897), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 + [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[((threadIdx.x_1*2) + 1008)] = @tir.if_then_else(((((7 <= floormod((threadIdx.x_1*2), 63)) && (floormod((threadIdx.x_1*2), 63) < 56)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1*2), 63)*49)) + rx.outer.outer) + floormod((threadIdx.x_1*2), 63)) + 776)], 0f32, dtype=float32)
+ pad_temp.shared_1[((threadIdx.x_1*2) + 1009)] = @tir.if_then_else(((((7 <= floormod(((threadIdx.x_1*2) + 1), 63)) && (floormod(((threadIdx.x_1*2) + 1), 63) < 56)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 1), 63)*49)) + rx.outer.outer) + floormod(((threadIdx.x_1*2) + 1), 63)) + 776)], 0f32, dtype=float32)
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1120), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 112 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1121), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1232), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 123 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1233), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1344), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 134 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1345), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1456), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 145 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1457), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1568), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 156 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1569), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1680), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 168 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1681), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1792), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 179 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1793), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1904), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 190 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1905), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[((threadIdx.x_1*2) + 2016)] = @tir.if_then_else(((((7 <= floormod((threadIdx.x_1*2), 63)) && (floormod((threadIdx.x_1*2), 63) < 56)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1*2), 63)*49)) + rx.outer.outer) + floormod((threadIdx.x_1*2), 63)) + 1560)], 0f32, dtype=float32)
+ pad_temp.shared_1[((threadIdx.x_1*2) + 2017)] = @tir.if_then_else(((((7 <= floormod(((threadIdx.x_1*2) + 1), 63)) && (floormod(((threadIdx.x_1*2) + 1), 63) < 56)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 1), 63)*49)) + rx.outer.outer) + floormod(((threadIdx.x_1*2) + 1), 63)) + 1560)], 0f32, dtype=float32)
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2128), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 212 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2129), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2240), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 224 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2241), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2352), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 235 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2353), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2464), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 246 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2465), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2576), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 257 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2577), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2688), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 268 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2689), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2800), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 280 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2801), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2912), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 291 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2913), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[((threadIdx.x_1*2) + 3024)] = @tir.if_then_else(((((7 <= floormod((threadIdx.x_1*2), 63)) && (floormod((threadIdx.x_1*2), 63) < 56)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1*2), 63)*49)) + rx.outer.outer) + floormod((threadIdx.x_1*2), 63)) + 2344)], 0f32, dtype=float32)
+ pad_temp.shared_1[((threadIdx.x_1*2) + 3025)] = @tir.if_then_else(((((7 <= floormod(((threadIdx.x_1*2) + 1), 63)) && (floormod(((threadIdx.x_1*2) + 1), 63) < 56)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 1), 63)*49)) + rx.outer.outer) + floormod(((threadIdx.x_1*2) + 1), 63)) + 2344)], 0f32, dtype=float32)
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3136), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 313 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3137), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3248), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 324 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3249), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3360), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 336 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3361), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3472), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 347 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3473), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3584), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 358 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3585), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3696), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 369 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3697), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3808), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 380 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3809), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3920), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 392 [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3921), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[(((((cse_var_2 [...]
+ }
+ attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1: Buffer(kernel.shared, float32, [1536], [], scope="shared")[threadIdx.x_2] = kernel[((((blockIdx.x*36864) + cse_var_1) + (threadIdx.x_2*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 56)] = kernel[(((((blockIdx.x*36864) + cse_var_1) + (floordiv((threadIdx.x_2 + 56), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 112)] = kernel[(((((blockIdx.x*36864) + cse_var_1) + (floordiv((threadIdx.x_2 + 112), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 168)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 168), 192)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 56), 64)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 224)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 224), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 280)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 280), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 88), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 336)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 336), 192)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 48), 64)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 392)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 392), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 8), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 448)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 448), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 504)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 504), 192)*4608)) + cse_var_1) + ((floordiv(threadIdx.x_2, 3) + 40)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 560)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 560), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 176), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 616)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 616), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 40), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 672)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 672), 192)*4608)) + cse_var_1) + ((floordiv(threadIdx.x_2, 3) + 32)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 728)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 728), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 152), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 784)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 784), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 840)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 840), 192)*4608)) + cse_var_1) + ((floordiv(threadIdx.x_2, 3) + 24)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 896)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 896), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 128), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 952)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 952), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 184), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 1008)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 1008), 192)*4608)) + cse_var_1) + ((floordiv(threadIdx.x_2, 3) + 16)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 1064)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 1064), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 104), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 1120)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 1120), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 160), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 1176)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 1176), 192)*4608)) + cse_var_1) + ((floordiv(threadIdx.x_2, 3) + 8)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 1232)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 1232), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 1288)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 1288), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 136), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel[(((((blockIdx.x*36864) + cse_var_1) + (threadIdx.x_2*3)) + rx.outer.outer) + 32256)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 1400)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 1400), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 56), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 1456)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 1456), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 112), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ if @tir.likely((threadIdx.x_2 < 24), dtype=bool) {
+ kernel.shared_1[(threadIdx.x_2 + 1512)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 1512), 192)*4608)) + cse_var_1) + ((floordiv(threadIdx.x_2, 3) + 56)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ }
+ for (rc.outer.inner: int32, 0, 16) {
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((rc.outer.inner*252) + floormod(threadIdx.x, 7))]*kernel.shared_1[((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12))]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 7)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12))]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 14)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12))]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 21)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12))]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12))]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 35)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12))]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 42)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12))]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 3)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 70)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 3)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 77)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 3)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 3)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 3)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 98)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 3)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 3)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 6)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 133)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 6)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 140)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 6)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 147)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 6)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 154)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 6)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 161)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 6)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 168)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 6)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 189)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 9)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 196)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 9)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 203)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 9)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 210)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 9)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 217)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 9)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 224)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 9)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 231)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 9)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 7)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 1)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 1)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 1)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 1)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 35)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 1)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 42)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 1)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 49)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 1)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 70)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 4)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 77)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 4)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 4)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 4)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 98)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 4)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 4)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 112)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 4)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 133)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 7)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 140)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 7)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 147)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 7)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 154)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 7)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 161)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 7)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 168)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 7)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 175)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 7)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 196)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 10)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 203)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 10)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 210)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 10)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 217)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 10)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 224)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 10)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 231)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 10)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 238)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 10)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 2)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 2)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 2)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 35)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 2)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 42)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 2)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 49)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 2)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 56)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 2)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 77)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 5)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 5)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 5)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 98)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 5)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 5)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 112)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 5)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 119)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 5)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 140)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 8)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 147)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 8)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 154)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 8)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 161)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 8)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 168)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 8)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 175)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 8)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 182)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 8)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 203)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 11)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 210)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 11)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 217)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 11)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 224)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 11)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 231)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 11)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 238)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 11)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 245)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 11)]))
}
- attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1: Buffer(kernel.shared, float32, [3072], [], scope="shared")[threadIdx.x_2] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 64)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 64), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 128)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 128), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 192)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 36864)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 256)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 256), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 320)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 320), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 384)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 73728)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 448)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 448), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 512)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 512), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 576)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 110592)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 640)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 640), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 704)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 704), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 768)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 147456)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 832)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 832), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 896)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 896), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 960)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 184320)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1024)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1024), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1088)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1088), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1152)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 221184)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1216)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1216), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1280)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1280), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 258048)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1408)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1408), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1472)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1472), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1536)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 294912)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1600)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1600), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1664)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1664), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1728)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 331776)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1792)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1792), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1856)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1856), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1920)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 368640)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1984)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1984), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2048)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2048), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2112)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 405504)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2176)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2176), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2240)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2240), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2304)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 442368)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2368)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2368), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2432)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2432), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2496)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 479232)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2560)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2560), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2624)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2624), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2688)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 516096)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2752)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2752), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2816)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2816), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2880)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 552960)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2944)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2944), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 3008)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 3008), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[0]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[1]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[2]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[3]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[4]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[5]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[6]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[0]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 47)]))
}
}
}
- for (i1.inner: int32, 0, 2) {
- for (i3.inner: int32, 0, 7) {
- compute[(((((floordiv(blockIdx.x, 7)*6272) + (threadIdx.x*98)) + (i1.inner*49)) + (floormod(blockIdx.x, 7)*7)) + i3.inner)] = max((conv2d_nchw_1[((i1.inner*7) + i3.inner)] + bias[(((floordiv(blockIdx.x, 7)*128) + (threadIdx.x*2)) + i1.inner)]), 0f32)
- }
+ for (i2.inner: int32, 0, 7) {
+ compute[((((blockIdx.x*392) + (floordiv(threadIdx.x, 7)*49)) + (i2.inner*7)) + floormod(threadIdx.x, 7))] = max((conv2d_nchw_1[i2.inner] + bias[((blockIdx.x*8) + floordiv(threadIdx.x, 7))]), 0f32)
}
}
}
@@ -771,7 +604,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 0.359 ms
+ Execution time of this operator: 0.367 ms
@@ -820,35 +653,35 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1)
conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
- conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2)
- conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=64)
+ conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
+ conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=8)
conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
- conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
+ conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=7)
conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
- conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
- conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
+ conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
+ conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
- conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
- conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=4)
+ conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=4)
+ conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=16)
conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
- conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
+ conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=3)
conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
- conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=3)
+ conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=1)
s[conv2d_nchw].reorder(conv2d_nchw_nn_o_o_o_o, conv2d_nchw_ff_o_o_o_o, conv2d_nchw_yy_o_o_o_o, conv2d_nchw_xx_o_o_o_o, conv2d_nchw_nn_o_o_o_i, conv2d_nchw_ff_o_o_o_i, conv2d_nchw_yy_o_o_o_i, conv2d_nchw_xx_o_o_o_i, conv2d_nchw_nn_o_o_i, conv2d_nchw_ff_o_o_i, conv2d_nchw_yy_o_o_i, conv2d_nchw_xx_o_o_i, conv2d_nchw_rc_o_o, conv2d_nchw_ry_o_o, conv2d_nchw_rx_o_o, conv2d_nchw_rc_o_i, conv2d_nchw_ry_o_i, conv2d_nchw_rx_o_i, conv2d_nchw_nn_o_i, conv2d_nchw_ff_o_i, conv2d_nchw_yy_o_i, conv2 [...]
compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
- compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
- compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=64)
+ compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=1)
+ compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=8)
compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
- compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
+ compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=7)
compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
- compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
- compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
+ compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
+ compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -868,12 +701,12 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
- kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+ kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=56)
s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
- pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
+ pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=2)
s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
- pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+ pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=56)
s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 512)
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
@@ -893,10 +726,10 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
#define int64_t long long
#define uint64_t unsigned long long
#endif
- extern "C" __global__ void __launch_bounds__(64) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
- float conv2d_nchw[14];
- __shared__ float pad_temp_shared[72];
- __shared__ float kernel_shared[3072];
+ extern "C" __global__ void __launch_bounds__(56) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+ float conv2d_nchw[7];
+ __shared__ float pad_temp_shared[4032];
+ __shared__ float kernel_shared[1536];
conv2d_nchw[0] = 0.000000e+00f;
conv2d_nchw[1] = 0.000000e+00f;
conv2d_nchw[2] = 0.000000e+00f;
@@ -904,419 +737,202 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
conv2d_nchw[4] = 0.000000e+00f;
conv2d_nchw[5] = 0.000000e+00f;
conv2d_nchw[6] = 0.000000e+00f;
- conv2d_nchw[7] = 0.000000e+00f;
- conv2d_nchw[8] = 0.000000e+00f;
- conv2d_nchw[9] = 0.000000e+00f;
- conv2d_nchw[10] = 0.000000e+00f;
- conv2d_nchw[11] = 0.000000e+00f;
- conv2d_nchw[12] = 0.000000e+00f;
- conv2d_nchw[13] = 0.000000e+00f;
- for (int rc_outer_outer = 0; rc_outer_outer < 64; ++rc_outer_outer) {
- for (int ry_outer_outer = 0; ry_outer_outer < 3; ++ry_outer_outer) {
+ for (int rc_outer_outer = 0; rc_outer_outer < 8; ++rc_outer_outer) {
+ for (int rx_outer_outer = 0; rx_outer_outer < 3; ++rx_outer_outer) {
__syncthreads();
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[(((int)threadIdx.x) * 4)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) * 4) % 9))) && (((((int)threadIdx.x) * 4) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) * 4) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) * 4) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) * 2)] = (((((7 <= ((((int)threadIdx.x) * 2) % 63)) && (((((int)threadIdx.x) * 2) % 63) < 56)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + (((((int)threadIdx.x) * 2) / 63) * 49)) + rx_outer_outer) + ((((int)threadIdx.x) * 2) % 63)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[((((int)threadIdx.x) * 2) + 1)] = (((((7 <= (((((int)threadIdx.x) * 2) + 1) % 63)) && ((((((int)threadIdx.x) * 2) + 1) % 63) < 56)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 1) / 63) * 49)) + rx_outer_outer) + (((((int)threadIdx.x) * 2) + 1) % 63)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 112) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 7) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 7) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 112) / 63) * 49)) + (((((((int)t [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 113) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 224) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 5) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 5) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 224) / 63) * 49)) + (((((((int)t [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 225) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 336) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 3) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 3) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 336) / 63) * 49)) + (((((((int)t [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 337) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 448) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 1) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 1) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 448) / 63) * 49)) + (((((((int)t [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 449) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 560) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 8) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 8) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 560) / 63) * 49)) + (((((((int)t [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 561) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 672) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 6) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 6) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 672) / 63) * 49)) + (((((((int)t [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 673) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 784) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 4) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 4) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 784) / 63) * 49)) + (((((((int)t [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 785) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 896) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 2) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 2) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 896) / 63) * 49)) + (((((((int)t [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 897) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2 [...]
+ pad_temp_shared[((((int)threadIdx.x) * 2) + 1008)] = (((((7 <= ((((int)threadIdx.x) * 2) % 63)) && (((((int)threadIdx.x) * 2) % 63) < 56)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + (((((int)threadIdx.x) * 2) / 63) * 49)) + rx_outer_outer) + ((((int)threadIdx.x) * 2) % 63)) + 776)] : 0.000000e+00f);
+ pad_temp_shared[((((int)threadIdx.x) * 2) + 1009)] = (((((7 <= (((((int)threadIdx.x) * 2) + 1) % 63)) && ((((((int)threadIdx.x) * 2) + 1) % 63) < 56)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 1) / 63) * 49)) + rx_outer_outer) + (((((int)threadIdx.x) * 2) + 1) % 63)) + 776)] : 0.000000e+00f);
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1120) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 7) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 7) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 1120) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1121) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1232) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 5) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 5) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 1232) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1233) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1344) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 3) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 3) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 1344) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1345) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1456) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 1) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 1) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 1456) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1457) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1568) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 8) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 8) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 1568) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1569) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1680) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 6) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 6) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 1680) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1681) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1792) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 4) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 4) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 1792) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1793) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1904) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 2) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 2) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 1904) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1905) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((int)threadIdx.x) * 2) + 2016)] = (((((7 <= ((((int)threadIdx.x) * 2) % 63)) && (((((int)threadIdx.x) * 2) % 63) < 56)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + (((((int)threadIdx.x) * 2) / 63) * 49)) + rx_outer_outer) + ((((int)threadIdx.x) * 2) % 63)) + 1560)] : 0.000000e+00f);
+ pad_temp_shared[((((int)threadIdx.x) * 2) + 2017)] = (((((7 <= (((((int)threadIdx.x) * 2) + 1) % 63)) && ((((((int)threadIdx.x) * 2) + 1) % 63) < 56)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 1) / 63) * 49)) + rx_outer_outer) + (((((int)threadIdx.x) * 2) + 1) % 63)) + 1560)] : 0.000000e+00f);
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2128) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 7) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 7) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 2128) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2129) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2240) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 5) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 5) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 2240) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2241) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2352) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 3) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 3) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 2352) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2353) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2464) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 1) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 1) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 2464) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2465) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2576) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 8) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 8) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 2576) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2577) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2688) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 6) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 6) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 2688) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2689) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2800) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 4) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 4) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 2800) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2801) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2912) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 2) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 2) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 2912) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2913) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((int)threadIdx.x) * 2) + 3024)] = (((((7 <= ((((int)threadIdx.x) * 2) % 63)) && (((((int)threadIdx.x) * 2) % 63) < 56)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + (((((int)threadIdx.x) * 2) / 63) * 49)) + rx_outer_outer) + ((((int)threadIdx.x) * 2) % 63)) + 2344)] : 0.000000e+00f);
+ pad_temp_shared[((((int)threadIdx.x) * 2) + 3025)] = (((((7 <= (((((int)threadIdx.x) * 2) + 1) % 63)) && ((((((int)threadIdx.x) * 2) + 1) % 63) < 56)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 1) / 63) * 49)) + rx_outer_outer) + (((((int)threadIdx.x) * 2) + 1) % 63)) + 2344)] : 0.000000e+00f);
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3136) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 7) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 7) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 3136) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3137) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3248) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 5) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 5) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 3248) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3249) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3360) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 3) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 3) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 3360) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3361) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3472) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 1) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 1) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 3472) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3473) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3584) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 8) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 8) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 3584) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3585) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3696) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 6) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 6) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 3696) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3697) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3808) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 4) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 4) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 3808) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3809) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3920) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 2) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 2) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 3920) / 63) * 49)) + (((((((int [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3921) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * [...]
+ kernel_shared[((int)threadIdx.x)] = kernel[((((((int)blockIdx.x) * 36864) + (rc_outer_outer * 576)) + (((int)threadIdx.x) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 56)] = kernel[(((((((int)blockIdx.x) * 36864) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 56) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 112)] = kernel[(((((((int)blockIdx.x) * 36864) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 112) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 168)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 168) / 192) * 4608)) + (rc_outer_outer * 576)) + ((((((int)threadIdx.x) / 3) + 56) & 63) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 224)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 224) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 32) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 280)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 280) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 88) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 336)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 336) / 192) * 4608)) + (rc_outer_outer * 576)) + ((((((int)threadIdx.x) / 3) + 48) & 63) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 392)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 392) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 8) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 448)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 448) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 64) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 504)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 504) / 192) * 4608)) + (rc_outer_outer * 576)) + (((int)threadIdx.x) * 3)) + rx_outer_outer) + 360)];
+ kernel_shared[(((int)threadIdx.x) + 560)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 560) / 192) * 4608)) + (rc_outer_outer * 576)) + ((((((int)threadIdx.x) + 176) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 616)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 616) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 40) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 672)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 672) / 192) * 4608)) + (rc_outer_outer * 576)) + (((int)threadIdx.x) * 3)) + rx_outer_outer) + 288)];
+ kernel_shared[(((int)threadIdx.x) + 728)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 728) / 192) * 4608)) + (rc_outer_outer * 576)) + ((((((int)threadIdx.x) + 152) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 784)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 784) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 16) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 840)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 840) / 192) * 4608)) + (rc_outer_outer * 576)) + (((int)threadIdx.x) * 3)) + rx_outer_outer) + 216)];
+ kernel_shared[(((int)threadIdx.x) + 896)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 896) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 128) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 952)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 952) / 192) * 4608)) + (rc_outer_outer * 576)) + ((((((int)threadIdx.x) + 184) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 1008)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 1008) / 192) * 4608)) + (rc_outer_outer * 576)) + (((int)threadIdx.x) * 3)) + rx_outer_outer) + 144)];
+ kernel_shared[(((int)threadIdx.x) + 1064)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 1064) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 104) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 1120)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 1120) / 192) * 4608)) + (rc_outer_outer * 576)) + ((((((int)threadIdx.x) + 160) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 1176)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 1176) / 192) * 4608)) + (rc_outer_outer * 576)) + (((int)threadIdx.x) * 3)) + rx_outer_outer) + 72)];
+ kernel_shared[(((int)threadIdx.x) + 1232)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 1232) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 80) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 1288)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 1288) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 136) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[(((((((int)blockIdx.x) * 36864) + (rc_outer_outer * 576)) + (((int)threadIdx.x) * 3)) + rx_outer_outer) + 32256)];
+ kernel_shared[(((int)threadIdx.x) + 1400)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 1400) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 56) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 1456)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 1456) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 112) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ if (((int)threadIdx.x) < 24) {
+ kernel_shared[(((int)threadIdx.x) + 1512)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 1512) / 192) * 4608)) + (rc_outer_outer * 576)) + (((int)threadIdx.x) * 3)) + rx_outer_outer) + 504)];
}
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 1)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 1) % 9))) && ((((((int)threadIdx.x) * 4) + 1) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 1) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 1) % 9)) - 8)] : 0.000000e+00f);
- }
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 2)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 2) % 9))) && ((((((int)threadIdx.x) * 4) + 2) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 2) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 2) % 9)) - 8)] : 0.000000e+00f);
- }
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 3)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 3) % 9))) && ((((((int)threadIdx.x) * 4) + 3) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 3) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 3) % 9)) - 8)] : 0.000000e+00f);
- }
- kernel_shared[((int)threadIdx.x)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
- kernel_shared[(((int)threadIdx.x) + 64)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 64) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 128)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 128) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 192)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 36864)];
- kernel_shared[(((int)threadIdx.x) + 256)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 256) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 320)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 320) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 384)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 73728)];
- kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 448) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 512)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 512) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 576)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 110592)];
- kernel_shared[(((int)threadIdx.x) + 640)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 640) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 704)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 704) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 768)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 147456)];
- kernel_shared[(((int)threadIdx.x) + 832)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 832) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 896) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 960)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 184320)];
- kernel_shared[(((int)threadIdx.x) + 1024)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1024) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1088)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1088) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1152)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 221184)];
- kernel_shared[(((int)threadIdx.x) + 1216)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1216) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1280)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1280) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 258048)];
- kernel_shared[(((int)threadIdx.x) + 1408)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1408) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1472)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1472) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1536)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 294912)];
- kernel_shared[(((int)threadIdx.x) + 1600)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1600) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1664)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1664) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1728)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 331776)];
- kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1792) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1856)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1856) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1920)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 368640)];
- kernel_shared[(((int)threadIdx.x) + 1984)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1984) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2048)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2048) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2112)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 405504)];
- kernel_shared[(((int)threadIdx.x) + 2176)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2176) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2240) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2304)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 442368)];
- kernel_shared[(((int)threadIdx.x) + 2368)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2368) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2432)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2432) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2496)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 479232)];
- kernel_shared[(((int)threadIdx.x) + 2560)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2560) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2624)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2624) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 516096)];
- kernel_shared[(((int)threadIdx.x) + 2752)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2752) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2816)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2816) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2880)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 552960)];
- kernel_shared[(((int)threadIdx.x) + 2944)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2944) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 3008)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3008) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
__syncthreads();
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[0] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[1] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[2] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[3] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[4] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[5] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[6] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[0] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ for (int rc_outer_inner = 0; rc_outer_inner < 16; ++rc_outer_inner) {
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 252) + (((int)threadIdx.x) % 7))] * kernel_shared[(((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12))]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 7)] * kernel_shared[(((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12))]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 14)] * kernel_shared[(((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12))]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 21)] * kernel_shared[(((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12))]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[(((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12))]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 35)] * kernel_shared[(((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12))]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 42)] * kernel_shared[(((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12))]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 70)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 77)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 98)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 133)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 140)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 147)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 154)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 161)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 168)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 189)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 196)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 203)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 210)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 217)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 224)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 231)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 7)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 35)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 42)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 49)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 70)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 77)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 98)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 112)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 133)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 140)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 147)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 154)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 161)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 168)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 175)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 196)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 203)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 210)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 217)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 224)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 231)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 238)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 35)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 42)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 49)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 56)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 77)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 98)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 112)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 119)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 140)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 147)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 154)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 161)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 168)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 175)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 182)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 203)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 210)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 217)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 224)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 231)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 238)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 245)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 11)]));
+ }
}
}
- for (int i1_inner = 0; i1_inner < 2; ++i1_inner) {
- for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
- compute[((((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 98)) + (i1_inner * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[((i1_inner * 7) + i3_inner)] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) * 2)) + i1_inner)]), 0.000000e+00f);
- }
+ for (int i2_inner = 0; i2_inner < 7; ++i2_inner) {
+ compute[((((((int)blockIdx.x) * 392) + ((((int)threadIdx.x) / 7) * 49)) + (i2_inner * 7)) + (((int)threadIdx.x) % 7))] = max((conv2d_nchw[i2_inner] + bias[((((int)blockIdx.x) * 8) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
}
}
@@ -1378,7 +994,7 @@ In the example below we resume the status and do more 5 trials.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 3 minutes 20.140 seconds)
+ **Total running time of the script:** ( 3 minutes 21.096 seconds)
.. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
index 77267b2f81..b5db98acf5 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
@@ -643,7 +643,7 @@ so we can read the log file and load the best schedules.
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 8.1900 8.1873 8.2025 8.1802 0.0093
+ 8.1540 8.1555 8.1560 8.1506 0.0024
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
index b8c2145616..e5ee88c0c1 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
@@ -662,7 +662,7 @@ so we can read the log file and load the best schedules.
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 752.8236 752.8325 753.0603 752.5780 0.1970
+ 755.7181 756.6768 756.7382 753.7392 1.3995
@@ -690,7 +690,7 @@ Other Tips
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 21.526 seconds)
+ **Total running time of the script:** ( 1 minutes 22.136 seconds)
.. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_x86.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
index faf7cd5d47..2b57bc781d 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
@@ -397,105 +397,78 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [65536], []),
compute: Buffer(compute_2: Pointer(float32), float32, [65536], [])}
buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute}
- preflattened_buffer_map = {placeholder_5: placeholder_15: Buffer(placeholder_10, float32, [128, 256], []), placeholder_8: placeholder_16: Buffer(placeholder_13, int32, [33], []), placeholder_9: placeholder_17: Buffer(placeholder_14, float32, [128, 512], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_6: placeholder_18: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_7: placeholder_19: Buffer(placeholder_12, int32, [4916], [])} {
- for (i0.outer.i1.outer.fused: int32, 0, 32) "parallel" {
- allocate(compute_4: Pointer(global float32), float32, [2048]), storage_scope = global {
- for (i.outer.inner: int32, 0, 16) {
- for (i.inner.init: int32, 0, 8) {
- let cse_var_1: int32 = ((i.outer.inner*128) + (i.inner.init*16))
- {
- compute_5: Buffer(compute_4, float32, [2048], [])[cse_var_1] = 0f32
- compute_5[(cse_var_1 + 1)] = 0f32
- compute_5[(cse_var_1 + 2)] = 0f32
- compute_5[(cse_var_1 + 3)] = 0f32
- compute_5[(cse_var_1 + 4)] = 0f32
- compute_5[(cse_var_1 + 5)] = 0f32
- compute_5[(cse_var_1 + 6)] = 0f32
- compute_5[(cse_var_1 + 7)] = 0f32
- compute_5[(cse_var_1 + 8)] = 0f32
- compute_5[(cse_var_1 + 9)] = 0f32
- compute_5[(cse_var_1 + 10)] = 0f32
- compute_5[(cse_var_1 + 11)] = 0f32
- compute_5[(cse_var_1 + 12)] = 0f32
- compute_5[(cse_var_1 + 13)] = 0f32
- compute_5[(cse_var_1 + 14)] = 0f32
- compute_5[(cse_var_1 + 15)] = 0f32
- }
- }
- for (elem_idx: int32, 0, (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])) {
- for (i.inner: int32, 0, 8) {
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_2: int32 = ((i.outer.inner*128) + (i.inner*16))
- compute_5[cse_var_2] = (compute_5[cse_var_2] + (placeholder_1[((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16))]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_3: int32 = (((i.outer.inner*128) + (i.inner*16)) + 1)
- compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 1)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_4: int32 = (((i.outer.inner*128) + (i.inner*16)) + 2)
- compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 2)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_5: int32 = (((i.outer.inner*128) + (i.inner*16)) + 3)
- compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 3)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_6: int32 = (((i.outer.inner*128) + (i.inner*16)) + 4)
- compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 4)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_7: int32 = (((i.outer.inner*128) + (i.inner*16)) + 5)
- compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 5)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_8: int32 = (((i.outer.inner*128) + (i.inner*16)) + 6)
- compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 6)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_9: int32 = (((i.outer.inner*128) + (i.inner*16)) + 7)
- compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 7)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ preflattened_buffer_map = {placeholder_6: placeholder_15: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_7: placeholder_16: Buffer(placeholder_12, int32, [4916], []), placeholder_8: placeholder_17: Buffer(placeholder_13, int32, [33], []), placeholder_5: placeholder_18: Buffer(placeholder_10, float32, [128, 256], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_9: placeholder_19: Buffer(placeholder_14, float32, [128, 512], [])} {
+ for (i0.outer.i1.outer.fused: int32, 0, 16) "parallel" {
+ allocate(compute_4: Pointer(global float32), float32, [4096]), storage_scope = global {
+ for (i.outer.inner: int32, 0, 8) {
+ for (nb_j.inner: int32, 0, 2) {
+ for (i.inner.init: int32, 0, 16) {
+ let cse_var_1: int32 = (((i.outer.inner*512) + (i.inner.init*32)) + (nb_j.inner*16))
+ {
+ compute_5: Buffer(compute_4, float32, [4096], [])[cse_var_1] = 0f32
+ compute_5[(cse_var_1 + 1)] = 0f32
+ compute_5[(cse_var_1 + 2)] = 0f32
+ compute_5[(cse_var_1 + 3)] = 0f32
+ compute_5[(cse_var_1 + 4)] = 0f32
+ compute_5[(cse_var_1 + 5)] = 0f32
+ compute_5[(cse_var_1 + 6)] = 0f32
+ compute_5[(cse_var_1 + 7)] = 0f32
+ compute_5[(cse_var_1 + 8)] = 0f32
+ compute_5[(cse_var_1 + 9)] = 0f32
+ compute_5[(cse_var_1 + 10)] = 0f32
+ compute_5[(cse_var_1 + 11)] = 0f32
+ compute_5[(cse_var_1 + 12)] = 0f32
+ compute_5[(cse_var_1 + 13)] = 0f32
+ compute_5[(cse_var_1 + 14)] = 0f32
+ compute_5[(cse_var_1 + 15)] = 0f32
}
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_10: int32 = (((i.outer.inner*128) + (i.inner*16)) + 8)
- compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 8)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_11: int32 = (((i.outer.inner*128) + (i.inner*16)) + 9)
- compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 9)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_12: int32 = (((i.outer.inner*128) + (i.inner*16)) + 10)
- compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 10)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_13: int32 = (((i.outer.inner*128) + (i.inner*16)) + 11)
- compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 11)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_14: int32 = (((i.outer.inner*128) + (i.inner*16)) + 12)
- compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 12)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_15: int32 = (((i.outer.inner*128) + (i.inner*16)) + 13)
- compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 13)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_16: int32 = (((i.outer.inner*128) + (i.inner*16)) + 14)
- compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 14)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_17: int32 = (((i.outer.inner*128) + (i.inner*16)) + 15)
- compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 15)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ for (elem_idx: int32, 0, let cse_var_2: int32 = ((i0.outer.i1.outer.fused*2) + nb_j.inner) in (placeholder_3[(cse_var_2 + 1)] - placeholder_3[cse_var_2])) {
+ for (i.inner: int32, 0, 16) {
+ let cse_var_21: int32 = (elem_idx*16)
+ let cse_var_20: int32 = ((i0.outer.i1.outer.fused*2) + nb_j.inner)
+ let cse_var_19: int32 = ((i.outer.inner*4096) + (i.inner*256))
+ let cse_var_18: int32 = (((i.outer.inner*512) + (i.inner*32)) + (nb_j.inner*16))
+ let cse_var_17: int32 = (cse_var_18 + 9)
+ let cse_var_16: int32 = (cse_var_18 + 8)
+ let cse_var_15: int32 = (cse_var_18 + 7)
+ let cse_var_14: int32 = (cse_var_18 + 6)
+ let cse_var_13: int32 = (cse_var_18 + 5)
+ let cse_var_12: int32 = (cse_var_18 + 4)
+ let cse_var_11: int32 = (cse_var_18 + 3)
+ let cse_var_10: int32 = (cse_var_18 + 2)
+ let cse_var_9: int32 = (cse_var_18 + 15)
+ let cse_var_8: int32 = (cse_var_18 + 14)
+ let cse_var_7: int32 = (cse_var_18 + 13)
+ let cse_var_6: int32 = (cse_var_18 + 12)
+ let cse_var_5: int32 = (cse_var_18 + 11)
+ let cse_var_4: int32 = (cse_var_18 + 10)
+ let cse_var_3: int32 = (cse_var_18 + 1)
+ {
+ compute_5[cse_var_18] = (compute_5[cse_var_18] + (placeholder_1[((placeholder_3[cse_var_20]*16) + cse_var_21)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 1)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 2)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 3)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 4)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 5)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 6)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 7)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 8)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 9)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 10)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 11)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 12)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 13)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 14)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 15)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ }
}
}
}
}
for (i0.inner: int32, 0, 128) {
- for (i1.inner: int32, 0, 16) {
- let cse_var_18: int32 = (((i0.inner*512) + (i0.outer.i1.outer.fused*16)) + i1.inner)
- compute[cse_var_18] = max((compute_5[((i0.inner*16) + i1.inner)] + placeholder_4[cse_var_18]), 0f32)
- }
+ let cse_var_22: int32 = ((i0.inner*512) + (i0.outer.i1.outer.fused*32))
+ compute[ramp(cse_var_22, 1, 32)] = max((compute_5[ramp((i0.inner*32), 1, 32)] + placeholder_4[ramp(cse_var_22, 1, 32)]), broadcast(0f32, 32))
}
}
}
@@ -551,7 +524,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 1.823 ms
+ Execution time of this operator: 1.722 ms
diff --git a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
index e6af9cb3aa..d222112b4f 100644
--- a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
@@ -5,16 +5,16 @@
Computation times
=================
-**00:32.020** total execution time for **how_to_tune_with_autotvm** files:
+**00:29.562** total execution time for **how_to_tune_with_autotvm** files:
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``) | 00:31.984 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``) | 00:29.526 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``) | 00:00.021 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``) | 00:00.020 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``) | 00:00.005 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_mobile_gpu.py` (``tune_relay_mobile_gpu.py``) | 00:00.005 | 0.0 MB |
-+--------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_arm.py` (``tune_relay_arm.py``) | 00:00.005 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_mobile_gpu.py` (``tune_relay_mobile_gpu.py``) | 00:00.005 | 0.0 MB |
++--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
index 8f0bfc5b88..6bdbabbb43 100644
--- a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
@@ -277,7 +277,9 @@ for this template
waiting for device...
device available
Get devices for measurement successfully!
- No: 1 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ No: 1 GFLOPS: 24.13/24.13 result: MeasureResult(costs=(0.009593230454545455,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.932642698287964, timestamp=1663882327.1766) [('tile_f', [-1, 4, 4, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8593881
+ No: 2 GFLOPS: 83.93/83.93 result: MeasureResult(costs=(0.0027582268448275863,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.507809638977051, timestamp=1663882328.0813944) [('tile_f', [-1, 16, 16, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,8717369
+ No: 3 GFLOPS: 0.00/83.93 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -399,9 +401,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 64, 1, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 256]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9095346
- No: 2 GFLOPS: 3.44/3.44 result: MeasureResult(costs=(0.06739114375,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.442340850830078, timestamp=1663875119.019349) [('tile_f', [-1, 4, 1, 8]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 16, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,274698
- No: 3 GFLOPS: 0.00/3.44 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 4, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 256, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10132656
+ No: 4 GFLOPS: 0.00/83.93 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -523,8 +524,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 2, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 512, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7002488
- No: 4 GFLOPS: 0.00/3.44 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 16, 16]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 128, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6414062
+ No: 5 GFLOPS: 0.00/83.93 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -646,8 +647,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 8, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 512, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3712624
- No: 5 GFLOPS: 0.00/3.44 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 2, 64]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 2, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10384866
+ No: 6 GFLOPS: 0.00/83.93 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -769,8 +770,9 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 16, 2]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 128]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5595345
- No: 6 GFLOPS: 0.00/3.44 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 8, 8]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8280956
+ No: 7 GFLOPS: 91.25/91.25 result: MeasureResult(costs=(0.002536887925,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.1385846138000488, timestamp=1663882331.5116751) [('tile_f', [-1, 2, 64, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6074686
+ No: 8 GFLOPS: 0.00/91.25 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -892,9 +894,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 64, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 64]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9838447
- No: 7 GFLOPS: 3.57/3.57 result: MeasureResult(costs=(0.06481407925,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.5743279457092285, timestamp=1663875125.0370524) [('tile_f', [-1, 1, 32, 16]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4686404
- No: 8 GFLOPS: 0.00/3.57 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 1, 64]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 128]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8309381
+ No: 9 GFLOPS: 0.00/91.25 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1016,9 +1017,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 256, 2, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8422278
- No: 9 GFLOPS: 59.00/59.00 result: MeasureResult(costs=(0.003923450037037037,), error_no=MeasureErrorNo.NO_ERROR, all_cost=5.324518442153931, timestamp=1663875130.5515735) [('tile_f', [-1, 1, 8, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8238461
- No: 10 GFLOPS: 0.00/59.00 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 4, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 64, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1185556
+ No: 10 GFLOPS: 0.00/91.25 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1140,9 +1140,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 16, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7790458
- No: 11 GFLOPS: 234.33/234.33 result: MeasureResult(costs=(0.0009879114594594594,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.1547462940216064, timestamp=1663875131.228428) [('tile_f', [-1, 2, 32, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 64]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5386081
- No: 12 GFLOPS: 0.00/234.33 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 8, 16]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4361239
+ No: 11 GFLOPS: 0.00/91.25 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1264,8 +1263,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 1, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 128, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9512987
- No: 13 GFLOPS: 0.00/234.33 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 2, 16]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9862111
+ No: 12 GFLOPS: 0.00/91.25 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1387,8 +1386,10 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 32, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 4]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7048272
- No: 14 GFLOPS: 0.00/234.33 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 16, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,875069
+ No: 13 GFLOPS: 34.82/91.25 result: MeasureResult(costs=(0.00664771,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.352648973464966, timestamp=1663882335.246479) [('tile_f', [-1, 1, 8, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,1862937
+ No: 14 GFLOPS: 0.98/91.25 result: MeasureResult(costs=(0.23562669349999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.185566663742065, timestamp=1663882338.6236575) [('tile_f', [-1, 256, 2, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,260498
+ No: 15 GFLOPS: 0.00/91.25 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1510,10 +1511,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 4, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 64, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7413900
- No: 15 GFLOPS: 817.00/817.00 result: MeasureResult(costs=(0.00028335495759717314,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.201112985610962, timestamp=1663875132.6936429) [('tile_f', [-1, 1, 8, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1557847
- No: 16 GFLOPS: 823.54/823.54 result: MeasureResult(costs=(0.0002811060505226481,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.271367073059082, timestamp=1663875133.633881) [('tile_f', [-1, 1, 32, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6791440
- No: 17 GFLOPS: 0.00/823.54 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 1, 16]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5792986
+ No: 16 GFLOPS: 0.00/91.25 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1635,9 +1634,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 2, 16]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1699894
- No: 18 GFLOPS: 305.95/823.54 result: MeasureResult(costs=(0.0007566715524475524,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.3589026927947998, timestamp=1663875135.184876) [('tile_f', [-1, 1, 16, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9308725
- No: 19 GFLOPS: 0.00/823.54 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 32, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 32, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,474802
+ No: 17 GFLOPS: 0.00/91.25 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1759,8 +1757,9 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 1, 128]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10278391
- No: 20 GFLOPS: 0.00/823.54 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 32, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 2, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7673692
+ No: 18 GFLOPS: 1204.15/1204.15 result: MeasureResult(costs=(0.0001922534928741093,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.188037395477295, timestamp=1663882340.0105078) [('tile_f', [-1, 1, 8, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,1947959
+ No: 19 GFLOPS: 0.00/1204.15 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1882,7 +1881,130 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 8, 16]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2182359
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 4, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,130215
+ No: 20 GFLOPS: 0.00/1204.15 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+ func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ func = build(s, args, target_host=task.target_host, runtime=runtime)
+ File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+ input_mod = lower(inputs, args, name=name, binds=binds)
+ File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+ return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+ File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+ tvm._ffi.base.TVMError: Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1731
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1671
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1646
+ 13: operator()
+ at ../src/driver/driver_api.cc:379
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:365
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:260
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:453
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1750
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1694
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1618
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+ Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1731
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1671
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1646
+ 13: operator()
+ at ../src/driver/driver_api.cc:379
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:365
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:260
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:453
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1750
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1694
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1618
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 1, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5929408
@@ -1937,9 +2059,9 @@ and measure running time.
Finish loading 20 records
Best config:
- [('tile_f', [-1, 1, 32, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6791440
+ [('tile_f', [-1, 1, 8, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,1947959
Finish loading 20 records
- Time cost of this operator: 0.000695
+ Time cost of this operator: 0.000484
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
index d88207aeef..db1a9c7fbf 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
@@ -327,10 +327,10 @@ Timing the untuned program
########## Build without Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
- tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 311.7 98.72 (1, 2, 10, 10, 3) 2 1 [311.7]
- tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.071 0.973 (1, 6, 10, 10) 1 1 [3.071]
- tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.97 0.307 (1, 1, 10, 10, 3) 1 1 [0.97]
- Total_time - 315.741 - - - - -
+ tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 310.4 98.714 (1, 2, 10, 10, 3) 2 1 [310.4]
+ tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.086 0.981 (1, 6, 10, 10) 1 1 [3.086]
+ tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.958 0.305 (1, 1, 10, 10, 3) 1 1 [0.958]
+ Total_time - 314.444 - - - - -
@@ -394,10 +394,10 @@ Timing the tuned program
########## Build with Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
- tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 136.0 98.039 (1, 6, 10, 10, 1) 2 1 [136.0]
- tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.765 1.273 (1, 6, 10, 10) 1 1 [1.765]
- tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.956 0.689 (1, 1, 10, 10, 3) 1 1 [0.956]
- Total_time - 138.721 - - - - -
+ tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 181.3 98.431 (1, 1, 10, 10, 6) 2 1 [181.3]
+ tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.93 1.048 (1, 6, 10, 10) 1 1 [1.93]
+ tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.96 0.521 (1, 1, 10, 10, 3) 1 1 [0.96]
+ Total_time - 184.189 - - - - -
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
index 99fd15dfbe..e4d9ed3c38 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
@@ -225,7 +225,7 @@ take about **2 minutes** to download the Stanford Cars, while COCO 2017 validati
.. code-block:: none
- '/tmp/tmpnvgydylo/images/random'
+ '/tmp/tmp64nz5xxv/images/random'
@@ -316,7 +316,7 @@ objects to other stuff? We can display some examples from our datasets using ``m
.. image-sg:: /how_to/work_with_microtvm/images/sphx_glr_micro_train_001.png
- :alt: [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]
+ :alt: [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]
:srcset: /how_to/work_with_microtvm/images/sphx_glr_micro_train_001.png
:class: sphx-glr-single-img
@@ -325,8 +325,8 @@ objects to other stuff? We can display some examples from our datasets using ``m
.. code-block:: none
- /tmp/tmpnvgydylo/images/target contains 8144 images
- /tmp/tmpnvgydylo/images/random contains 5000 images
+ /tmp/tmp64nz5xxv/images/target contains 8144 images
+ /tmp/tmp64nz5xxv/images/random contains 5000 images
@@ -501,13 +501,13 @@ the time on our validation set).
.. code-block:: none
Epoch 1/3
- 328/328 - 47s - loss: 0.2145 - accuracy: 0.9249 - val_loss: 0.1352 - val_accuracy: 0.9494 - 47s/epoch - 143ms/step
+ 328/328 - 47s - loss: 0.2080 - accuracy: 0.9267 - val_loss: 0.1079 - val_accuracy: 0.9619 - 47s/epoch - 142ms/step
Epoch 2/3
- 328/328 - 43s - loss: 0.1034 - accuracy: 0.9635 - val_loss: 0.1398 - val_accuracy: 0.9588 - 43s/epoch - 131ms/step
+ 328/328 - 43s - loss: 0.0903 - accuracy: 0.9685 - val_loss: 0.0994 - val_accuracy: 0.9687 - 43s/epoch - 132ms/step
Epoch 3/3
- 328/328 - 43s - loss: 0.0649 - accuracy: 0.9764 - val_loss: 0.1187 - val_accuracy: 0.9600 - 43s/epoch - 131ms/step
+ 328/328 - 43s - loss: 0.0624 - accuracy: 0.9774 - val_loss: 0.0998 - val_accuracy: 0.9619 - 43s/epoch - 131ms/step
- <keras.callbacks.History object at 0x7f1f3eb99810>
+ <keras.callbacks.History object at 0x7f340b3e67d0>
@@ -864,7 +864,7 @@ Arduino tutorial for how to do that `on GitHub <https://github.com/guberti/tvm-a
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 4 minutes 32.249 seconds)
+ **Total running time of the script:** ( 4 minutes 28.626 seconds)
.. _sphx_glr_download_how_to_work_with_microtvm_micro_train.py:
diff --git a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
index 0de23636a0..4762351eb8 100644
--- a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
@@ -5,16 +5,16 @@
Computation times
=================
-**05:25.397** total execution time for **how_to_work_with_microtvm** files:
+**05:20.845** total execution time for **how_to_work_with_microtvm** files:
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``) | 04:32.249 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``) | 04:28.626 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``) | 00:42.333 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``) | 00:41.541 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py` (``micro_aot.py``) | 00:07.457 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py` (``micro_aot.py``) | 00:07.415 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``) | 00:03.357 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``) | 00:03.261 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_microtvm_micro_ethosu.py` (``micro_ethosu.py``) | 00:00.001 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
index 3983cb5dfd..8365a1e2b7 100644
--- a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
Computation times
=================
-**00:40.072** total execution time for **how_to_work_with_relay** files:
+**00:42.533** total execution time for **how_to_work_with_relay** files:
+----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_pipeline_executor.py` (``using_pipeline_executor.py``) | 00:31.961 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_pipeline_executor.py` (``using_pipeline_executor.py``) | 00:31.214 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``) | 00:06.405 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``) | 00:09.844 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``) | 00:01.699 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``) | 00:01.469 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_relay_using_relay_viz.py` (``using_relay_viz.py``) | 00:00.007 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
index 9d2bb7d4e4..0402d76924 100644
--- a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
@@ -261,7 +261,7 @@ The following example customizes CUDA lowering rule for :code:`exp`.
.. code-block:: none
- <function my_cuda_math_rule at 0x7f1ed8bea170>
+ <function my_cuda_math_rule at 0x7f33abc0d950>
diff --git a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
index 440a7153a7..6922a7cfc1 100644
--- a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
@@ -5,20 +5,20 @@
Computation times
=================
-**00:04.868** total execution time for **how_to_work_with_schedules** files:
+**00:07.564** total execution time for **how_to_work_with_schedules** files:
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``) | 00:02.324 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``) | 00:05.338 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``) | 00:01.228 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``) | 00:00.990 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``) | 00:00.576 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``) | 00:00.539 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``) | 00:00.562 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``) | 00:00.518 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``) | 00:00.098 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``) | 00:00.039 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``) | 00:00.040 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``) | 00:00.027 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
index a39c990b59..a55946062d 100644
--- a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
@@ -347,7 +347,7 @@ The importing needs to happen before the tensorized GEMV being executed.
C: Buffer(C_2: Pointer(float32), float32, [524288], [])}
buffer_map = {A_1: A, B_1: B, C_1: C}
preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [1024, 64], []), B_1: B_3: Buffer(B_2, float32, [512, 64], []), C_1: C_3: Buffer(C_2, float32, [1024, 512], [])} {
- attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpo8bhi268/input0.cc'\nsource_filename = \"/tmp/tmpo8bhi268/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = alloca float*, align 8\n %8 = alloca float*, align 8\n %9 = alloca floa [...]
+ attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpnpp5qs6m/input0.cc'\nsource_filename = \"/tmp/tmpnpp5qs6m/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = alloca float*, align 8\n %8 = alloca float*, align 8\n %9 = alloca floa [...]
for (i, 0, 1024) {
for (j.outer: int32, 0, 32) {
@tir.call_extern("gemv_update", @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_2, ((i*512) + (j.outer*16)), 16, 2, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_2, (i*64), 64, 1, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_2, (j.outer*1024), 1024, 1, dtype=handle), 16, 64, 64, dtype=int32)
diff --git a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
index f07549b26f..b3de2275de 100644
--- a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:20.990** total execution time for **topic_vta_tutorials_autotvm** files:
+**00:21.165** total execution time for **topic_vta_tutorials_autotvm** files:
+---------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:20.984 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:21.159 | 0.0 MB |
+---------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``) | 00:00.006 | 0.0 MB |
+---------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
index 58464dd2fa..7aba74dda8 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
@@ -289,7 +289,7 @@ The compilation steps are:
DeprecationWarning,
/workspace/vta/tutorials/frontend/deploy_classification.py:213: DeprecationWarning: legacy graph executor behavior of producing json / lib / params will be removed in the next release. Please see documents of tvm.contrib.graph_executor.GraphModule for the new recommended usage.
relay_prog, target=tvm.target.Target(target, host=env.target_host), params=params
- resnet18_v1 inference graph built in 22.37s!
+ resnet18_v1 inference graph built in 22.56s!
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
index fa661ea885..c4247a6f41 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
@@ -333,7 +333,7 @@ The compilation steps are:
/workspace/python/tvm/relay/build_module.py:348: DeprecationWarning: Please use input parameter mod (tvm.IRModule) instead of deprecated parameter mod (tvm.relay.function.Function)
DeprecationWarning,
- yolov3-tiny inference graph built in 15.88s!
+ yolov3-tiny inference graph built in 16.16s!
diff --git a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
index 035d794408..9e10a88aba 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**01:31.190** total execution time for **topic_vta_tutorials_frontend** files:
+**01:31.415** total execution time for **topic_vta_tutorials_frontend** files:
+------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``) | 00:48.507 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``) | 00:48.656 | 0.0 MB |
+------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:42.683 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:42.759 | 0.0 MB |
+------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
index 61ad71f8a0..a298ce9c9b 100644
--- a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:03.110** total execution time for **topic_vta_tutorials_optimize** files:
+**00:03.047** total execution time for **topic_vta_tutorials_optimize** files:
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``) | 00:02.674 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``) | 00:02.652 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.437 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.395 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
index ee43c51a62..960fcd4ba6 100644
--- a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:00.810** total execution time for **topic_vta_tutorials** files:
+**00:00.742** total execution time for **topic_vta_tutorials** files:
+---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.424 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.401 | 0.0 MB |
+---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.385 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.341 | 0.0 MB |
+---------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
index 06ef72d45d..ef7755cbbf 100644
--- a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
@@ -326,7 +326,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 94.206 ms
+ Execution time of this operator: 93.349 ms
diff --git a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
index fe2b6efe33..8971d37df1 100644
--- a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
@@ -462,16 +462,16 @@ reduce variance, we take 5 measurements and average them.
waiting for device...
device available
Get devices for measurement successfully!
- No: 1 GFLOPS: 2.94/2.94 result: MeasureResult(costs=(0.0911788836,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.6111869812011719, timestamp=1663873943.0273447) [('tile_y', [-1, 4]), ('tile_x', [-1, 8])],None,32
- No: 2 GFLOPS: 2.88/2.94 result: MeasureResult(costs=(0.09307869220000001,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.6428868770599365, timestamp=1663873944.6814759) [('tile_y', [-1, 2]), ('tile_x', [-1, 16])],None,41
- No: 3 GFLOPS: 12.63/12.63 result: MeasureResult(costs=(0.021255255199999996,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.493274450302124, timestamp=1663873945.7211375) [('tile_y', [-1, 32]), ('tile_x', [-1, 128])],None,75
- No: 4 GFLOPS: 1.47/12.63 result: MeasureResult(costs=(0.1824866904,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.0455541610717773, timestamp=1663873949.333493) [('tile_y', [-1, 4]), ('tile_x', [-1, 1])],None,2
- No: 5 GFLOPS: 2.41/12.63 result: MeasureResult(costs=(0.11147621160000001,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.9205944538116455, timestamp=1663873951.4879708) [('tile_y', [-1, 8]), ('tile_x', [-1, 4])],None,23
- No: 6 GFLOPS: 11.60/12.63 result: MeasureResult(costs=(0.02313667,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6075718402862549, timestamp=1663873952.0718408) [('tile_y', [-1, 16]), ('tile_x', [-1, 256])],None,84
- No: 7 GFLOPS: 12.40/12.63 result: MeasureResult(costs=(0.0216459084,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4777839183807373, timestamp=1663873953.1172845) [('tile_y', [-1, 2]), ('tile_x', [-1, 512])],None,91
- No: 8 GFLOPS: 12.81/12.81 result: MeasureResult(costs=(0.0209483958,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5884060859680176, timestamp=1663873953.6207442) [('tile_y', [-1, 64]), ('tile_x', [-1, 128])],None,76
- No: 9 GFLOPS: 1.57/12.81 result: MeasureResult(costs=(0.17083591859999997,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.8375895023345947, timestamp=1663873956.573674) [('tile_y', [-1, 32]), ('tile_x', [-1, 4])],None,25
- No: 10 GFLOPS: 7.43/12.81 result: MeasureResult(costs=(0.03613698580000001,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6751558780670166, timestamp=1663873957.3136365) [('tile_y', [-1, 1]), ('tile_x', [-1, 32])],None,50
+ No: 1 GFLOPS: 1.84/1.84 result: MeasureResult(costs=(0.14583982,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.4632906913757324, timestamp=1663881151.2192934) [('tile_y', [-1, 1]), ('tile_x', [-1, 4])],None,20
+ No: 2 GFLOPS: 9.78/9.78 result: MeasureResult(costs=(0.027460926999999996,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6101272106170654, timestamp=1663881151.8633597) [('tile_y', [-1, 2]), ('tile_x', [-1, 32])],None,51
+ No: 3 GFLOPS: 13.69/13.69 result: MeasureResult(costs=(0.0196095172,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4507744312286377, timestamp=1663881152.8681498) [('tile_y', [-1, 256]), ('tile_x', [-1, 64])],None,68
+ No: 4 GFLOPS: 2.28/13.69 result: MeasureResult(costs=(0.1179887436,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.0154857635498047, timestamp=1663881155.4523356) [('tile_y', [-1, 2]), ('tile_x', [-1, 8])],None,31
+ No: 5 GFLOPS: 0.90/13.69 result: MeasureResult(costs=(0.2992063222,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.922318696975708, timestamp=1663881160.54972) [('tile_y', [-1, 128]), ('tile_x', [-1, 2])],None,17
+ No: 6 GFLOPS: 9.13/13.69 result: MeasureResult(costs=(0.029390212200000005,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6517925262451172, timestamp=1663881161.1827745) [('tile_y', [-1, 16]), ('tile_x', [-1, 32])],None,54
+ No: 7 GFLOPS: 1.66/13.69 result: MeasureResult(costs=(0.161289652,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.7063543796539307, timestamp=1663881164.4588513) [('tile_y', [-1, 512]), ('tile_x', [-1, 4])],None,29
+ No: 8 GFLOPS: 3.70/13.69 result: MeasureResult(costs=(0.0725344232,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.3382151126861572, timestamp=1663881165.8220372) [('tile_y', [-1, 128]), ('tile_x', [-1, 16])],None,47
+ No: 9 GFLOPS: 13.02/13.69 result: MeasureResult(costs=(0.0206113372,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4957246780395508, timestamp=1663881166.43362) [('tile_y', [-1, 128]), ('tile_x', [-1, 128])],None,77
+ No: 10 GFLOPS: 9.99/13.69 result: MeasureResult(costs=(0.0268662134,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5497944355010986, timestamp=1663881167.0240293) [('tile_y', [-1, 4]), ('tile_x', [-1, 64])],None,62
diff --git a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
index 8ee96dac82..791f98ce13 100644
--- a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
@@ -320,7 +320,7 @@ standard deviation.
.. code-block:: none
- {'mean': 513.4868455500055, 'median': 513.3193053500236, 'std': 0.9942950218432403}
+ {'mean': 510.5090909899979, 'median': 510.57848135000654, 'std': 1.257277929183221}
@@ -554,31 +554,31 @@ the tuning data to.
.. code-block:: none
-
[Task 1/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 1/25] Current/Best: 18.56/ 18.56 GFLOPS | Progress: (4/20) | 5.48 s
[Task 1/25] Current/Best: 12.62/ 22.54 GFLOPS | Progress: (8/20) | 9.12 s
[Task 1/25] Current/Best: 17.64/ 22.54 GFLOPS | Progress: (12/20) | 11.45 s
[Task 1/25] Current/Best: 18.40/ 22.54 GFLOPS | Progress: (16/20) | 13.98 s
[Task 1/25] Current/Best: 8.51/ 22.54 GFLOPS | Progress: (20/20) | 17.14 s Done.
-
[Task 2/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 2/25] Current/Best: 18.18/ 18.18 GFLOPS | Progress: (4/20) | 2.31 s
[Task 2/25] Current/Best: 8.12/ 18.54 GFLOPS | Progress: (8/20) | 3.50 s
[Task 2/25] Current/Best: 11.09/ 20.07 GFLOPS | Progress: (12/20) | 5.01 s
[Task 2/25] Current/Best: 15.22/ 20.07 GFLOPS | Progress: (16/20) | 6.27 s
[Task 2/25] Current/Best: 13.87/ 20.07 GFLOPS | Progress: (20/20) | 7.61 s Done.
-
[Task 3/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 3/25] Current/Best: 22.46/ 22.46 GFLOPS | Progress: (4/20) | 3.25 s
[Task 3/25] Current/Best: 10.29/ 22.46 GFLOPS | Progress: (8/20) | 6.66 s
[Task 3/25] Current/Best: 9.93/ 22.46 GFLOPS | Progress: (12/20) | 8.69 s
[Task 3/25] Current/Best: 12.38/ 22.46 GFLOPS | Progress: (16/20) | 11.17 s
[Task 3/25] Current/Best: 6.99/ 22.46 GFLOPS | Progress: (20/20) | 12.96 s Done.
-
[Task 4/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 4/25] Current/Best: 22.71/ 22.71 GFLOPS | Progress: (4/20) | 2.43 s
[Task 4/25] Current/Best: 15.99/ 22.71 GFLOPS | Progress: (8/20) | 4.75 s
[Task 4/25] Current/Best: 10.47/ 22.71 GFLOPS | Progress: (12/20) | 6.31 s
[Task 4/25] Current/Best: 13.68/ 22.71 GFLOPS | Progress: (16/20) | 11.25 s
[Task 4/25] Current/Best: 16.59/ 22.71 GFLOPS | Progress: (20/20) | 12.85 s Done.
-
[Task 5/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 5/25] Current/Best: 5.63/ 17.87 GFLOPS | Progress: (4/20) | 3.58 s
[Task 5/25] Current/Best: 13.72/ 19.84 GFLOPS | Progress: (8/20) | 5.26 s
[Task 5/25] Current/Best: 4.86/ 19.84 GFLOPS | Progress: (12/20) | 7.10 s
[Task 5/25] Current/Best: 4.69/ 19.84 GFLOPS | Progress: (16/20) | 8.77 s
[Task 5/25] Current/Best: 5.92/ 19.84 GFLOPS | Progress: (20/20) | 10.82 s Done.
-
[Task 6/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 6/25] Current/Best: 5.16/ 10.27 GFLOPS | Progress: (4/20) | 4.35 s
[Task 6/25] Current/Best: 15.06/ 18.44 GFLOPS | Progress: (8/20) | 7.22 s
[Task 6/25] Current/Best: 20.17/ 20.17 GFLOPS | Progress: (12/20) | 9.21 s
[Task 6/25] Current/Best: 9.77/ 23.55 GFLOPS | Progress: (16/20) | 10.97 s
[Task 6/25] Current/Best: 15.99/ 23.55 GFLOPS | Progress: (20/20) | 13.73 s Done.
-
[Task 7/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 7/25] Current/Best: 19.50/ 19.50 GFLOPS | Progress: (4/20) | 3.56 s
[Task 7/25] Current/Best: 6.24/ 19.50 GFLOPS | Progress: (8/20) | 5.35 s
[Task 7/25] Current/Best: 19.63/ 19.63 GFLOPS | Progress: (12/20) | 7.67 s
[Task 7/25] Current/Best: 14.73/ 19.63 GFLOPS | Progress: (16/20) | 10.20 s
[Task 7/25] Current/Best: 9.15/ 19.63 GFLOPS | Progress: (20/20) | 12.60 s Done.
-
[Task 8/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 8/25] Current/Best: 5.81/ 15.10 GFLOPS | Progress: (4/20) | 3.36 s
[Task 8/25] Current/Best: 8.59/ 15.10 GFLOPS | Progress: (8/20) | 8.08 s
[Task 8/25] Current/Best: 5.97/ 20.16 GFLOPS | Progress: (12/20) | 17.74 s
[Task 8/25] Current/Best: 11.81/ 20.16 GFLOPS | Progress: (16/20) | 19.55 s
[Task 8/25] Current/Best: 8.07/ 20.16 GFLOPS | Progress: (20/20) | 21.81 s Done.
-
[Task 9/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 9/25] Current/Best: 8.28/ 14.21 GFLOPS | Progress: (4/20) | 5.63 s
[Task 9/25] Current/Best: 4.83/ 18.45 GFLOPS | Progress: (8/20) | 7.15 s
[Task 9/25] Current/Best: 5.77/ 23.29 GFLOPS | Progress: (12/20) | 8.44 s
[Task 9/25] Current/Best: 13.01/ 23.29 GFLOPS | Progress: (16/20) | 18.58 s
[Task 9/25] Current/Best: 13.56/ 23.29 GFLOPS | Progress: (20/20) | 20.04 s Done.
-
[Task 10/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 10/25] Current/Best: 8.88/ 16.15 GFLOPS | Progress: (4/20) | 2.79 s
[Task 10/25] Current/Best: 3.30/ 16.15 GFLOPS | Progress: (8/20) | 6.06 s
[Task 10/25] Current/Best: 11.14/ 21.95 GFLOPS | Progress: (12/20) | 7.55 s
[Task 10/25] Current/Best: 14.71/ 21.95 GFLOPS | Progress: (16/20) | 10.47 s
[Task 10/25] Current/Best: 18.31/ 21.95 GFLOPS | Progress: (20/20) | 11.97 s Done.
-
[Task 11/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 11/25] Current/Best: 23.22/ 23.22 GFLOPS | Progress: (4/20) | 2.85 s
[Task 11/25] Current/Best: 5.91/ 23.22 GFLOPS | Progress: (8/20) | 5.38 s
[Task 11/25] Current/Best: 16.49/ 23.22 GFLOPS | Progress: (12/20) | 7.61 s
[Task 11/25] Current/Best: 7.95/ 23.22 GFLOPS | Progress: (16/20) | 11.48 s
[Task 11/25] Current/Best: 18.03/ 23.22 GFLOPS | Progress: (20/20) | 13.37 s Done.
-
[Task 12/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 12/25] Current/Best: 9.19/ 20.14 GFLOPS | Progress: (4/20) | 2.76 s
[Task 12/25] Current/Best: 8.12/ 20.14 GFLOPS | Progress: (8/20) | 5.47 s
[Task 12/25] Current/Best: 8.92/ 20.47 GFLOPS | Progress: (12/20) | 10.80 s
[Task 12/25] Current/Best: 18.65/ 20.47 GFLOPS | Progress: (16/20) | 13.36 s
[Task 12/25] Current/Best: 18.68/ 20.47 GFLOPS | Progress: (20/20) | 15.26 s Done.
-
[Task 13/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 13/25] Current/Best: 13.45/ 17.73 GFLOPS | Progress: (4/20) | 3.77 s
[Task 13/25] Current/Best: 20.63/ 20.63 GFLOPS | Progress: (8/20) | 6.53 s
[Task 13/25] Current/Best: 9.91/ 20.63 GFLOPS | Progress: (12/20) | 10.06 s
[Task 13/25] Current/Best: 12.72/ 20.63 GFLOPS | Progress: (16/20) | 14.07 s
[Task 13/25] Current/Best: 9.83/ 22.14 GFLOPS | Progress: (20/20) | 17.71 s Done.
-
[Task 14/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 14/25] Current/Best: 3.64/ 12.58 GFLOPS | Progress: (4/20) | 5.30 s
[Task 14/25] Current/Best: 18.97/ 18.97 GFLOPS | Progress: (8/20) | 7.32 s
[Task 14/25] Current/Best: 8.89/ 18.97 GFLOPS | Progress: (12/20) | 10.64 s
[Task 14/25] Current/Best: 2.96/ 18.97 GFLOPS | Progress: (16/20) | 14.12 s
[Task 14/25] Current/Best: 10.93/ 18.97 GFLOPS | Progress: (20/20) | 16.34 s
[Task 15/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 15/25] Current/Best: 15.78/ 15.78 GFLOPS | Progress: (4/20) | 2.74 s
[Task 15/25] Current/Best: 11.70/ 15.78 GFLOPS | Progress: (8/20) | 6.55 s Done.
-
[Task 15/25] Current/Best: 21.08/ 21.08 GFLOPS | Progress: (12/20) | 7.90 s
[Task 15/25] Current/Best: 14.05/ 21.08 GFLOPS | Progress: (16/20) | 11.88 s
[Task 15/25] Current/Best: 11.35/ 21.08 GFLOPS | Progress: (20/20) | 14.09 s Done.
-
[Task 16/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 16/25] Current/Best: 14.48/ 14.48 GFLOPS | Progress: (4/20) | 2.92 s
[Task 16/25] Current/Best: 12.70/ 16.76 GFLOPS | Progress: (8/20) | 5.13 s
[Task 16/25] Current/Best: 17.54/ 18.94 GFLOPS | Progress: (12/20) | 6.47 s
[Task 16/25] Current/Best: 13.72/ 18.94 GFLOPS | Progress: (16/20) | 9.83 s
[Task 16/25] Current/Best: 9.53/ 18.94 GFLOPS | Progress: (20/20) | 11.65 s Done.
-
[Task 17/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 17/25] Current/Best: 19.58/ 19.58 GFLOPS | Progress: (4/20) | 4.03 s
[Task 17/25] Current/Best: 11.21/ 19.95 GFLOPS | Progress: (8/20) | 6.62 s
[Task 17/25] Current/Best: 21.33/ 21.33 GFLOPS | Progress: (12/20) | 8.72 s
[Task 17/25] Current/Best: 11.82/ 21.33 GFLOPS | Progress: (16/20) | 11.80 s
[Task 17/25] Current/Best: 10.26/ 23.33 GFLOPS | Progress: (20/20) | 14.18 s Done.
-
[Task 18/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 18/25] Current/Best: 10.66/ 10.66 GFLOPS | Progress: (4/20) | 4.62 s
[Task 18/25] Current/Best: 8.62/ 19.45 GFLOPS | Progress: (8/20) | 8.04 s
[Task 18/25] Current/Best: 12.35/ 19.45 GFLOPS | Progress: (12/20) | 10.26 s
[Task 18/25] Current/Best: 14.39/ 19.45 GFLOPS | Progress: (16/20) | 12.42 s
[Task 18/25] Current/Best: 6.54/ 19.45 GFLOPS | Progress: (20/20) | 14.47 s Done.
-
[Task 19/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 19/25] Current/Best: 17.15/ 17.15 GFLOPS | Progress: (4/20) | 4.98 s
[Task 19/25] Current/Best: 12.19/ 21.20 GFLOPS | Progress: (8/20) | 7.64 s
[Task 19/25] Current/Best: 13.85/ 22.59 GFLOPS | Progress: (12/20) | 10.92 s
[Task 19/25] Current/Best: 19.25/ 22.59 GFLOPS | Progress: (16/20) | 16.58 s
[Task 19/25] Current/Best: 16.85/ 22.59 GFLOPS | Progress: (20/20) | 18.45 s Done.
-
[Task 20/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 20/25] Current/Best: 10.59/ 18.67 GFLOPS | Progress: (4/20) | 3.17 s
[Task 20/25] Current/Best: 14.45/ 18.67 GFLOPS | Progress: (8/20) | 6.73 s
[Task 20/25] Current/Best: 3.09/ 18.67 GFLOPS | Progress: (12/20) | 8.82 s
[Task 20/25] Current/Best: 16.34/ 18.67 GFLOPS | Progress: (16/20) | 11.64 s
[Task 20/25] Current/Best: 18.07/ 18.67 GFLOPS | Progress: (20/20) | 13.29 s
[Task 21/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 21/25] Current/Best: 11.97/ 18.56 GFLOPS | Progress: (4/20) | 2.58 s
[Task 21/25] Current/Best: 13.47/ 18.60 GFLOPS | Progress: (8/20) | 4.68 s
[Task 21/25] Current/Best: 1.61/ 18.60 GFLOPS | Progress: (12/20) | 7.03 s
[Task 21/25] Current/Best: 10.75/ 18.60 GFLOPS | Progress: (16/20) | 7.58 s
[Task 21/25] Current/Best: 5.37/ 22.51 GFLOPS | Progress: (20/20) |
10.64 s
[Task 22/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s Done.
+
[Task 1/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 1/25] Current/Best: 13.11/ 17.63 GFLOPS | Progress: (4/20) | 6.23 s
[Task 1/25] Current/Best: 12.57/ 17.63 GFLOPS | Progress: (8/20) | 9.13 s
[Task 1/25] Current/Best: 12.85/ 17.63 GFLOPS | Progress: (12/20) | 12.68 s
[Task 1/25] Current/Best: 22.20/ 23.74 GFLOPS | Progress: (16/20) | 14.18 s
[Task 1/25] Current/Best: 15.64/ 23.74 GFLOPS | Progress: (20/20) | 16.39 s Done.
+
[Task 2/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 2/25] Current/Best: 5.24/ 17.12 GFLOPS | Progress: (4/20) | 2.65 s
[Task 2/25] Current/Best: 6.69/ 17.12 GFLOPS | Progress: (8/20) | 4.16 s
[Task 2/25] Current/Best: 12.39/ 22.34 GFLOPS | Progress: (12/20) | 5.34 s
[Task 2/25] Current/Best: 11.77/ 22.34 GFLOPS | Progress: (16/20) | 6.48 s
[Task 2/25] Current/Best: 11.94/ 22.34 GFLOPS | Progress: (20/20) | 7.81 s Done.
+
[Task 3/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 3/25] Current/Best: 22.67/ 22.67 GFLOPS | Progress: (4/20) | 3.05 s
[Task 3/25] Current/Best: 9.93/ 22.67 GFLOPS | Progress: (8/20) | 5.15 s
[Task 3/25] Current/Best: 7.56/ 24.14 GFLOPS | Progress: (12/20) | 6.98 s
[Task 3/25] Current/Best: 13.81/ 24.14 GFLOPS | Progress: (16/20) | 9.56 s
[Task 3/25] Current/Best: 10.17/ 24.14 GFLOPS | Progress: (20/20) | 12.84 s Done.
+
[Task 4/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 4/25] Current/Best: 9.82/ 12.04 GFLOPS | Progress: (4/20) | 3.74 s
[Task 4/25] Current/Best: 10.00/ 12.04 GFLOPS | Progress: (8/20) | 7.04 s
[Task 4/25] Current/Best: 10.67/ 14.85 GFLOPS | Progress: (12/20) | 12.10 s
[Task 4/25] Current/Best: 19.67/ 19.67 GFLOPS | Progress: (16/20) | 13.74 s
[Task 4/25] Current/Best: 4.70/ 19.67 GFLOPS | Progress: (20/20) | 20.09 s Done.
+
[Task 5/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 5/25] Current/Best: 17.82/ 17.82 GFLOPS | Progress: (4/20) | 2.81 s
[Task 5/25] Current/Best: 13.67/ 17.82 GFLOPS | Progress: (8/20) | 4.48 s
[Task 5/25] Current/Best: 22.60/ 22.60 GFLOPS | Progress: (12/20) | 6.10 s
[Task 5/25] Current/Best: 7.53/ 22.60 GFLOPS | Progress: (16/20) | 9.01 s
[Task 5/25] Current/Best: 16.10/ 22.60 GFLOPS | Progress: (20/20) | 10.84 s Done.
+
[Task 6/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 6/25] Current/Best: 4.72/ 13.07 GFLOPS | Progress: (4/20) | 3.95 s
[Task 6/25] Current/Best: 5.37/ 16.15 GFLOPS | Progress: (8/20) | 6.29 s
[Task 6/25] Current/Best: 10.04/ 16.15 GFLOPS | Progress: (12/20) | 9.50 s
[Task 6/25] Current/Best: 13.25/ 16.64 GFLOPS | Progress: (16/20) | 11.90 s
[Task 6/25] Current/Best: 11.18/ 16.64 GFLOPS | Progress: (20/20) | 14.74 s Done.
+
[Task 7/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 7/25] Current/Best: 8.46/ 12.73 GFLOPS | Progress: (4/20) | 3.83 s
[Task 7/25] Current/Best: 3.09/ 12.73 GFLOPS | Progress: (8/20) | 6.93 s
[Task 7/25] Current/Best: 12.53/ 14.14 GFLOPS | Progress: (12/20) | 8.99 s
[Task 7/25] Current/Best: 12.34/ 21.96 GFLOPS | Progress: (16/20) | 11.14 s
[Task 7/25] Current/Best: 8.69/ 21.96 GFLOPS | Progress: (20/20) | 13.07 s Done.
+
[Task 8/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 8/25] Current/Best: 12.15/ 14.25 GFLOPS | Progress: (4/20) | 4.04 s
[Task 8/25] Current/Best: 7.75/ 14.25 GFLOPS | Progress: (8/20) | 15.29 s
[Task 8/25] Current/Best: 5.52/ 14.25 GFLOPS | Progress: (12/20) | 17.78 s
[Task 8/25] Current/Best: 8.44/ 14.25 GFLOPS | Progress: (16/20) | 28.66 s
[Task 8/25] Current/Best: 3.47/ 14.25 GFLOPS | Progress: (20/20) | 40.33 s
[Task 9/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 9/25] Current/Best: 12.55/ 18.97 GFLOPS | Progress: (4/20) | 3.22 s
[Task 9/25] Current/Best: 4.89/ 18.97 GFLOPS | Progress: (8/20) | 4.95 s
[Task 9/25] Current/Best: 14.74/ 18.97 GFLOPS | Progress: (12/20) | 9.14 s
[Task 9/25] Current/Best: 13.05/ 18.97 GFLOPS | Progress: (16/20) | 12.64 s
[Task 9/25] Current/Best: 13.46/ 18.97 GFLOPS | Progress: (20/20
) | 23.15 s Done.
+
[Task 10/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 10/25] Current/Best: 5.69/ 19.06 GFLOPS | Progress: (4/20) | 2.60 s
[Task 10/25] Current/Best: 6.51/ 19.06 GFLOPS | Progress: (8/20) | 4.72 s
[Task 10/25] Current/Best: 8.94/ 19.06 GFLOPS | Progress: (12/20) | 6.74 s
[Task 10/25] Current/Best: 11.31/ 20.46 GFLOPS | Progress: (16/20) | 9.40 s
[Task 10/25] Current/Best: 4.30/ 20.46 GFLOPS | Progress: (20/20) | 11.65 s Done.
+
[Task 11/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 11/25] Current/Best: 16.08/ 16.08 GFLOPS | Progress: (4/20) | 3.16 s
[Task 11/25] Current/Best: 12.30/ 16.08 GFLOPS | Progress: (8/20) | 6.74 s Done.
+
[Task 11/25] Current/Best: 21.12/ 21.12 GFLOPS | Progress: (12/20) | 9.36 s
[Task 11/25] Current/Best: 14.41/ 21.12 GFLOPS | Progress: (16/20) | 11.89 s
[Task 11/25] Current/Best: 10.03/ 21.12 GFLOPS | Progress: (20/20) | 13.87 s Done.
+
[Task 12/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 12/25] Current/Best: 11.71/ 16.56 GFLOPS | Progress: (4/20) | 3.26 s
[Task 12/25] Current/Best: 11.87/ 19.03 GFLOPS | Progress: (8/20) | 7.14 s
[Task 12/25] Current/Best: 14.81/ 19.03 GFLOPS | Progress: (12/20) | 12.75 s
[Task 12/25] Current/Best: 13.24/ 19.03 GFLOPS | Progress: (16/20) | 14.57 s
[Task 12/25] Current/Best: 12.94/ 20.27 GFLOPS | Progress: (20/20) | 17.77 s Done.
+
[Task 13/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 13/25] Current/Best: 17.27/ 17.27 GFLOPS | Progress: (4/20) | 3.92 s
[Task 13/25] Current/Best: 12.70/ 20.91 GFLOPS | Progress: (8/20) | 6.23 s
[Task 13/25] Current/Best: 6.05/ 20.91 GFLOPS | Progress: (12/20) | 9.90 s
[Task 13/25] Current/Best: 12.19/ 20.91 GFLOPS | Progress: (16/20) | 11.56 s
[Task 13/25] Current/Best: 11.79/ 22.56 GFLOPS | Progress: (20/20) | 14.16 s Done.
+
[Task 14/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 14/25] Current/Best: 9.72/ 19.79 GFLOPS | Progress: (4/20) | 5.24 s
[Task 14/25] Current/Best: 13.17/ 19.79 GFLOPS | Progress: (8/20) | 7.27 s
[Task 14/25] Current/Best: 20.73/ 20.73 GFLOPS | Progress: (12/20) | 11.41 s
[Task 14/25] Current/Best: 6.51/ 20.73 GFLOPS | Progress: (16/20) | 15.17 s
[Task 14/25] Current/Best: 4.25/ 20.73 GFLOPS | Progress: (20/20) | 16.57 s
[Task 15/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 15/25] Current/Best: 9.85/ 18.25 GFLOPS | Progress: (4/20) | 4.40 s
[Task 15/25] Current/Best: 9.48/ 18.25 GFLOPS | Progress: (8/20) | 7.95 s
[Task 15/25] Current/Best: 14.42/ 18.25 GFLOPS | Progress: (12/20) | 10.22 s
[Task 15/25] Current/Best: 18.24/ 19.13 GFLOPS | Progress: (16/20) | 11.35 s
[Task 15/25] Current/Best: 12.19/ 19.91 GFLOPS | Progress: (20/20
) | 12.86 s Done.
+
[Task 16/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 16/25] Current/Best: 12.58/ 18.83 GFLOPS | Progress: (4/20) | 2.33 s
[Task 16/25] Current/Best: 14.67/ 19.42 GFLOPS | Progress: (8/20) | 4.16 s
[Task 16/25] Current/Best: 12.15/ 19.42 GFLOPS | Progress: (12/20) | 5.51 s
[Task 16/25] Current/Best: 11.06/ 19.42 GFLOPS | Progress: (16/20) | 8.33 s
[Task 16/25] Current/Best: 17.07/ 19.42 GFLOPS | Progress: (20/20) | 9.70 s Done.
+
[Task 17/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 17/25] Current/Best: 13.27/ 19.72 GFLOPS | Progress: (4/20) | 2.74 s
[Task 17/25] Current/Best: 22.15/ 22.15 GFLOPS | Progress: (8/20) | 5.36 s
[Task 17/25] Current/Best: 20.10/ 22.15 GFLOPS | Progress: (12/20) | 7.12 s
[Task 17/25] Current/Best: 8.98/ 22.15 GFLOPS | Progress: (16/20) | 9.66 s
[Task 17/25] Current/Best: 10.52/ 22.80 GFLOPS | Progress: (20/20) | 11.67 s Done.
+
[Task 18/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 18/25] Current/Best: 18.41/ 18.41 GFLOPS | Progress: (4/20) | 6.70 s
[Task 18/25] Current/Best: 9.06/ 18.41 GFLOPS | Progress: (8/20) | 9.80 s
[Task 18/25] Current/Best: 3.08/ 18.49 GFLOPS | Progress: (12/20) | 12.11 s
[Task 18/25] Current/Best: 22.64/ 22.64 GFLOPS | Progress: (16/20) | 15.66 s
[Task 18/25] Current/Best: 10.54/ 22.64 GFLOPS | Progress: (20/20) | 18.19 s Done.
+
[Task 19/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 19/25] Current/Best: 19.92/ 19.92 GFLOPS | Progress: (4/20) | 4.02 s
[Task 19/25] Current/Best: 19.18/ 19.92 GFLOPS | Progress: (8/20) | 9.42 s
[Task 19/25] Current/Best: 10.44/ 19.92 GFLOPS | Progress: (12/20) | 12.55 s
[Task 19/25] Current/Best: 8.83/ 20.91 GFLOPS | Progress: (16/20) | 14.70 s
[Task 19/25] Current/Best: 1.55/ 20.91 GFLOPS | Progress: (20/20) | 18.55 s Done.
+
[Task 20/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 20/25] Current/Best: 6.07/ 13.53 GFLOPS | Progress: (4/20) | 3.17 s
[Task 20/25] Current/Best: 16.45/ 16.45 GFLOPS | Progress: (8/20) | 4.19 s
[Task 20/25] Current/Best: 16.94/ 16.94 GFLOPS | Progress: (12/20) | 6.69 s
[Task 20/25] Current/Best: 2.54/ 18.24 GFLOPS | Progress: (16/20) | 8.87 s
[Task 20/25] Current/Best: 16.66/ 18.24 GFLOPS | Progress: (20/20) | 11.04 s
[Task 21/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s Done.
Done.
-
[Task 22/25] Current/Best: 2.31/ 14.54 GFLOPS | Progress: (4/20) | 3.46 s
[Task 22/25] Current/Best: 6.25/ 15.35 GFLOPS | Progress: (8/20) | 7.64 s
[Task 22/25] Current/Best: 20.38/ 20.38 GFLOPS | Progress: (12/20) | 9.60 s
[Task 22/25] Current/Best: 20.90/ 20.90 GFLOPS | Progress: (16/20) | 12.57 s
[Task 22/25] Current/Best: 5.18/ 20.90 GFLOPS | Progress: (20/20) | 14.63 s Done.
-
[Task 23/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 23/25] Current/Best: 4.65/ 12.47 GFLOPS | Progress: (4/20) | 4.92 s
[Task 23/25] Current/Best: 8.91/ 12.47 GFLOPS | Progress: (8/20) | 10.90 s
[Task 23/25] Current/Best: 10.26/ 12.47 GFLOPS | Progress: (12/20) | 15.51 s
[Task 23/25] Current/Best: 10.70/ 18.62 GFLOPS | Progress: (16/20) | 19.22 s
[Task 23/25] Current/Best: 21.43/ 21.43 GFLOPS | Progress: (20/20) | 21.30 s Done.
-
[Task 24/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 24/25] Current/Best: 1.67/ 6.04 GFLOPS | Progress: (4/20) | 11.62 s
[Task 24/25] Current/Best: 3.45/ 6.04 GFLOPS | Progress: (8/20) | 23.19 s
[Task 24/25] Current/Best: 9.75/ 9.75 GFLOPS | Progress: (12/20) | 33.96 s
[Task 24/25] Current/Best: 2.30/ 9.75 GFLOPS | Progress: (16/20) | 45.52 s
[Task 24/25] Current/Best: 10.72/ 10.72 GFLOPS | Progress: (20/20) | 56.55 s
[Task 25/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s Done.
-
[Task 25/25] Current/Best: 8.05/ 8.05 GFLOPS | Progress: (4/20) | 11.78 s
[Task 25/25] Current/Best: 6.30/ 9.13 GFLOPS | Progress: (8/20) | 13.46 s
[Task 25/25] Current/Best: 7.53/ 9.46 GFLOPS | Progress: (12/20) | 24.19 s
[Task 25/25] Current/Best: 8.67/ 9.46 GFLOPS | Progress: (16/20) | 35.45 s
[Task 25/25] Current/Best: 8.69/ 9.46 GFLOPS | Progress: (20/20) | 46.75 s
+
[Task 21/25] Current/Best: 16.61/ 17.58 GFLOPS | Progress: (4/20) | 2.54 s
[Task 21/25] Current/Best: 12.83/ 19.74 GFLOPS | Progress: (8/20) | 4.17 s
[Task 21/25] Current/Best: 7.44/ 19.74 GFLOPS | Progress: (12/20) | 6.46 s
[Task 21/25] Current/Best: 11.03/ 19.74 GFLOPS | Progress: (16/20) | 8.42 s
[Task 21/25] Current/Best: 14.05/ 19.74 GFLOPS | Progress: (20/20) | 9.79 s
[Task 22/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 22/25] Current/Best: 13.98/ 15.58 GFLOPS | Progress: (4/20) | 3.24 s
[Task 22/25] Current/Best: 14.48/ 15.62 GFLOPS | Progress: (8/20) | 5.21 s
[Task 22/25] Current/Best: 1.56/ 15.62 GFLOPS | Progress: (12/20) | 8.53 s
[Task 22/25] Current/Best: 8.81/ 15.81 GFLOPS | Progress: (16/20) | 9.85 s
[Task 22/25] Current/Best: 15.63/ 15.81 GFLOPS | Progress: (20/20) | 13.17 s Done.
+
[Task 23/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 23/25] Current/Best: 13.06/ 23.91 GFLOPS | Progress: (4/20) | 3.21 s
[Task 23/25] Current/Best: 14.51/ 23.91 GFLOPS | Progress: (8/20) | 5.74 s
[Task 23/25] Current/Best: 10.71/ 23.91 GFLOPS | Progress: (12/20) | 8.15 s
[Task 23/25] Current/Best: 8.47/ 23.91 GFLOPS | Progress: (16/20) | 11.05 s
[Task 23/25] Current/Best: 2.65/ 23.91 GFLOPS | Progress: (20/20) | 17.52 s Done.
+
[Task 24/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 24/25] Current/Best: 3.75/ 6.27 GFLOPS | Progress: (4/20) | 11.80 s
[Task 24/25] Current/Best: 3.13/ 6.27 GFLOPS | Progress: (8/20) | 22.29 s
[Task 24/25] Current/Best: 8.26/ 8.26 GFLOPS | Progress: (12/20) | 24.07 s Done.
+
[Task 24/25] Current/Best: 5.58/ 8.26 GFLOPS | Progress: (16/20) | 26.32 s
[Task 24/25] Current/Best: 3.38/ 10.72 GFLOPS | Progress: (20/20) | 35.13 s Done.
+
[Task 25/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 25/25] Current/Best: 7.96/ 8.86 GFLOPS | Progress: (4/20) | 11.78 s
[Task 25/25] Current/Best: 1.48/ 8.86 GFLOPS | Progress: (8/20) | 22.45 s
[Task 25/25] Current/Best: 5.21/ 8.86 GFLOPS | Progress: (12/20) | 23.97 s
[Task 25/25] Current/Best: 7.22/ 8.86 GFLOPS | Progress: (16/20) | 29.02 s
[Task 25/25] Current/Best: 5.06/ 9.65 GFLOPS | Progress: (20/20) | 39.71 s
@@ -732,8 +732,8 @@ improvement in comparing the optimized model to the unoptimized model.
.. code-block:: none
- optimized: {'mean': 405.1291182800105, 'median': 404.8909023500073, 'std': 0.9162805964274249}
- unoptimized: {'mean': 513.4868455500055, 'median': 513.3193053500236, 'std': 0.9942950218432403}
+ optimized: {'mean': 416.18359676999717, 'median': 416.04919939999263, 'std': 1.080433058309313}
+ unoptimized: {'mean': 510.5090909899979, 'median': 510.57848135000654, 'std': 1.257277929183221}
@@ -756,7 +756,7 @@ profiling/benchmarking.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 10 minutes 30.047 seconds)
+ **Total running time of the script:** ( 10 minutes 19.428 seconds)
.. _sphx_glr_download_tutorial_autotvm_relay_x86.py:
diff --git a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
index 1e852b726d..c66fc5cdb8 100644
--- a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
+++ b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
@@ -282,7 +282,7 @@ device and returns the measured cost. Network overhead is excluded.
.. code-block:: none
- 1.278e-07 secs/op
+ 1.268e-07 secs/op
diff --git a/docs/_sources/tutorial/intro_topi.rst.txt b/docs/_sources/tutorial/intro_topi.rst.txt
index d8d878dba2..08839e8617 100644
--- a/docs/_sources/tutorial/intro_topi.rst.txt
+++ b/docs/_sources/tutorial/intro_topi.rst.txt
@@ -263,7 +263,7 @@ As you can see, scheduled stages of computation have been accumulated and we can
.. code-block:: none
- [stage(a, placeholder(a, 0x1afc5a30)), stage(b, placeholder(b, 0x22997200)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(mi [...]
+ [stage(a, placeholder(a, 0xded78f0)), stage(b, placeholder(b, 0x116427e0)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min [...]
diff --git a/docs/_sources/tutorial/sg_execution_times.rst.txt b/docs/_sources/tutorial/sg_execution_times.rst.txt
index 52852b6225..1e47632390 100644
--- a/docs/_sources/tutorial/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorial/sg_execution_times.rst.txt
@@ -5,32 +5,32 @@
Computation times
=================
-**13:14.689** total execution time for **tutorial** files:
+**13:06.819** total execution time for **tutorial** files:
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``) | 10:30.047 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``) | 10:19.428 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``) | 00:59.763 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``) | 01:01.733 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 00:54.021 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 00:51.954 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``) | 00:30.679 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``) | 00:30.678 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``) | 00:18.798 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``) | 00:21.074 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``) | 00:00.700 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``) | 00:01.089 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``) | 00:00.519 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``) | 00:00.697 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.155 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.157 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorial_introduction.py` (``introduction.py``) | 00:00.005 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_uma.py` (``uma.py``) | 00:00.001 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_uma.py` (``uma.py``) | 00:00.002 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_install.py` (``install.py``) | 00:00.001 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``) | 00:00.001 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorial_tvmc_python.py` (``tvmc_python.py``) | 00:00.001 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``) | 00:00.001 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_install.py` (``install.py``) | 00:00.001 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
index 8d1e7d15be..a971b681a4 100644
--- a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
+++ b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
@@ -394,7 +394,7 @@ compile and run this new schedule with the parallel operation applied:
.. code-block:: none
- parallel: 0.000008
+ parallel: 0.000007
@@ -501,10 +501,10 @@ We can now compare the different schedules
.. code-block:: none
Operator Timing Performance
- numpy 7.683510002607363e-06 1.0
- naive 6.7804e-06 0.8824612706561329
- parallel 7.811700000000002e-06 1.0166837808955982
- vector 2.45441e-05 3.194386418664266
+ numpy 7.637039998371619e-06 1.0
+ naive 7.0358000000000005e-06 0.9212731636210082
+ parallel 7.3192e-06 0.9583817816275167
+ vector 2.45864e-05 3.2193624762005104
@@ -925,7 +925,7 @@ matrix multiplication.
.. code-block:: none
- Numpy running time: 0.018031
+ Numpy running time: 0.018433
@@ -983,7 +983,7 @@ optimizations.
.. code-block:: none
- none: 3.337305
+ none: 3.488412
@@ -1086,7 +1086,7 @@ schedule.
.. code-block:: none
- blocking: 0.300104
+ blocking: 0.296257
@@ -1182,7 +1182,7 @@ already cache friendly from our previous optimizations.
.. code-block:: none
- vectorization: 0.335423
+ vectorization: 0.335934
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1256,7 +1256,7 @@ more cache friendly.
.. code-block:: none
- loop permutation: 0.115159
+ loop permutation: 0.114811
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1355,7 +1355,7 @@ optimized schedule.
.. code-block:: none
- array packing: 0.108299
+ array packing: 0.107889
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1448,7 +1448,7 @@ to `C` when all the block results are ready.
.. code-block:: none
- block caching: 0.110883
+ block caching: 0.110217
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1534,7 +1534,7 @@ of thread-level parallelization.
.. code-block:: none
- parallelization: 0.146468
+ parallelization: 0.145541
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1615,13 +1615,13 @@ working, we can compare the results.
.. code-block:: none
Operator Timing Performance
- none 3.3373046971 1.0
- blocking 0.30010360329999997 0.08992394478118207
- vectorization 0.3354231334 0.10050719483044832
- loop permutation 0.1151585266 0.03450644668437638
- array packing 0.10829879949999999 0.03245097745917771
- block caching 0.1108825838 0.03322519034487713
- parallelization 0.1464679989 0.04388811097388726
+ none 3.4884116279000006 1.0
+ blocking 0.2962569233 0.08492602218458507
+ vectorization 0.3359337591 0.09629991954310443
+ loop permutation 0.1148114477 0.0329122420019898
+ array packing 0.107888973 0.030927821744748743
+ block caching 0.1102171894 0.031595236215386076
+ parallelization 0.1455408451 0.04172123608807444
@@ -1661,6 +1661,11 @@ operations with tunable parameters that allows you to automatically optimize
the computation for specific platforms.
+.. rst-class:: sphx-glr-timing
+
+ **Total running time of the script:** ( 1 minutes 1.733 seconds)
+
+
.. _sphx_glr_download_tutorial_tensor_expr_get_started.py:
.. only:: html
diff --git a/docs/commit_hash b/docs/commit_hash
index 39862b4400..e02dec290f 100644
--- a/docs/commit_hash
+++ b/docs/commit_hash
@@ -1 +1 @@
-195ae72b5c6f0df68fac41f7808d125d155a6345
+4e783a6087fd236c588cde30e0ac99daa15afe61
diff --git a/docs/genindex.html b/docs/genindex.html
index 45baa22325..c5afc44599 100644
--- a/docs/genindex.html
+++ b/docs/genindex.html
@@ -2320,7 +2320,11 @@
<li><a href="reference/api/python/relay/transform.html#tvm.relay.transform.LambdaLift">LambdaLift() (in module tvm.relay.transform)</a>
</li>
<li><a href="reference/api/python/relay/nn.html#tvm.relay.nn.layer_norm">layer_norm() (in module tvm.relay.nn)</a>
+
+ <ul>
+ <li><a href="reference/api/python/topi.html#tvm.topi.nn.layer_norm">(in module tvm.topi.nn)</a>
</li>
+ </ul></li>
<li><a href="reference/api/python/tir.html#tvm.tir.Layout">Layout (class in tvm.tir)</a>
</li>
<li><a href="reference/api/python/tir.html#tvm.tir.layout">layout() (in module tvm.tir)</a>
diff --git a/docs/how_to/compile_models/from_darknet.html b/docs/how_to/compile_models/from_darknet.html
index 073704df08..d847787e1e 100644
--- a/docs/how_to/compile_models/from_darknet.html
+++ b/docs/how_to/compile_models/from_darknet.html
@@ -572,7 +572,7 @@ class:['truck 0.9266'] left:471 top:83 right:689 bottom:169
class:['bicycle 0.9984'] left:111 top:113 right:577 bottom:447
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 2.659 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 2.345 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-darknet-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7716f96385bd5abb6e822041e285be54/from_darknet.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_darknet.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/from_keras.html b/docs/how_to/compile_models/from_keras.html
index 504c498347..5c96618a47 100644
--- a/docs/how_to/compile_models/from_keras.html
+++ b/docs/how_to/compile_models/from_keras.html
@@ -493,7 +493,7 @@ pip install -U tensorflow --user
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Relay top-1 id: 285, class name: Egyptian cat
1/1 [==============================] - ETA: 0s
-1/1 [==============================] - 1s 939ms/step
+1/1 [==============================] - 1s 952ms/step
Keras top-1 id: 285, class name: Egyptian cat
</pre></div>
</div>
diff --git a/docs/how_to/compile_models/from_mxnet.html b/docs/how_to/compile_models/from_mxnet.html
index e272c2b5ac..1d015ba89a 100644
--- a/docs/how_to/compile_models/from_mxnet.html
+++ b/docs/how_to/compile_models/from_mxnet.html
@@ -427,7 +427,7 @@ to download the full example code</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"x"</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#tuple" title="builtins.tuple" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">x</span><span class="o">.</span><span class="n">shape</span></a><span class="p">)</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zipad65b5d5-f351-40df-b354-b0c5e6ea4e50 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zipb4fe9518-3411-4a64-8110-b3cf781ae214 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
x (1, 3, 224, 224)
</pre></div>
</div>
diff --git a/docs/how_to/compile_models/from_oneflow.html b/docs/how_to/compile_models/from_oneflow.html
index 61e909aab6..0c99a28827 100644
--- a/docs/how_to/compile_models/from_oneflow.html
+++ b/docs/how_to/compile_models/from_oneflow.html
@@ -435,13 +435,12 @@ Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdo
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
0%| | 0.00/41.5M [00:00<?, ?B/s]
- 15%|#5 | 6.33M/41.5M [00:00<00:00, 60.4MB/s]
- 29%|##9 | 12.1M/41.5M [00:00<00:00, 47.3MB/s]
- 40%|#### | 16.7M/41.5M [00:00<00:00, 32.8MB/s]
- 58%|#####7 | 24.0M/41.5M [00:00<00:00, 39.5MB/s]
- 77%|#######7 | 32.0M/41.5M [00:00<00:00, 45.6MB/s]
- 96%|#########6| 40.0M/41.5M [00:00<00:00, 48.1MB/s]
-100%|##########| 41.5M/41.5M [00:00<00:00, 46.4MB/s]
+ 19%|#9 | 7.99M/41.5M [00:00<00:00, 66.9MB/s]
+ 39%|###8 | 16.0M/41.5M [00:00<00:00, 71.1MB/s]
+ 58%|#####7 | 24.0M/41.5M [00:00<00:00, 68.0MB/s]
+ 77%|#######7 | 32.0M/41.5M [00:00<00:00, 73.2MB/s]
+ 94%|#########4| 39.1M/41.5M [00:00<00:00, 52.7MB/s]
+100%|##########| 41.5M/41.5M [00:00<00:00, 59.7MB/s]
</pre></div>
</div>
</div>
diff --git a/docs/how_to/compile_models/from_pytorch.html b/docs/how_to/compile_models/from_pytorch.html
index c79d61b966..7f16c61a51 100644
--- a/docs/how_to/compile_models/from_pytorch.html
+++ b/docs/how_to/compile_models/from_pytorch.html
@@ -414,9 +414,9 @@ be unstable.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
0%| | 0.00/44.7M [00:00<?, ?B/s]
- 44%|####3 | 19.6M/44.7M [00:00<00:00, 205MB/s]
- 93%|#########3| 41.6M/44.7M [00:00<00:00, 220MB/s]
-100%|##########| 44.7M/44.7M [00:00<00:00, 215MB/s]
+ 22%|##1 | 9.76M/44.7M [00:00<00:00, 102MB/s]
+ 46%|####6 | 20.6M/44.7M [00:00<00:00, 109MB/s]
+100%|##########| 44.7M/44.7M [00:00<00:00, 157MB/s]
</pre></div>
</div>
</div>
diff --git a/docs/how_to/compile_models/from_tensorflow.html b/docs/how_to/compile_models/from_tensorflow.html
index efc67c3056..3eb1c570ac 100644
--- a/docs/how_to/compile_models/from_tensorflow.html
+++ b/docs/how_to/compile_models/from_tensorflow.html
@@ -632,7 +632,7 @@ banana (score = 0.00022)
desk (score = 0.00019)
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 3.207 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 3.087 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-tensorflow-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7f1d3d1b878694c201c614c807cdebc8/from_tensorflow.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_tensorflow.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/sg_execution_times.html b/docs/how_to/compile_models/sg_execution_times.html
index dd844fb0e0..b41d61f10e 100644
--- a/docs/how_to/compile_models/sg_execution_times.html
+++ b/docs/how_to/compile_models/sg_execution_times.html
@@ -327,7 +327,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-compile-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>05:02.275</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
+<p><strong>05:02.529</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 81%" />
@@ -336,43 +336,43 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="from_tensorflow.html#sphx-glr-how-to-compile-models-from-tensorflow-py"><span class="std std-ref">Compile Tensorflow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tensorflow.py</span></code>)</p></td>
-<td><p>01:03.207</p></td>
+<td><p>01:03.087</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_darknet.html#sphx-glr-how-to-compile-models-from-darknet-py"><span class="std std-ref">Compile YOLO-V2 and YOLO-V3 in DarkNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_darknet.py</span></code>)</p></td>
-<td><p>01:02.659</p></td>
+<td><p>01:02.345</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_paddle.html#sphx-glr-how-to-compile-models-from-paddle-py"><span class="std std-ref">Compile PaddlePaddle Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_paddle.py</span></code>)</p></td>
-<td><p>00:38.688</p></td>
+<td><p>00:38.976</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_oneflow.html#sphx-glr-how-to-compile-models-from-oneflow-py"><span class="std std-ref">Compile OneFlow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_oneflow.py</span></code>)</p></td>
-<td><p>00:28.892</p></td>
+<td><p>00:27.497</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="from_tflite.html#sphx-glr-how-to-compile-models-from-tflite-py"><span class="std std-ref">Compile TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tflite.py</span></code>)</p></td>
-<td><p>00:25.105</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></td>
+<td><p>00:26.280</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-even"><td><p><a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></td>
-<td><p>00:24.955</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="from_tflite.html#sphx-glr-how-to-compile-models-from-tflite-py"><span class="std std-ref">Compile TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tflite.py</span></code>)</p></td>
+<td><p>00:24.429</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_coreml.html#sphx-glr-how-to-compile-models-from-coreml-py"><span class="std std-ref">Compile CoreML Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_coreml.py</span></code>)</p></td>
-<td><p>00:21.186</p></td>
+<td><p>00:21.363</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_pytorch.html#sphx-glr-how-to-compile-models-from-pytorch-py"><span class="std std-ref">Compile PyTorch Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_pytorch.py</span></code>)</p></td>
-<td><p>00:18.982</p></td>
+<td><p>00:19.564</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_keras.html#sphx-glr-how-to-compile-models-from-keras-py"><span class="std std-ref">Compile Keras Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_keras.py</span></code>)</p></td>
-<td><p>00:16.211</p></td>
+<td><p>00:16.439</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_onnx.html#sphx-glr-how-to-compile-models-from-onnx-py"><span class="std std-ref">Compile ONNX Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_onnx.py</span></code>)</p></td>
-<td><p>00:02.390</p></td>
+<td><p>00:02.550</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/deploy_models/deploy_model_on_android.html b/docs/how_to/deploy_models/deploy_model_on_android.html
index 885a78ae92..38701c5cfc 100644
--- a/docs/how_to/deploy_models/deploy_model_on_android.html
+++ b/docs/how_to/deploy_models/deploy_model_on_android.html
@@ -649,7 +649,7 @@ to the remote android device.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 15.5880 15.5711 15.8581 15.4948 0.1032
+ 15.6156 15.5715 15.9768 15.5318 0.1261
</pre></div>
</div>
</div>
diff --git a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
index d78cb2e3c9..7540b2050c 100644
--- a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
+++ b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
@@ -436,17 +436,16 @@ be unstable.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
0%| | 0.00/170M [00:00<?, ?B/s]
- 3%|2 | 4.25M/170M [00:00<00:03, 44.6MB/s]
- 5%|5 | 8.97M/170M [00:00<00:03, 47.4MB/s]
- 17%|#6 | 28.3M/170M [00:00<00:01, 118MB/s]
- 32%|###1 | 54.1M/170M [00:00<00:00, 178MB/s]
- 42%|####1 | 71.1M/170M [00:00<00:00, 173MB/s]
- 52%|#####1 | 87.7M/170M [00:00<00:00, 167MB/s]
- 61%|######1 | 104M/170M [00:00<00:00, 164MB/s]
- 76%|#######5 | 129M/170M [00:00<00:00, 195MB/s]
- 87%|########7 | 148M/170M [00:00<00:00, 198MB/s]
- 99%|#########8| 167M/170M [00:01<00:00, 180MB/s]
-100%|##########| 170M/170M [00:01<00:00, 167MB/s]
+ 2%|2 | 4.24M/170M [00:00<00:03, 44.5MB/s]
+ 5%|4 | 8.48M/170M [00:00<00:04, 41.6MB/s]
+ 17%|#7 | 29.4M/170M [00:00<00:01, 120MB/s]
+ 31%|###1 | 53.0M/170M [00:00<00:00, 169MB/s]
+ 44%|####4 | 75.3M/170M [00:00<00:00, 190MB/s]
+ 55%|#####5 | 93.7M/170M [00:00<00:00, 167MB/s]
+ 66%|######5 | 111M/170M [00:00<00:00, 173MB/s]
+ 78%|#######7 | 132M/170M [00:00<00:00, 185MB/s]
+ 89%|########9 | 152M/170M [00:00<00:00, 193MB/s]
+100%|##########| 170M/170M [00:01<00:00, 171MB/s]
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:3878: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
for i in range(dim)
/usr/local/lib/python3.7/dist-packages/torchvision/models/detection/anchor_utils.py:127: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
@@ -540,7 +539,7 @@ torchvision rcnn models.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Get 9 valid boxes
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 56.393 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 54.438 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-object-detection-pytorch-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7795da4b258c8feff986668b95ef57ad/deploy_object_detection_pytorch.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_object_detection_pytorch.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized.html b/docs/how_to/deploy_models/deploy_prequantized.html
index 16e2583e2d..48c7afb750 100644
--- a/docs/how_to/deploy_models/deploy_prequantized.html
+++ b/docs/how_to/deploy_models/deploy_prequantized.html
@@ -480,9 +480,7 @@ training. Other models require a full post training calibration.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
0%| | 0.00/13.6M [00:00<?, ?B/s]
- 26%|##6 | 3.54M/13.6M [00:00<00:00, 37.0MB/s]
- 52%|#####2 | 7.08M/13.6M [00:00<00:00, 35.3MB/s]
-100%|##########| 13.6M/13.6M [00:00<00:00, 58.6MB/s]
+100%|##########| 13.6M/13.6M [00:00<00:00, 156MB/s]
</pre></div>
</div>
</div>
@@ -567,7 +565,7 @@ output values are identical out of 1000 outputs from mobilenet v2.</p>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 90.2916 90.1593 96.2862 89.9157 0.6391
+ 90.1920 90.1001 92.8598 89.9293 0.3587
</pre></div>
</div>
<div class="admonition note">
@@ -606,7 +604,7 @@ This includes support for the VNNI 8 bit dot product instruction (CascadeLake or
<div class="section" id="deploy-a-quantized-tflite-model">
<h2>Deploy a quantized TFLite Model<a class="headerlink" href="#deploy-a-quantized-tflite-model" title="Permalink to this headline">¶</a></h2>
<p>TODO</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 7.576 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 7.664 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/fb8217c13f4351224c6cf3aacf1a87fc/deploy_prequantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized_tflite.html b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
index d303d9bcf4..5b87d64d01 100644
--- a/docs/how_to/deploy_models/deploy_prequantized_tflite.html
+++ b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
@@ -569,7 +569,7 @@ TFLite Top-5 labels: [387 102 386 341 349]
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 120.0710 119.9796 124.3624 118.7161 0.8110
+ 119.2496 119.4333 122.3497 117.1914 1.0613
</pre></div>
</div>
<div class="admonition note">
@@ -597,7 +597,7 @@ network for ARM CPU</span></a>.</p></li>
</ul>
</div></blockquote>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 52.732 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 52.227 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-tflite-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/56691c7a27d45da61d112276334640d3/deploy_prequantized_tflite.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized_tflite.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_quantized.html b/docs/how_to/deploy_models/deploy_quantized.html
index 6016c5bdb7..21bf04b1df 100644
--- a/docs/how_to/deploy_models/deploy_quantized.html
+++ b/docs/how_to/deploy_models/deploy_quantized.html
@@ -507,7 +507,7 @@ for calibration. But the accuracy might be impacted.</p>
DeprecationWarning,
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 25.819 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 22.168 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-quantized-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7810ecf51bfc05f7d5e8a400ac3e815d/deploy_quantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_quantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
index 45401bb40b..9474a8588a 100644
--- a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
+++ b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
@@ -441,23 +441,25 @@ to your device.</p>
Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
0%| | 0/132723 [00:00<?, ?KB/s]
- 4%|3 | 4699/132723 [00:00<00:02, 46984.78KB/s]
- 10%|9 | 12752/132723 [00:00<00:01, 66712.47KB/s]
- 16%|#5 | 20944/132723 [00:00<00:01, 73654.74KB/s]
- 22%|##1 | 29143/132723 [00:00<00:01, 76942.53KB/s]
- 28%|##8 | 37304/132723 [00:00<00:01, 78623.23KB/s]
- 34%|###4 | 45445/132723 [00:00<00:01, 79568.58KB/s]
- 40%|#### | 53637/132723 [00:00<00:00, 80334.48KB/s]
- 47%|####6 | 61831/132723 [00:00<00:00, 80842.53KB/s]
- 53%|#####2 | 69978/132723 [00:00<00:00, 81037.13KB/s]
- 59%|#####8 | 78082/132723 [00:01<00:00, 78412.88KB/s]
- 65%|######4 | 86209/132723 [00:01<00:00, 79260.15KB/s]
- 71%|#######1 | 94425/132723 [00:01<00:00, 80126.85KB/s]
- 77%|#######7 | 102561/132723 [00:01<00:00, 80489.18KB/s]
- 83%|########3 | 110757/132723 [00:01<00:00, 80928.15KB/s]
- 90%|########9 | 118856/132723 [00:01<00:00, 79754.76KB/s]
- 96%|#########5| 126840/132723 [00:01<00:00, 79420.54KB/s]
-100%|##########| 132723/132723 [00:01<00:00, 78475.19KB/s]
+ 1%|1 | 1853/132723 [00:00<00:07, 18523.44KB/s]
+ 4%|4 | 5643/132723 [00:00<00:04, 29916.54KB/s]
+ 10%|9 | 12820/132723 [00:00<00:02, 49023.87KB/s]
+ 15%|#5 | 20439/132723 [00:00<00:01, 59744.91KB/s]
+ 21%|##1 | 28091/132723 [00:00<00:01, 65787.26KB/s]
+ 27%|##6 | 35715/132723 [00:00<00:01, 69339.71KB/s]
+ 33%|###2 | 43376/132723 [00:00<00:01, 71713.82KB/s]
+ 38%|###8 | 50981/132723 [00:00<00:01, 73091.49KB/s]
+ 44%|####4 | 58665/132723 [00:00<00:00, 74257.33KB/s]
+ 50%|##### | 66402/132723 [00:01<00:00, 75214.32KB/s]
+ 56%|#####5 | 74108/132723 [00:01<00:00, 75778.10KB/s]
+ 62%|######1 | 81818/132723 [00:01<00:00, 76175.14KB/s]
+ 67%|######7 | 89505/132723 [00:01<00:00, 76384.64KB/s]
+ 73%|#######3 | 97144/132723 [00:01<00:00, 76378.58KB/s]
+ 79%|#######8 | 104782/132723 [00:01<00:00, 75872.05KB/s]
+ 85%|########4 | 112370/132723 [00:01<00:00, 75751.69KB/s]
+ 90%|######### | 119946/132723 [00:01<00:00, 75375.03KB/s]
+ 96%|#########6| 127485/132723 [00:01<00:00, 75175.26KB/s]
+100%|##########| 132723/132723 [00:01<00:00, 70741.71KB/s]
</pre></div>
</div>
<p>Create TVM runtime and do inference
@@ -496,7 +498,7 @@ Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from h
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 32.893 seconds)</p>
+<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 33.980 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-ssd-gluoncv-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/cccb17d28e5e8b2e94ea8cd5ec59f6ed/deploy_ssd_gluoncv.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_ssd_gluoncv.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/sg_execution_times.html b/docs/how_to/deploy_models/sg_execution_times.html
index 5f1687eba4..090a2f817f 100644
--- a/docs/how_to/deploy_models/sg_execution_times.html
+++ b/docs/how_to/deploy_models/sg_execution_times.html
@@ -327,7 +327,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-deploy-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>11:09.444</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
+<p><strong>11:03.720</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 86%" />
@@ -336,39 +336,39 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_object_detection_pytorch.html#sphx-glr-how-to-deploy-models-deploy-object-detection-pytorch-py"><span class="std std-ref">Compile PyTorch Object Detection Models</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_object_detection_pytorch.py</span></code>)</p></td>
-<td><p>02:56.393</p></td>
+<td><p>02:54.438</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_ssd_gluoncv.html#sphx-glr-how-to-deploy-models-deploy-ssd-gluoncv-py"><span class="std std-ref">Deploy Single Shot Multibox Detector(SSD) model</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_ssd_gluoncv.py</span></code>)</p></td>
-<td><p>02:32.893</p></td>
+<td><p>02:33.980</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized_tflite.html#sphx-glr-how-to-deploy-models-deploy-prequantized-tflite-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite)</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized_tflite.py</span></code>)</p></td>
-<td><p>01:52.732</p></td>
+<td><p>01:52.227</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_quantized.html#sphx-glr-how-to-deploy-models-deploy-quantized-py"><span class="std std-ref">Deploy a Quantized Model on Cuda</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_quantized.py</span></code>)</p></td>
-<td><p>01:25.819</p></td>
+<td><p>01:22.168</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized.html#sphx-glr-how-to-deploy-models-deploy-prequantized-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized.py</span></code>)</p></td>
-<td><p>01:07.576</p></td>
+<td><p>01:07.664</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_android.html#sphx-glr-how-to-deploy-models-deploy-model-on-android-py"><span class="std std-ref">Deploy the Pretrained Model on Android</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_android.py</span></code>)</p></td>
-<td><p>00:29.309</p></td>
+<td><p>00:29.555</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_model_on_nano.html#sphx-glr-how-to-deploy-models-deploy-model-on-nano-py"><span class="std std-ref">Deploy the Pretrained Model on Jetson Nano</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_nano.py</span></code>)</p></td>
-<td><p>00:22.628</p></td>
+<td><p>00:22.025</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_rasp.html#sphx-glr-how-to-deploy-models-deploy-model-on-rasp-py"><span class="std std-ref">Deploy the Pretrained Model on Raspberry Pi</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_rasp.py</span></code>)</p></td>
-<td><p>00:22.088</p></td>
+<td><p>00:21.658</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_sparse.html#sphx-glr-how-to-deploy-models-deploy-sparse-py"><span class="std std-ref">Deploy a Hugging Face Pruned Model on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_sparse.py</span></code>)</p></td>
-<td><p>00:00.006</p></td>
+<td><p>00:00.007</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/extend_tvm/bring_your_own_datatypes.html b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
index 174b94829f..5878f88e4b 100644
--- a/docs/how_to/extend_tvm/bring_your_own_datatypes.html
+++ b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
@@ -608,7 +608,7 @@ In this alpha state of the Bring Your Own Datatypes framework, we have not imple
<span class="n">module</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#dict" title="builtins.dict" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">params</span></a> <span class="o">=</span> <span class="n">get_mobilenet</span><span class="p">()</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip43250d6e-f713-42db-bbf0-2168e938c07c from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip18a0ccaa-7530-4369-9b75-790d8fb0e3ef from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
</pre></div>
</div>
<p>It’s easy to execute MobileNet with native TVM:</p>
diff --git a/docs/how_to/extend_tvm/sg_execution_times.html b/docs/how_to/extend_tvm/sg_execution_times.html
index f29d56e84c..0ae4fb1513 100644
--- a/docs/how_to/extend_tvm/sg_execution_times.html
+++ b/docs/how_to/extend_tvm/sg_execution_times.html
@@ -327,7 +327,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-extend-tvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:40.258</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
+<p><strong>00:40.753</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 84%" />
@@ -336,19 +336,19 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="bring_your_own_datatypes.html#sphx-glr-how-to-extend-tvm-bring-your-own-datatypes-py"><span class="std std-ref">Bring Your Own Datatypes to TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">bring_your_own_datatypes.py</span></code>)</p></td>
-<td><p>00:37.192</p></td>
+<td><p>00:37.678</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="use_pass_instrument.html#sphx-glr-how-to-extend-tvm-use-pass-instrument-py"><span class="std std-ref">How to Use TVM Pass Instrument</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_instrument.py</span></code>)</p></td>
-<td><p>00:02.156</p></td>
+<td><p>00:02.153</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="use_pass_infra.html#sphx-glr-how-to-extend-tvm-use-pass-infra-py"><span class="std std-ref">How to Use TVM Pass Infra</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_infra.py</span></code>)</p></td>
-<td><p>00:00.902</p></td>
+<td><p>00:00.915</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="low_level_custom_pass.html#sphx-glr-how-to-extend-tvm-low-level-custom-pass-py"><span class="std std-ref">Writing a Customized Pass</span></a> (<code class="docutils literal notranslate"><span class="pre">low_level_custom_pass.py</span></code>)</p></td>
-<td><p>00:00.007</p></td>
+<td><p>00:00.008</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/extend_tvm/use_pass_instrument.html b/docs/how_to/extend_tvm/use_pass_instrument.html
index 4ac50566d1..1a6241b827 100644
--- a/docs/how_to/extend_tvm/use_pass_instrument.html
+++ b/docs/how_to/extend_tvm/use_pass_instrument.html
@@ -512,10 +512,10 @@ profile the execution time of each passes.</p>
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 6774us [6774us] (45.94%; 45.94%)
-FoldScaleAxis: 7972us [5us] (54.06%; 54.06%)
- FoldConstant: 7968us [1673us] (54.03%; 99.94%)
- InferType: 6295us [6295us] (42.69%; 79.00%)
+InferType: 6751us [6751us] (45.94%; 45.94%)
+FoldScaleAxis: 7945us [5us] (54.06%; 54.06%)
+ FoldConstant: 7940us [1626us] (54.03%; 99.94%)
+ InferType: 6313us [6313us] (42.96%; 79.52%)
</pre></div>
</div>
</div>
@@ -537,10 +537,10 @@ Refer to following sections and <a class="reference internal" href="../../refere
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 6278us [6278us] (44.45%; 44.45%)
-FoldScaleAxis: 7846us [4us] (55.55%; 55.55%)
- FoldConstant: 7841us [1647us] (55.52%; 99.95%)
- InferType: 6195us [6195us] (43.86%; 79.00%)
+InferType: 6342us [6342us] (44.68%; 44.68%)
+FoldScaleAxis: 7852us [5us] (55.32%; 55.32%)
+ FoldConstant: 7847us [1632us] (55.29%; 99.94%)
+ InferType: 6215us [6215us] (43.79%; 79.20%)
</pre></div>
</div>
<p>Register empty list to clear existing instruments.</p>
diff --git a/docs/how_to/optimize_operators/opt_conv_cuda.html b/docs/how_to/optimize_operators/opt_conv_cuda.html
index abd5d8da57..893c864ed9 100644
--- a/docs/how_to/optimize_operators/opt_conv_cuda.html
+++ b/docs/how_to/optimize_operators/opt_conv_cuda.html
@@ -564,7 +564,7 @@ latency of convolution.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Convolution: </span><span class="si">%f</span><span class="s2"> ms"</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">b</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">*</span> <span cl [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 39.244670 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 54.208480 ms
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-optimize-operators-opt-conv-cuda-py">
diff --git a/docs/how_to/optimize_operators/opt_conv_tensorcore.html b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
index b8d7d8bb4f..a54f355180 100644
--- a/docs/how_to/optimize_operators/opt_conv_tensorcore.html
+++ b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
@@ -906,7 +906,7 @@ be able to run on our build server</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"conv2d with tensor core: </span><span class="si">%f</span><span class="s2"> ms"</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">* [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 13.376755 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 6.682477 ms
</pre></div>
</div>
</div>
diff --git a/docs/how_to/optimize_operators/opt_gemm.html b/docs/how_to/optimize_operators/opt_gemm.html
index 61f3149ed8..cb402f8ea5 100644
--- a/docs/how_to/optimize_operators/opt_gemm.html
+++ b/docs/how_to/optimize_operators/opt_gemm.html
@@ -461,8 +461,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
<span class="nb">print</span><span class="p">(</span><span class="s2">"Baseline: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018204
-Baseline: 3.337629
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.017839
+Baseline: 3.487118
</pre></div>
</div>
<p>In TVM, we can always inspect lower level IR to debug or optimize our schedule.
@@ -522,7 +522,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt1: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.294708
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.293766
</pre></div>
</div>
<p>Here is the generated IR after blocking.</p>
@@ -589,7 +589,7 @@ vastly.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt2: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.326730
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.330934
</pre></div>
</div>
<p>Here is the generated IR after vectorization.</p>
@@ -650,7 +650,7 @@ the access pattern for A matrix is more cache friendly.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt3: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.115941
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.113946
</pre></div>
</div>
<p>Here is the generated IR after loop permutation.</p>
@@ -733,7 +733,7 @@ flattening.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt4: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.109459
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.109250
</pre></div>
</div>
<p>Here is the generated IR after array packing.</p>
@@ -819,7 +819,7 @@ write to C when all the block results are ready.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt5: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.113573
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.111532
</pre></div>
</div>
<p>Here is the generated IR after blocking.</p>
@@ -909,7 +909,7 @@ write to C when all the block results are ready.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt6: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">opt6_time</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.147124
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.147087
</pre></div>
</div>
<p>Here is the generated IR after parallelization.</p>
diff --git a/docs/how_to/optimize_operators/sg_execution_times.html b/docs/how_to/optimize_operators/sg_execution_times.html
index 32d43596f6..17aef5823d 100644
--- a/docs/how_to/optimize_operators/sg_execution_times.html
+++ b/docs/how_to/optimize_operators/sg_execution_times.html
@@ -327,7 +327,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-optimize-operators-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:34.390</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
+<p><strong>00:34.556</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -336,15 +336,15 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="opt_gemm.html#sphx-glr-how-to-optimize-operators-opt-gemm-py"><span class="std std-ref">How to optimize GEMM on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_gemm.py</span></code>)</p></td>
-<td><p>00:31.811</p></td>
+<td><p>00:32.220</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="opt_conv_tensorcore.html#sphx-glr-how-to-optimize-operators-opt-conv-tensorcore-py"><span class="std std-ref">How to optimize convolution using TensorCores</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_tensorcore.py</span></code>)</p></td>
-<td><p>00:01.419</p></td>
+<td><p>00:01.290</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="opt_conv_cuda.html#sphx-glr-how-to-optimize-operators-opt-conv-cuda-py"><span class="std std-ref">How to optimize convolution on GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_cuda.py</span></code>)</p></td>
-<td><p>00:01.160</p></td>
+<td><p>00:01.046</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
index a5390cc6d3..3982f8d5f3 100644
--- a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
+++ b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
@@ -327,7 +327,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-tune-with-autoscheduler-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>06:13.305</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
+<p><strong>06:26.086</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 85%" />
@@ -336,27 +336,27 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_layer_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py"><span class="std std-ref">Auto-scheduling a Convolution Layer for GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_layer_cuda.py</span></code>)</p></td>
-<td><p>03:20.140</p></td>
+<td><p>03:21.096</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_network_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-x86-py"><span class="std std-ref">Auto-scheduling a Neural Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_x86.py</span></code>)</p></td>
-<td><p>01:21.526</p></td>
+<td><p>01:22.136</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_network_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-cuda-py"><span class="std std-ref">Auto-scheduling a Neural Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_cuda.py</span></code>)</p></td>
-<td><p>00:55.852</p></td>
+<td><p>00:56.053</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_sparse_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-sparse-x86-py"><span class="std std-ref">Auto-scheduling Sparse Matrix Multiplication on CPU with Custom Sketch Rule</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_sparse_x86.py</span></code>)</p></td>
-<td><p>00:18.698</p></td>
+<td><p>00:29.591</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_network_mali.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-mali-py"><span class="std std-ref">Auto-scheduling a Neural Network for mali GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_mali.py</span></code>)</p></td>
-<td><p>00:08.639</p></td>
+<td><p>00:08.701</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_network_arm.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-arm-py"><span class="std std-ref">Auto-scheduling a Neural Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_arm.py</span></code>)</p></td>
-<td><p>00:08.450</p></td>
+<td><p>00:08.509</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
index cf7121ac70..8b0d7b157d 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
@@ -491,483 +491,316 @@ cooperative fetching, unrolling and operator fusion.</p>
compute: Buffer(compute_2: Pointer(float32), float32, [25088], [])}
buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute}
preflattened_buffer_map = {data_1: data_3: Buffer(data_2, float32, [1, 512, 7, 7], []), kernel_1: kernel_3: Buffer(kernel_2, float32, [512, 512, 3, 3], []), bias_1: bias_3: Buffer(bias_2, float32, [1, 512, 1, 1], []), compute_1: compute_3: Buffer(compute_2, float32, [1, 512, 7, 7], [])} {
- attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 28;
- allocate(conv2d_nchw: Pointer(local float32), float32, [14]), storage_scope = local;
- allocate(pad_temp.shared: Pointer(shared float32), float32, [72]), storage_scope = shared;
- allocate(kernel.shared: Pointer(shared float32), float32, [3072]), storage_scope = shared;
- attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
- conv2d_nchw_1: Buffer(conv2d_nchw, float32, [14], [], scope="local", align=32)[0] = 0f32
+ attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 64;
+ allocate(conv2d_nchw: Pointer(local float32), float32, [7]), storage_scope = local;
+ allocate(pad_temp.shared: Pointer(shared float32), float32, [4032]), storage_scope = shared;
+ allocate(kernel.shared: Pointer(shared float32), float32, [1536]), storage_scope = shared;
+ attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ conv2d_nchw_1: Buffer(conv2d_nchw, float32, [7], [], scope="local", align=16)[0] = 0f32
conv2d_nchw_1[1] = 0f32
conv2d_nchw_1[2] = 0f32
conv2d_nchw_1[3] = 0f32
conv2d_nchw_1[4] = 0f32
conv2d_nchw_1[5] = 0f32
conv2d_nchw_1[6] = 0f32
- conv2d_nchw_1[7] = 0f32
- conv2d_nchw_1[8] = 0f32
- conv2d_nchw_1[9] = 0f32
- conv2d_nchw_1[10] = 0f32
- conv2d_nchw_1[11] = 0f32
- conv2d_nchw_1[12] = 0f32
- conv2d_nchw_1[13] = 0f32
- for (rc.outer.outer: int32, 0, 64) {
- for (ry.outer.outer: int32, 0, 3) {
- let cse_var_2: int32 = (rc.outer.outer*72)
- let cse_var_1: int32 = (ry.outer.outer*3)
+ for (rc.outer.outer: int32, 0, 8) {
+ for (rx.outer.outer: int32, 0, 3) {
+ let cse_var_2: int32 = (rc.outer.outer*3136)
+ let cse_var_1: int32 = (rc.outer.outer*576)
{
- attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1: Buffer(pad_temp.shared, float32, [72], [], scope="shared")[(threadIdx.x_1*4)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1*4), 9))) && (floormod((threadIdx.x_1*4), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1*4), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + [...]
- }
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1[((threadIdx.x_1*4) + 1)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 1), 9))) && (floormod(((threadIdx.x_1*4) + 1), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 1), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 1), 9)) - 8)], 0 [...]
- }
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1[((threadIdx.x_1*4) + 2)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 2), 9))) && (floormod(((threadIdx.x_1*4) + 2), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 2), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 2), 9)) - 8)], 0 [...]
- }
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1[((threadIdx.x_1*4) + 3)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 3), 9))) && (floormod(((threadIdx.x_1*4) + 3), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 3), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 3), 9)) - 8)], 0 [...]
- }
+ attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1: Buffer(pad_temp.shared, float32, [4032], [], scope="shared")[(threadIdx.x_1*2)] = @tir.if_then_else(((((7 <= floormod((threadIdx.x_1*2), 63)) && (floormod((threadIdx.x_1*2), 63) < 56)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1*2), 63)*49)) + rx.outer.outer) + floormod((threadIdx.x_1*2), 6 [...]
+ pad_temp.shared_1[((threadIdx.x_1*2) + 1)] = @tir.if_then_else(((((7 <= floormod(((threadIdx.x_1*2) + 1), 63)) && (floormod(((threadIdx.x_1*2) + 1), 63) < 56)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 1), 63)*49)) + rx.outer.outer) + floormod(((threadIdx.x_1*2) + 1), 63)) - 8)], 0f32, dtype=float32)
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 112), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (fl [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 113), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 224), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (fl [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 225), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 336), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (fl [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 337), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 448), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (fl [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 449), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 560), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (fl [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 561), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 672), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (fl [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 673), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 784), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (fl [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 785), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 896), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (fl [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 897), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[((threadIdx.x_1*2) + 1008)] = @tir.if_then_else(((((7 <= floormod((threadIdx.x_1*2), 63)) && (floormod((threadIdx.x_1*2), 63) < 56)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1*2), 63)*49)) + rx.outer.outer) + floormod((threadIdx.x_1*2), 63)) + 776)], 0f32, dtype=float32)
+ pad_temp.shared_1[((threadIdx.x_1*2) + 1009)] = @tir.if_then_else(((((7 <= floormod(((threadIdx.x_1*2) + 1), 63)) && (floormod(((threadIdx.x_1*2) + 1), 63) < 56)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 1), 63)*49)) + rx.outer.outer) + floormod(((threadIdx.x_1*2) + 1), 63)) + 776)], 0f32, dtype=float32)
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1120), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1121), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1232), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1233), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1344), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1345), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1456), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1457), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1568), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1569), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1680), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1681), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1792), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1793), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1904), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 1905), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[((threadIdx.x_1*2) + 2016)] = @tir.if_then_else(((((7 <= floormod((threadIdx.x_1*2), 63)) && (floormod((threadIdx.x_1*2), 63) < 56)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1*2), 63)*49)) + rx.outer.outer) + floormod((threadIdx.x_1*2), 63)) + 1560)], 0f32, dtype=float32)
+ pad_temp.shared_1[((threadIdx.x_1*2) + 2017)] = @tir.if_then_else(((((7 <= floormod(((threadIdx.x_1*2) + 1), 63)) && (floormod(((threadIdx.x_1*2) + 1), 63) < 56)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 1), 63)*49)) + rx.outer.outer) + floormod(((threadIdx.x_1*2) + 1), 63)) + 1560)], 0f32, dtype=float32)
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2128), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2129), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2240), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2241), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2352), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2353), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2464), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2465), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2576), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2577), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2688), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2689), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2800), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2801), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2912), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 2913), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[((threadIdx.x_1*2) + 3024)] = @tir.if_then_else(((((7 <= floormod((threadIdx.x_1*2), 63)) && (floormod((threadIdx.x_1*2), 63) < 56)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1*2), 63)*49)) + rx.outer.outer) + floormod((threadIdx.x_1*2), 63)) + 2344)], 0f32, dtype=float32)
+ pad_temp.shared_1[((threadIdx.x_1*2) + 3025)] = @tir.if_then_else(((((7 <= floormod(((threadIdx.x_1*2) + 1), 63)) && (floormod(((threadIdx.x_1*2) + 1), 63) < 56)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)) < 8)), data[((((cse_var_2 + (floordiv(((threadIdx.x_1*2) + 1), 63)*49)) + rx.outer.outer) + floormod(((threadIdx.x_1*2) + 1), 63)) + 2344)], 0f32, dtype=float32)
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3136), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3137), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3248), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3249), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3360), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3361), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3472), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3473), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3584), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3585), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3696), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3697), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3808), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3809), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3920), 63)*63) + (floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9)*7)) + floormod((threadIdx.x_1*2), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9)) && (floormod((floordiv((threadIdx.x_1*2), 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod((threadIdx.x_1*2), 7)))) && ((rx.outer.outer + floormod((threadIdx.x_1*2), 7)) < 8)), data[(((((cse_var_2 + (f [...]
+ pad_temp.shared_1[(((floordiv(((threadIdx.x_1*2) + 3921), 63)*63) + (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9)*7)) + floormod(((threadIdx.x_1*2) + 1), 7))] = @tir.if_then_else(((((1 <= floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9)) && (floormod((floordiv(((threadIdx.x_1*2) + 1), 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7)))) && ((rx.outer.outer + floormod(((threadIdx.x_1*2) + 1), 7) [...]
+ }
+ attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1: Buffer(kernel.shared, float32, [1536], [], scope="shared")[threadIdx.x_2] = kernel[((((blockIdx.x*36864) + cse_var_1) + (threadIdx.x_2*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 56)] = kernel[(((((blockIdx.x*36864) + cse_var_1) + (floordiv((threadIdx.x_2 + 56), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 112)] = kernel[(((((blockIdx.x*36864) + cse_var_1) + (floordiv((threadIdx.x_2 + 112), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 168)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 168), 192)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 56), 64)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 224)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 224), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 280)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 280), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 88), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 336)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 336), 192)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 48), 64)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 392)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 392), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 8), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 448)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 448), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 504)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 504), 192)*4608)) + cse_var_1) + ((floordiv(threadIdx.x_2, 3) + 40)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 560)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 560), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 176), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 616)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 616), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 40), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 672)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 672), 192)*4608)) + cse_var_1) + ((floordiv(threadIdx.x_2, 3) + 32)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 728)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 728), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 152), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 784)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 784), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 840)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 840), 192)*4608)) + cse_var_1) + ((floordiv(threadIdx.x_2, 3) + 24)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 896)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 896), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 128), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 952)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 952), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 184), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 1008)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 1008), 192)*4608)) + cse_var_1) + ((floordiv(threadIdx.x_2, 3) + 16)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 1064)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 1064), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 104), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 1120)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 1120), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 160), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 1176)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 1176), 192)*4608)) + cse_var_1) + ((floordiv(threadIdx.x_2, 3) + 8)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 1232)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 1232), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 1288)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 1288), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 136), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel[(((((blockIdx.x*36864) + cse_var_1) + (threadIdx.x_2*3)) + rx.outer.outer) + 32256)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 1400)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 1400), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 56), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ kernel.shared_1[(threadIdx.x_2 + 1456)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 1456), 192)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 112), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+ if @tir.likely((threadIdx.x_2 < 24), dtype=bool) {
+ kernel.shared_1[(threadIdx.x_2 + 1512)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 1512), 192)*4608)) + cse_var_1) + ((floordiv(threadIdx.x_2, 3) + 56)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ }
+ for (rc.outer.inner: int32, 0, 16) {
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((rc.outer.inner*252) + floormod(threadIdx.x, 7))]*kernel.shared_1[((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12))]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 7)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12))]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 14)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12))]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 21)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12))]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12))]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 35)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12))]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 42)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12))]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 3)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 70)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 3)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 77)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 3)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 3)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 3)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 98)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 3)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 3)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 6)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 133)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 6)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 140)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 6)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 147)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 6)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 154)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 6)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 161)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 6)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 168)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 6)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 189)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 9)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 196)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 9)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 203)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 9)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 210)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 9)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 217)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 9)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 224)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 9)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 231)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 9)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 7)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 1)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 1)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 1)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 1)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 35)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 1)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 42)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 1)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 49)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 1)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 70)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 4)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 77)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 4)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 4)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 4)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 98)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 4)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 4)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 112)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 4)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 133)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 7)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 140)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 7)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 147)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 7)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 154)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 7)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 161)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 7)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 168)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 7)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 175)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 7)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 196)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 10)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 203)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 10)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 210)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 10)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 217)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 10)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 224)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 10)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 231)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 10)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 238)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 10)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 2)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 2)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 2)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 35)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 2)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 42)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 2)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 49)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 2)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 56)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 2)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 77)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 5)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 5)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 5)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 98)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 5)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 5)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 112)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 5)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 119)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 5)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 140)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 8)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 147)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 8)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 154)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 8)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 161)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 8)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 168)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 8)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 175)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 8)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 182)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 8)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 203)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 11)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 210)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 11)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 217)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 11)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 224)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 11)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 231)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 11)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 238)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 11)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*252) + floormod(threadIdx.x, 7)) + 245)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*192) + (rc.outer.inner*12)) + 11)]))
}
- attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1: Buffer(kernel.shared, float32, [3072], [], scope="shared")[threadIdx.x_2] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 64)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 64), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 128)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 128), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 192)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 36864)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 256)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 256), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 320)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 320), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 384)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 73728)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 448)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 448), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 512)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 512), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 576)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 110592)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 640)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 640), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 704)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 704), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 768)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 147456)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 832)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 832), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 896)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 896), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 960)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 184320)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1024)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1024), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1088)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1088), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1152)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 221184)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1216)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1216), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1280)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1280), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 258048)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1408)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1408), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1472)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1472), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1536)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 294912)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1600)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1600), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1664)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1664), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1728)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 331776)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1792)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1792), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1856)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1856), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1920)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 368640)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1984)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1984), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2048)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2048), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2112)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 405504)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2176)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2176), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2240)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2240), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2304)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 442368)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2368)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2368), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2432)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2432), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2496)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 479232)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2560)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2560), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2624)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2624), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2688)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 516096)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2752)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2752), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2816)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2816), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2880)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 552960)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2944)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2944), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 3008)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 3008), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[0]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[1]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[2]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[3]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[4]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[5]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[6]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[0]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 47)]))
}
}
}
- for (i1.inner: int32, 0, 2) {
- for (i3.inner: int32, 0, 7) {
- compute[(((((floordiv(blockIdx.x, 7)*6272) + (threadIdx.x*98)) + (i1.inner*49)) + (floormod(blockIdx.x, 7)*7)) + i3.inner)] = max((conv2d_nchw_1[((i1.inner*7) + i3.inner)] + bias[(((floordiv(blockIdx.x, 7)*128) + (threadIdx.x*2)) + i1.inner)]), 0f32)
- }
+ for (i2.inner: int32, 0, 7) {
+ compute[((((blockIdx.x*392) + (floordiv(threadIdx.x, 7)*49)) + (i2.inner*7)) + floormod(threadIdx.x, 7))] = max((conv2d_nchw_1[i2.inner] + bias[((blockIdx.x*8) + floordiv(threadIdx.x, 7))]), 0f32)
}
}
}
@@ -1004,7 +837,7 @@ cooperative fetching, unrolling and operator fusion.</p>
<span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.359 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.367 ms
</pre></div>
</div>
</div>
@@ -1034,35 +867,35 @@ conv2d_nchw_nn_o_o_i, conv2d_nchw_nn_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o
conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1)
conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
-conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2)
-conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=64)
+conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
+conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=8)
conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
-conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
+conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=7)
conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
-conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
-conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
+conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
+conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
-conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
-conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=4)
+conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=4)
+conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=16)
conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
-conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
+conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=3)
conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
-conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=3)
+conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=1)
s[conv2d_nchw].reorder(conv2d_nchw_nn_o_o_o_o, conv2d_nchw_ff_o_o_o_o, conv2d_nchw_yy_o_o_o_o, conv2d_nchw_xx_o_o_o_o, conv2d_nchw_nn_o_o_o_i, conv2d_nchw_ff_o_o_o_i, conv2d_nchw_yy_o_o_o_i, conv2d_nchw_xx_o_o_o_i, conv2d_nchw_nn_o_o_i, conv2d_nchw_ff_o_o_i, conv2d_nchw_yy_o_o_i, conv2d_nchw_xx_o_o_i, conv2d_nchw_rc_o_o, conv2d_nchw_ry_o_o, conv2d_nchw_rx_o_o, conv2d_nchw_rc_o_i, conv2d_nchw_ry_o_i, conv2d_nchw_rx_o_i, conv2d_nchw_nn_o_i, conv2d_nchw_ff_o_i, conv2d_nchw_yy_o_i, conv2d_nc [...]
compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
-compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
-compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=64)
+compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=1)
+compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=8)
compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
-compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
+compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=7)
compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
-compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
-compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
+compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
+compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -1082,12 +915,12 @@ s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=56)
s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=2)
s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=56)
s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 512)
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
@@ -1107,10 +940,10 @@ CUDA source code:
#define int64_t long long
#define uint64_t unsigned long long
#endif
-extern "C" __global__ void __launch_bounds__(64) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
- float conv2d_nchw[14];
- __shared__ float pad_temp_shared[72];
- __shared__ float kernel_shared[3072];
+extern "C" __global__ void __launch_bounds__(56) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+ float conv2d_nchw[7];
+ __shared__ float pad_temp_shared[4032];
+ __shared__ float kernel_shared[1536];
conv2d_nchw[0] = 0.000000e+00f;
conv2d_nchw[1] = 0.000000e+00f;
conv2d_nchw[2] = 0.000000e+00f;
@@ -1118,419 +951,202 @@ extern "C" __global__ void __launch_bounds__(64) default_function_kern
conv2d_nchw[4] = 0.000000e+00f;
conv2d_nchw[5] = 0.000000e+00f;
conv2d_nchw[6] = 0.000000e+00f;
- conv2d_nchw[7] = 0.000000e+00f;
- conv2d_nchw[8] = 0.000000e+00f;
- conv2d_nchw[9] = 0.000000e+00f;
- conv2d_nchw[10] = 0.000000e+00f;
- conv2d_nchw[11] = 0.000000e+00f;
- conv2d_nchw[12] = 0.000000e+00f;
- conv2d_nchw[13] = 0.000000e+00f;
- for (int rc_outer_outer = 0; rc_outer_outer < 64; ++rc_outer_outer) {
- for (int ry_outer_outer = 0; ry_outer_outer < 3; ++ry_outer_outer) {
+ for (int rc_outer_outer = 0; rc_outer_outer < 8; ++rc_outer_outer) {
+ for (int rx_outer_outer = 0; rx_outer_outer < 3; ++rx_outer_outer) {
__syncthreads();
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[(((int)threadIdx.x) * 4)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) * 4) % 9))) && (((((int)threadIdx.x) * 4) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) * 4) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) * 4) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) * 2)] = (((((7 <= ((((int)threadIdx.x) * 2) % 63)) && (((((int)threadIdx.x) * 2) % 63) < 56)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + (((((int)threadIdx.x) * 2) / 63) * 49)) + rx_outer_outer) + ((((int)threadIdx.x) * 2) % 63)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[((((int)threadIdx.x) * 2) + 1)] = (((((7 <= (((((int)threadIdx.x) * 2) + 1) % 63)) && ((((((int)threadIdx.x) * 2) + 1) % 63) < 56)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 1) / 63) * 49)) + rx_outer_outer) + (((((int)threadIdx.x) * 2) + 1) % 63)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 112) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 7) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 7) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 113) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 31 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 224) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 5) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 5) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 225) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 31 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 336) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 3) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 3) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 337) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 31 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 448) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 1) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 1) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 449) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 31 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 560) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 8) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 8) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 561) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 31 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 672) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 6) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 6) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 673) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 31 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 784) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 4) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 4) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 785) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 31 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 896) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 2) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 2) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 897) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 31 [...]
+ pad_temp_shared[((((int)threadIdx.x) * 2) + 1008)] = (((((7 <= ((((int)threadIdx.x) * 2) % 63)) && (((((int)threadIdx.x) * 2) % 63) < 56)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + (((((int)threadIdx.x) * 2) / 63) * 49)) + rx_outer_outer) + ((((int)threadIdx.x) * 2) % 63)) + 776)] : 0.000000e+00f);
+ pad_temp_shared[((((int)threadIdx.x) * 2) + 1009)] = (((((7 <= (((((int)threadIdx.x) * 2) + 1) % 63)) && ((((((int)threadIdx.x) * 2) + 1) % 63) < 56)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 1) / 63) * 49)) + rx_outer_outer) + (((((int)threadIdx.x) * 2) + 1) % 63)) + 776)] : 0.000000e+00f);
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1120) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 7) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 7) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1121) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1232) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 5) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 5) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1233) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1344) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 3) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 3) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1345) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1456) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 1) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 1) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1457) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1568) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 8) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 8) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1569) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1680) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 6) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 6) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1681) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1792) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 4) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 4) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1793) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1904) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 2) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 2) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 1905) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((int)threadIdx.x) * 2) + 2016)] = (((((7 <= ((((int)threadIdx.x) * 2) % 63)) && (((((int)threadIdx.x) * 2) % 63) < 56)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + (((((int)threadIdx.x) * 2) / 63) * 49)) + rx_outer_outer) + ((((int)threadIdx.x) * 2) % 63)) + 1560)] : 0.000000e+00f);
+ pad_temp_shared[((((int)threadIdx.x) * 2) + 2017)] = (((((7 <= (((((int)threadIdx.x) * 2) + 1) % 63)) && ((((((int)threadIdx.x) * 2) + 1) % 63) < 56)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 1) / 63) * 49)) + rx_outer_outer) + (((((int)threadIdx.x) * 2) + 1) % 63)) + 1560)] : 0.000000e+00f);
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2128) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 7) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 7) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2129) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2240) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 5) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 5) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2241) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2352) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 3) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 3) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2353) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2464) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 1) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 1) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2465) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2576) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 8) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 8) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2577) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2688) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 6) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 6) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2689) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2800) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 4) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 4) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2801) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2912) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 2) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 2) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 2913) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((int)threadIdx.x) * 2) + 3024)] = (((((7 <= ((((int)threadIdx.x) * 2) % 63)) && (((((int)threadIdx.x) * 2) % 63) < 56)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + (((((int)threadIdx.x) * 2) / 63) * 49)) + rx_outer_outer) + ((((int)threadIdx.x) * 2) % 63)) + 2344)] : 0.000000e+00f);
+ pad_temp_shared[((((int)threadIdx.x) * 2) + 3025)] = (((((7 <= (((((int)threadIdx.x) * 2) + 1) % 63)) && ((((((int)threadIdx.x) * 2) + 1) % 63) < 56)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + 1) / 63) * 49)) + rx_outer_outer) + (((((int)threadIdx.x) * 2) + 1) % 63)) + 2344)] : 0.000000e+00f);
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3136) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 7) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 7) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3137) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3248) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 5) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 5) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3249) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3360) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 3) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 3) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3361) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3472) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 1) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 1) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3473) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3584) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 8) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 8) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3585) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3696) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 6) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 6) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3697) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3808) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 4) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 4) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3809) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3920) / 63) * 63) + (((((((int)threadIdx.x) * 2) / 7) + 2) % 9) * 7)) + ((((int)threadIdx.x) * 2) % 7))] = (((((1 <= ((((((int)threadIdx.x) * 2) / 7) + 2) % 9)) && (((((((int)threadIdx.x) * 2) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)))) && ((rx_outer_outer + ((((int)threadIdx.x) * 2) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + ((((((int)threadIdx.x) * 2) + [...]
+ pad_temp_shared[((((((((int)threadIdx.x) * 2) + 3921) / 63) * 63) + ((((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9) * 7)) + (((((int)threadIdx.x) * 2) + 1) % 7))] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9)) && ((((((((int)threadIdx.x) * 2) + 1) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)))) && ((rx_outer_outer + (((((int)threadIdx.x) * 2) + 1) % 7)) < 8)) ? data[((((((rc_outer_outer * 3 [...]
+ kernel_shared[((int)threadIdx.x)] = kernel[((((((int)blockIdx.x) * 36864) + (rc_outer_outer * 576)) + (((int)threadIdx.x) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 56)] = kernel[(((((((int)blockIdx.x) * 36864) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 56) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 112)] = kernel[(((((((int)blockIdx.x) * 36864) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 112) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 168)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 168) / 192) * 4608)) + (rc_outer_outer * 576)) + ((((((int)threadIdx.x) / 3) + 56) & 63) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 224)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 224) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 32) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 280)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 280) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 88) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 336)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 336) / 192) * 4608)) + (rc_outer_outer * 576)) + ((((((int)threadIdx.x) / 3) + 48) & 63) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 392)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 392) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 8) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 448)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 448) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 64) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 504)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 504) / 192) * 4608)) + (rc_outer_outer * 576)) + (((int)threadIdx.x) * 3)) + rx_outer_outer) + 360)];
+ kernel_shared[(((int)threadIdx.x) + 560)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 560) / 192) * 4608)) + (rc_outer_outer * 576)) + ((((((int)threadIdx.x) + 176) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 616)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 616) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 40) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 672)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 672) / 192) * 4608)) + (rc_outer_outer * 576)) + (((int)threadIdx.x) * 3)) + rx_outer_outer) + 288)];
+ kernel_shared[(((int)threadIdx.x) + 728)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 728) / 192) * 4608)) + (rc_outer_outer * 576)) + ((((((int)threadIdx.x) + 152) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 784)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 784) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 16) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 840)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 840) / 192) * 4608)) + (rc_outer_outer * 576)) + (((int)threadIdx.x) * 3)) + rx_outer_outer) + 216)];
+ kernel_shared[(((int)threadIdx.x) + 896)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 896) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 128) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 952)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 952) / 192) * 4608)) + (rc_outer_outer * 576)) + ((((((int)threadIdx.x) + 184) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 1008)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 1008) / 192) * 4608)) + (rc_outer_outer * 576)) + (((int)threadIdx.x) * 3)) + rx_outer_outer) + 144)];
+ kernel_shared[(((int)threadIdx.x) + 1064)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 1064) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 104) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 1120)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 1120) / 192) * 4608)) + (rc_outer_outer * 576)) + ((((((int)threadIdx.x) + 160) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 1176)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 1176) / 192) * 4608)) + (rc_outer_outer * 576)) + (((int)threadIdx.x) * 3)) + rx_outer_outer) + 72)];
+ kernel_shared[(((int)threadIdx.x) + 1232)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 1232) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 80) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 1288)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 1288) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 136) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[(((((((int)blockIdx.x) * 36864) + (rc_outer_outer * 576)) + (((int)threadIdx.x) * 3)) + rx_outer_outer) + 32256)];
+ kernel_shared[(((int)threadIdx.x) + 1400)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 1400) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 56) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 1456)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 1456) / 192) * 4608)) + (rc_outer_outer * 576)) + (((((int)threadIdx.x) + 112) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ if (((int)threadIdx.x) < 24) {
+ kernel_shared[(((int)threadIdx.x) + 1512)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 1512) / 192) * 4608)) + (rc_outer_outer * 576)) + (((int)threadIdx.x) * 3)) + rx_outer_outer) + 504)];
}
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 1)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 1) % 9))) && ((((((int)threadIdx.x) * 4) + 1) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 1) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 1) % 9)) - 8)] : 0.000000e+00f);
- }
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 2)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 2) % 9))) && ((((((int)threadIdx.x) * 4) + 2) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 2) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 2) % 9)) - 8)] : 0.000000e+00f);
- }
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 3)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 3) % 9))) && ((((((int)threadIdx.x) * 4) + 3) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 3) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 3) % 9)) - 8)] : 0.000000e+00f);
- }
- kernel_shared[((int)threadIdx.x)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
- kernel_shared[(((int)threadIdx.x) + 64)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 64) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 128)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 128) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 192)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 36864)];
- kernel_shared[(((int)threadIdx.x) + 256)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 256) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 320)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 320) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 384)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 73728)];
- kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 448) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 512)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 512) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 576)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 110592)];
- kernel_shared[(((int)threadIdx.x) + 640)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 640) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 704)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 704) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 768)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 147456)];
- kernel_shared[(((int)threadIdx.x) + 832)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 832) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 896) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 960)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 184320)];
- kernel_shared[(((int)threadIdx.x) + 1024)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1024) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1088)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1088) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1152)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 221184)];
- kernel_shared[(((int)threadIdx.x) + 1216)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1216) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1280)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1280) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 258048)];
- kernel_shared[(((int)threadIdx.x) + 1408)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1408) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1472)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1472) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1536)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 294912)];
- kernel_shared[(((int)threadIdx.x) + 1600)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1600) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1664)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1664) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1728)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 331776)];
- kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1792) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1856)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1856) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1920)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 368640)];
- kernel_shared[(((int)threadIdx.x) + 1984)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1984) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2048)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2048) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2112)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 405504)];
- kernel_shared[(((int)threadIdx.x) + 2176)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2176) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2240) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2304)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 442368)];
- kernel_shared[(((int)threadIdx.x) + 2368)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2368) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2432)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2432) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2496)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 479232)];
- kernel_shared[(((int)threadIdx.x) + 2560)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2560) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2624)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2624) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 516096)];
- kernel_shared[(((int)threadIdx.x) + 2752)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2752) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2816)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2816) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2880)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 552960)];
- kernel_shared[(((int)threadIdx.x) + 2944)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2944) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 3008)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3008) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
__syncthreads();
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[0] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[1] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[2] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[3] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[4] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[5] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[6] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[0] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ for (int rc_outer_inner = 0; rc_outer_inner < 16; ++rc_outer_inner) {
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 252) + (((int)threadIdx.x) % 7))] * kernel_shared[(((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12))]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 7)] * kernel_shared[(((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12))]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 14)] * kernel_shared[(((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12))]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 21)] * kernel_shared[(((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12))]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[(((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12))]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 35)] * kernel_shared[(((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12))]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 42)] * kernel_shared[(((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12))]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 70)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 77)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 98)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 133)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 140)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 147)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 154)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 161)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 168)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 189)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 196)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 203)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 210)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 217)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 224)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 231)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 7)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 35)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 42)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 49)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 70)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 77)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 98)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 112)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 133)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 140)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 147)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 154)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 161)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 168)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 175)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 196)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 203)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 210)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 217)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 224)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 231)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 238)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 35)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 42)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 49)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 56)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 77)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 98)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 112)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 119)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 140)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 147)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 154)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 161)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 168)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 175)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 182)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 203)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 210)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 217)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 224)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 231)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 238)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + (((int)threadIdx.x) % 7)) + 245)] * kernel_shared[((((((int)threadIdx.x) / 7) * 192) + (rc_outer_inner * 12)) + 11)]));
+ }
}
}
- for (int i1_inner = 0; i1_inner < 2; ++i1_inner) {
- for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
- compute[((((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 98)) + (i1_inner * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[((i1_inner * 7) + i3_inner)] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) * 2)) + i1_inner)]), 0.000000e+00f);
- }
+ for (int i2_inner = 0; i2_inner < 7; ++i2_inner) {
+ compute[((((((int)blockIdx.x) * 392) + ((((int)threadIdx.x) / 7) * 49)) + (i2_inner * 7)) + (((int)threadIdx.x) % 7))] = max((conv2d_nchw[i2_inner] + bias[((((int)blockIdx.x) * 8) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
}
}
</pre></div>
@@ -1567,7 +1183,7 @@ In the example below we resume the status and do more 5 trials.</p>
Get devices for measurement successfully!
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 20.140 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 21.096 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/e3e540f3b477c0c52d8eb73e674e8ffd/tune_conv2d_layer_cuda.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_conv2d_layer_cuda.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
index eacdbc3cc6..c2221ed890 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
@@ -902,7 +902,7 @@ so we can read the log file and load the best schedules.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 8.1900 8.1873 8.2025 8.1802 0.0093
+ 8.1540 8.1555 8.1560 8.1506 0.0024
</pre></div>
</div>
</div>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
index dfbea000e2..da7b53a5c2 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
@@ -921,7 +921,7 @@ so we can read the log file and load the best schedules.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 752.8236 752.8325 753.0603 752.5780 0.1970
+ 755.7181 756.6768 756.7382 753.7392 1.3995
</pre></div>
</div>
</div>
@@ -943,7 +943,7 @@ to learn how to use the RPC Tracker and RPC Server.
To use the RPC Tracker in auto-scheduler, replace the runner in <code class="code docutils literal notranslate"><span class="pre">TuningOptions</span></code>
with <a class="reference internal" href="../../reference/api/python/auto_scheduler.html#tvm.auto_scheduler.RPCRunner" title="tvm.auto_scheduler.RPCRunner"><code class="xref any py py-class docutils literal notranslate"><span class="pre">auto_scheduler.RPCRunner</span></code></a>.</p></li>
</ol>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 21.526 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 22.136 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-network-x86-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/e416b94ca1090b0897c0f6e0df95b911/tune_network_x86.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_network_x86.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
index 97e1a7ef6b..5de2a15c1f 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
@@ -625,105 +625,78 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [65536], []),
compute: Buffer(compute_2: Pointer(float32), float32, [65536], [])}
buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute}
- preflattened_buffer_map = {placeholder_5: placeholder_15: Buffer(placeholder_10, float32, [128, 256], []), placeholder_8: placeholder_16: Buffer(placeholder_13, int32, [33], []), placeholder_9: placeholder_17: Buffer(placeholder_14, float32, [128, 512], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_6: placeholder_18: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_7: placeholder_19: Buffer(placeholder_12, int32, [4916], [])} {
- for (i0.outer.i1.outer.fused: int32, 0, 32) "parallel" {
- allocate(compute_4: Pointer(global float32), float32, [2048]), storage_scope = global {
- for (i.outer.inner: int32, 0, 16) {
- for (i.inner.init: int32, 0, 8) {
- let cse_var_1: int32 = ((i.outer.inner*128) + (i.inner.init*16))
- {
- compute_5: Buffer(compute_4, float32, [2048], [])[cse_var_1] = 0f32
- compute_5[(cse_var_1 + 1)] = 0f32
- compute_5[(cse_var_1 + 2)] = 0f32
- compute_5[(cse_var_1 + 3)] = 0f32
- compute_5[(cse_var_1 + 4)] = 0f32
- compute_5[(cse_var_1 + 5)] = 0f32
- compute_5[(cse_var_1 + 6)] = 0f32
- compute_5[(cse_var_1 + 7)] = 0f32
- compute_5[(cse_var_1 + 8)] = 0f32
- compute_5[(cse_var_1 + 9)] = 0f32
- compute_5[(cse_var_1 + 10)] = 0f32
- compute_5[(cse_var_1 + 11)] = 0f32
- compute_5[(cse_var_1 + 12)] = 0f32
- compute_5[(cse_var_1 + 13)] = 0f32
- compute_5[(cse_var_1 + 14)] = 0f32
- compute_5[(cse_var_1 + 15)] = 0f32
- }
- }
- for (elem_idx: int32, 0, (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])) {
- for (i.inner: int32, 0, 8) {
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_2: int32 = ((i.outer.inner*128) + (i.inner*16))
- compute_5[cse_var_2] = (compute_5[cse_var_2] + (placeholder_1[((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16))]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_3: int32 = (((i.outer.inner*128) + (i.inner*16)) + 1)
- compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 1)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_4: int32 = (((i.outer.inner*128) + (i.inner*16)) + 2)
- compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 2)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_5: int32 = (((i.outer.inner*128) + (i.inner*16)) + 3)
- compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 3)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_6: int32 = (((i.outer.inner*128) + (i.inner*16)) + 4)
- compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 4)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_7: int32 = (((i.outer.inner*128) + (i.inner*16)) + 5)
- compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 5)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_8: int32 = (((i.outer.inner*128) + (i.inner*16)) + 6)
- compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 6)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_9: int32 = (((i.outer.inner*128) + (i.inner*16)) + 7)
- compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 7)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ preflattened_buffer_map = {placeholder_6: placeholder_15: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_7: placeholder_16: Buffer(placeholder_12, int32, [4916], []), placeholder_8: placeholder_17: Buffer(placeholder_13, int32, [33], []), placeholder_5: placeholder_18: Buffer(placeholder_10, float32, [128, 256], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_9: placeholder_19: Buffer(placeholder_14, float32, [128, 512], [])} {
+ for (i0.outer.i1.outer.fused: int32, 0, 16) "parallel" {
+ allocate(compute_4: Pointer(global float32), float32, [4096]), storage_scope = global {
+ for (i.outer.inner: int32, 0, 8) {
+ for (nb_j.inner: int32, 0, 2) {
+ for (i.inner.init: int32, 0, 16) {
+ let cse_var_1: int32 = (((i.outer.inner*512) + (i.inner.init*32)) + (nb_j.inner*16))
+ {
+ compute_5: Buffer(compute_4, float32, [4096], [])[cse_var_1] = 0f32
+ compute_5[(cse_var_1 + 1)] = 0f32
+ compute_5[(cse_var_1 + 2)] = 0f32
+ compute_5[(cse_var_1 + 3)] = 0f32
+ compute_5[(cse_var_1 + 4)] = 0f32
+ compute_5[(cse_var_1 + 5)] = 0f32
+ compute_5[(cse_var_1 + 6)] = 0f32
+ compute_5[(cse_var_1 + 7)] = 0f32
+ compute_5[(cse_var_1 + 8)] = 0f32
+ compute_5[(cse_var_1 + 9)] = 0f32
+ compute_5[(cse_var_1 + 10)] = 0f32
+ compute_5[(cse_var_1 + 11)] = 0f32
+ compute_5[(cse_var_1 + 12)] = 0f32
+ compute_5[(cse_var_1 + 13)] = 0f32
+ compute_5[(cse_var_1 + 14)] = 0f32
+ compute_5[(cse_var_1 + 15)] = 0f32
}
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_10: int32 = (((i.outer.inner*128) + (i.inner*16)) + 8)
- compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 8)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_11: int32 = (((i.outer.inner*128) + (i.inner*16)) + 9)
- compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 9)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_12: int32 = (((i.outer.inner*128) + (i.inner*16)) + 10)
- compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 10)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_13: int32 = (((i.outer.inner*128) + (i.inner*16)) + 11)
- compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 11)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_14: int32 = (((i.outer.inner*128) + (i.inner*16)) + 12)
- compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 12)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_15: int32 = (((i.outer.inner*128) + (i.inner*16)) + 13)
- compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 13)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_16: int32 = (((i.outer.inner*128) + (i.inner*16)) + 14)
- compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 14)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_17: int32 = (((i.outer.inner*128) + (i.inner*16)) + 15)
- compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 15)]*max(placeholder[(((i.outer.inner*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ for (elem_idx: int32, 0, let cse_var_2: int32 = ((i0.outer.i1.outer.fused*2) + nb_j.inner) in (placeholder_3[(cse_var_2 + 1)] - placeholder_3[cse_var_2])) {
+ for (i.inner: int32, 0, 16) {
+ let cse_var_21: int32 = (elem_idx*16)
+ let cse_var_20: int32 = ((i0.outer.i1.outer.fused*2) + nb_j.inner)
+ let cse_var_19: int32 = ((i.outer.inner*4096) + (i.inner*256))
+ let cse_var_18: int32 = (((i.outer.inner*512) + (i.inner*32)) + (nb_j.inner*16))
+ let cse_var_17: int32 = (cse_var_18 + 9)
+ let cse_var_16: int32 = (cse_var_18 + 8)
+ let cse_var_15: int32 = (cse_var_18 + 7)
+ let cse_var_14: int32 = (cse_var_18 + 6)
+ let cse_var_13: int32 = (cse_var_18 + 5)
+ let cse_var_12: int32 = (cse_var_18 + 4)
+ let cse_var_11: int32 = (cse_var_18 + 3)
+ let cse_var_10: int32 = (cse_var_18 + 2)
+ let cse_var_9: int32 = (cse_var_18 + 15)
+ let cse_var_8: int32 = (cse_var_18 + 14)
+ let cse_var_7: int32 = (cse_var_18 + 13)
+ let cse_var_6: int32 = (cse_var_18 + 12)
+ let cse_var_5: int32 = (cse_var_18 + 11)
+ let cse_var_4: int32 = (cse_var_18 + 10)
+ let cse_var_3: int32 = (cse_var_18 + 1)
+ {
+ compute_5[cse_var_18] = (compute_5[cse_var_18] + (placeholder_1[((placeholder_3[cse_var_20]*16) + cse_var_21)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 1)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 2)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 3)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 4)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 5)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 6)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 7)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 8)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 9)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 10)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 11)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 12)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 13)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 14)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[cse_var_20]*16) + cse_var_21) + 15)]*max(placeholder[(cse_var_19 + placeholder_2[(placeholder_3[cse_var_20] + elem_idx)])], 0f32)))
+ }
}
}
}
}
for (i0.inner: int32, 0, 128) {
- for (i1.inner: int32, 0, 16) {
- let cse_var_18: int32 = (((i0.inner*512) + (i0.outer.i1.outer.fused*16)) + i1.inner)
- compute[cse_var_18] = max((compute_5[((i0.inner*16) + i1.inner)] + placeholder_4[cse_var_18]), 0f32)
- }
+ let cse_var_22: int32 = ((i0.inner*512) + (i0.outer.i1.outer.fused*32))
+ compute[ramp(cse_var_22, 1, 32)] = max((compute_5[ramp((i0.inner*32), 1, 32)] + placeholder_4[ramp(cse_var_22, 1, 32)]), broadcast(0f32, 32))
}
}
}
@@ -761,7 +734,7 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
<span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.823 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.722 ms
</pre></div>
</div>
<div class="admonition note">
diff --git a/docs/how_to/tune_with_autotvm/sg_execution_times.html b/docs/how_to/tune_with_autotvm/sg_execution_times.html
index 4500abdc50..3b5cb2d03b 100644
--- a/docs/how_to/tune_with_autotvm/sg_execution_times.html
+++ b/docs/how_to/tune_with_autotvm/sg_execution_times.html
@@ -327,7 +327,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-tune-with-autotvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:32.020</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
+<p><strong>00:29.562</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 84%" />
@@ -336,22 +336,22 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-conv2d-cuda-py"><span class="std std-ref">Tuning High Performance Convolution on NVIDIA GPUs</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_cuda.py</span></code>)</p></td>
-<td><p>00:31.984</p></td>
+<td><p>00:29.526</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_relay_x86.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-x86-py"><span class="std std-ref">Auto-tuning a Convolutional Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_x86.py</span></code>)</p></td>
-<td><p>00:00.021</p></td>
+<td><p>00:00.020</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_relay_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-cuda-py"><span class="std std-ref">Auto-tuning a Convolutional Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_cuda.py</span></code>)</p></td>
<td><p>00:00.005</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-even"><td><p><a class="reference internal" href="tune_relay_mobile_gpu.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-mobile-gpu-py"><span class="std std-ref">Auto-tuning a Convolutional Network for Mobile GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_mobile_gpu.py</span></code>)</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="tune_relay_arm.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-arm-py"><span class="std std-ref">Auto-tuning a Convolutional Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_arm.py</span></code>)</p></td>
<td><p>00:00.005</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="tune_relay_arm.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-arm-py"><span class="std std-ref">Auto-tuning a Convolutional Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_arm.py</span></code>)</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="tune_relay_mobile_gpu.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-mobile-gpu-py"><span class="std std-ref">Auto-tuning a Convolutional Network for Mobile GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_mobile_gpu.py</span></code>)</p></td>
<td><p>00:00.005</p></td>
<td><p>0.0 MB</p></td>
</tr>
diff --git a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
index 5ebfd41f82..bebceac807 100644
--- a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
+++ b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
@@ -557,7 +557,9 @@ for this template</p>
waiting for device...
device available
Get devices for measurement successfully!
-No: 1 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+No: 1 GFLOPS: 24.13/24.13 result: MeasureResult(costs=(0.009593230454545455,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.932642698287964, timestamp=1663882327.1766) [('tile_f', [-1, 4, 4, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8593881
+No: 2 GFLOPS: 83.93/83.93 result: MeasureResult(costs=(0.0027582268448275863,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.507809638977051, timestamp=1663882328.0813944) [('tile_f', [-1, 16, 16, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,8717369
+No: 3 GFLOPS: 0.00/83.93 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -679,9 +681,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 64, 1, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 256]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9095346
-No: 2 GFLOPS: 3.44/3.44 result: MeasureResult(costs=(0.06739114375,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.442340850830078, timestamp=1663875119.019349) [('tile_f', [-1, 4, 1, 8]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 16, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,274698
-No: 3 GFLOPS: 0.00/3.44 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 4, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 256, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10132656
+No: 4 GFLOPS: 0.00/83.93 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -803,8 +804,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 2, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 512, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7002488
-No: 4 GFLOPS: 0.00/3.44 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 16, 16]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 128, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6414062
+No: 5 GFLOPS: 0.00/83.93 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -926,8 +927,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 8, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 512, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3712624
-No: 5 GFLOPS: 0.00/3.44 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 2, 64]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 2, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10384866
+No: 6 GFLOPS: 0.00/83.93 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1049,8 +1050,9 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 16, 2]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 128]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5595345
-No: 6 GFLOPS: 0.00/3.44 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 8, 8]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8280956
+No: 7 GFLOPS: 91.25/91.25 result: MeasureResult(costs=(0.002536887925,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.1385846138000488, timestamp=1663882331.5116751) [('tile_f', [-1, 2, 64, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6074686
+No: 8 GFLOPS: 0.00/91.25 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1172,9 +1174,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 64, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 64]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9838447
-No: 7 GFLOPS: 3.57/3.57 result: MeasureResult(costs=(0.06481407925,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.5743279457092285, timestamp=1663875125.0370524) [('tile_f', [-1, 1, 32, 16]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4686404
-No: 8 GFLOPS: 0.00/3.57 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 1, 64]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 128]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8309381
+No: 9 GFLOPS: 0.00/91.25 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1296,9 +1297,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 256, 2, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8422278
-No: 9 GFLOPS: 59.00/59.00 result: MeasureResult(costs=(0.003923450037037037,), error_no=MeasureErrorNo.NO_ERROR, all_cost=5.324518442153931, timestamp=1663875130.5515735) [('tile_f', [-1, 1, 8, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8238461
-No: 10 GFLOPS: 0.00/59.00 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 4, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 64, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1185556
+No: 10 GFLOPS: 0.00/91.25 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1420,9 +1420,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 16, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7790458
-No: 11 GFLOPS: 234.33/234.33 result: MeasureResult(costs=(0.0009879114594594594,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.1547462940216064, timestamp=1663875131.228428) [('tile_f', [-1, 2, 32, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 64]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5386081
-No: 12 GFLOPS: 0.00/234.33 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 8, 16]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4361239
+No: 11 GFLOPS: 0.00/91.25 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1544,8 +1543,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 1, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 128, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9512987
-No: 13 GFLOPS: 0.00/234.33 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 2, 16]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9862111
+No: 12 GFLOPS: 0.00/91.25 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1667,8 +1666,10 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 32, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 4]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7048272
-No: 14 GFLOPS: 0.00/234.33 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 16, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,875069
+No: 13 GFLOPS: 34.82/91.25 result: MeasureResult(costs=(0.00664771,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.352648973464966, timestamp=1663882335.246479) [('tile_f', [-1, 1, 8, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,1862937
+No: 14 GFLOPS: 0.98/91.25 result: MeasureResult(costs=(0.23562669349999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.185566663742065, timestamp=1663882338.6236575) [('tile_f', [-1, 256, 2, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,260498
+No: 15 GFLOPS: 0.00/91.25 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1790,10 +1791,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 4, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 64, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7413900
-No: 15 GFLOPS: 817.00/817.00 result: MeasureResult(costs=(0.00028335495759717314,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.201112985610962, timestamp=1663875132.6936429) [('tile_f', [-1, 1, 8, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1557847
-No: 16 GFLOPS: 823.54/823.54 result: MeasureResult(costs=(0.0002811060505226481,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.271367073059082, timestamp=1663875133.633881) [('tile_f', [-1, 1, 32, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6791440
-No: 17 GFLOPS: 0.00/823.54 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 1, 16]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5792986
+No: 16 GFLOPS: 0.00/91.25 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1915,9 +1914,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 2, 16]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1699894
-No: 18 GFLOPS: 305.95/823.54 result: MeasureResult(costs=(0.0007566715524475524,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.3589026927947998, timestamp=1663875135.184876) [('tile_f', [-1, 1, 16, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9308725
-No: 19 GFLOPS: 0.00/823.54 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 32, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 32, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,474802
+No: 17 GFLOPS: 0.00/91.25 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2039,8 +2037,9 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 1, 128]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10278391
-No: 20 GFLOPS: 0.00/823.54 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 32, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 2, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7673692
+No: 18 GFLOPS: 1204.15/1204.15 result: MeasureResult(costs=(0.0001922534928741093,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.188037395477295, timestamp=1663882340.0105078) [('tile_f', [-1, 1, 8, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,1947959
+No: 19 GFLOPS: 0.00/1204.15 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2162,7 +2161,130 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 8, 16]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2182359
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 4, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,130215
+No: 20 GFLOPS: 0.00/1204.15 result: Traceback (most recent call last):
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+ func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+ func = build(s, args, target_host=task.target_host, runtime=runtime)
+ File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+ input_mod = lower(inputs, args, name=name, binds=binds)
+ File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+ return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+ File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+tvm._ffi.base.TVMError: Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1731
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1671
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1646
+ 13: operator()
+ at ../src/driver/driver_api.cc:379
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:365
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:260
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:453
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1750
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1694
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1618
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+Traceback (most recent call last):
+ 24: TVMFuncCall
+ at ../src/runtime/c_runtime_api.cc:477
+ 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 22: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 21: operator()
+ at ../include/tvm/runtime/packed_func.h:1731
+ 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+ at ../include/tvm/runtime/packed_func.h:1671
+ 19: run<>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1631
+ 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+ at ../include/tvm/runtime/packed_func.h:1646
+ 13: operator()
+ at ../src/driver/driver_api.cc:379
+ 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+ at ../src/driver/driver_api.cc:365
+ 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+ at ../src/driver/driver_api.cc:260
+ 10: tvm::transform::Pass::operator()(tvm::IRModule) const
+ at ../src/ir/transform.cc:258
+ 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:453
+ 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/ir/transform.cc:274
+ 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+ at ../src/tir/ir/transform.cc:100
+ 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+ at ../include/tvm/runtime/packed_func.h:1750
+ 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+ at ../include/tvm/runtime/packed_func.h:1694
+ 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+ at ../include/tvm/runtime/packed_func.h:1618
+ 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+ at ../include/tvm/runtime/packed_func.h:1217
+ 1: Call
+ at ../include/tvm/runtime/packed_func.h:1213
+ 0: operator()
+ at ../src/runtime/c_runtime_api.cc:534
+ File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+ File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+ raise InstantiationError("Skipped because of invalid gpu kernel")
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 1, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5929408
</pre></div>
</div>
<p>Finally we can inspect the best config from log file, check correctness,
@@ -2201,9 +2323,9 @@ and measure running time.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Finish loading 20 records
Best config:
-[('tile_f', [-1, 1, 32, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6791440
+[('tile_f', [-1, 1, 8, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,1947959
Finish loading 20 records
-Time cost of this operator: 0.000695
+Time cost of this operator: 0.000484
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autotvm-tune-conv2d-cuda-py">
diff --git a/docs/how_to/work_with_microtvm/micro_autotune.html b/docs/how_to/work_with_microtvm/micro_autotune.html
index a11b45d60d..6df0eb15c8 100644
--- a/docs/how_to/work_with_microtvm/micro_autotune.html
+++ b/docs/how_to/work_with_microtvm/micro_autotune.html
@@ -582,10 +582,10 @@ the tuned operator.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>########## Build without Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 311.7 98.72 (1, 2, 10, 10, 3) 2 1 [311.7]
-tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.071 0.973 (1, 6, 10, 10) 1 1 [3.071]
-tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.97 0.307 (1, 1, 10, 10, 3) 1 1 [0.97]
-Total_time - 315.741 - - - - -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 310.4 98.714 (1, 2, 10, 10, 3) 2 1 [310.4]
+tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.086 0.981 (1, 6, 10, 10) 1 1 [3.086]
+tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.958 0.305 (1, 1, 10, 10, 3) 1 1 [0.958]
+Total_time - 314.444 - - - - -
</pre></div>
</div>
</div>
@@ -636,10 +636,10 @@ Total_time -
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>########## Build with Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 136.0 98.039 (1, 6, 10, 10, 1) 2 1 [136.0]
-tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.765 1.273 (1, 6, 10, 10) 1 1 [1.765]
-tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.956 0.689 (1, 1, 10, 10, 3) 1 1 [0.956]
-Total_time - 138.721 - - - - -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 181.3 98.431 (1, 1, 10, 10, 6) 2 1 [181.3]
+tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.93 1.048 (1, 6, 10, 10) 1 1 [1.93]
+tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.96 0.521 (1, 1, 10, 10, 3) 1 1 [0.96]
+Total_time - 184.189 - - - - -
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-autotune-py">
diff --git a/docs/how_to/work_with_microtvm/micro_train.html b/docs/how_to/work_with_microtvm/micro_train.html
index 6e1573e6f9..251d0e0792 100644
--- a/docs/how_to/work_with_microtvm/micro_train.html
+++ b/docs/how_to/work_with_microtvm/micro_train.html
@@ -516,7 +516,7 @@ take about <strong>2 minutes</strong> to download the Stanford Cars, while COCO
<a href="https://docs.python.org/3/library/shutil.html#shutil.move" title="shutil.move" class="sphx-glr-backref-module-shutil sphx-glr-backref-type-py-function"><span class="n">shutil</span><span class="o">.</span><span class="n">move</span></a><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><a href="https://docs.python.org/3/library/stdtypes.html#str" title="builtins.str" class="sphx-glr-backref-module-builtins sphx-glr-backref-typ [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>'/tmp/tmpnvgydylo/images/random'
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>'/tmp/tmp64nz5xxv/images/random'
</pre></div>
</div>
</div>
@@ -576,8 +576,8 @@ objects to other stuff? We can display some examples from our datasets using <co
<span class="n">plt</span><span class="o">.</span><span class="n">axis</span><span class="p">(</span><span class="s2">"off"</span><span class="p">)</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmpnvgydylo/images/target contains 8144 images
-/tmp/tmpnvgydylo/images/random contains 5000 images
+<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmp64nz5xxv/images/target contains 8144 images
+/tmp/tmp64nz5xxv/images/random contains 5000 images
</pre></div>
</div>
</div>
@@ -689,13 +689,13 @@ the time on our validation set).</p>
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Epoch 1/3
-328/328 - 47s - loss: 0.2145 - accuracy: 0.9249 - val_loss: 0.1352 - val_accuracy: 0.9494 - 47s/epoch - 143ms/step
+328/328 - 47s - loss: 0.2080 - accuracy: 0.9267 - val_loss: 0.1079 - val_accuracy: 0.9619 - 47s/epoch - 142ms/step
Epoch 2/3
-328/328 - 43s - loss: 0.1034 - accuracy: 0.9635 - val_loss: 0.1398 - val_accuracy: 0.9588 - 43s/epoch - 131ms/step
+328/328 - 43s - loss: 0.0903 - accuracy: 0.9685 - val_loss: 0.0994 - val_accuracy: 0.9687 - 43s/epoch - 132ms/step
Epoch 3/3
-328/328 - 43s - loss: 0.0649 - accuracy: 0.9764 - val_loss: 0.1187 - val_accuracy: 0.9600 - 43s/epoch - 131ms/step
+328/328 - 43s - loss: 0.0624 - accuracy: 0.9774 - val_loss: 0.0998 - val_accuracy: 0.9619 - 43s/epoch - 131ms/step
-<keras.callbacks.History object at 0x7f1f3eb99810>
+<keras.callbacks.History object at 0x7f340b3e67d0>
</pre></div>
</div>
</div>
@@ -957,7 +957,7 @@ as intended.</p>
<p>From here, we could modify the model to read live images from the camera - we have another
Arduino tutorial for how to do that <a class="reference external" href="https://github.com/guberti/tvm-arduino-demos/tree/master/examples/person_detection">on GitHub</a>. Alternatively, we could also
<a class="reference external" href="https://tvm.apache.org/docs/how_to/work_with_microtvm/micro_autotune.html">use TVM’s autotuning capabilities</a> to dramatically improve the model’s performance.</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 4 minutes 32.249 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 4 minutes 28.626 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-train-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/b52cec46baf4f78d6bcd94cbe269c8a6/micro_train.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">micro_train.py</span></code></a></p>
diff --git a/docs/how_to/work_with_microtvm/sg_execution_times.html b/docs/how_to/work_with_microtvm/sg_execution_times.html
index cfde73b15e..84a42e1869 100644
--- a/docs/how_to/work_with_microtvm/sg_execution_times.html
+++ b/docs/how_to/work_with_microtvm/sg_execution_times.html
@@ -327,7 +327,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-work-with-microtvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>05:25.397</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
+<p><strong>05:20.845</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -336,19 +336,19 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="micro_train.html#sphx-glr-how-to-work-with-microtvm-micro-train-py"><span class="std std-ref">Training Vision Models for microTVM on Arduino</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_train.py</span></code>)</p></td>
-<td><p>04:32.249</p></td>
+<td><p>04:28.626</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="micro_autotune.html#sphx-glr-how-to-work-with-microtvm-micro-autotune-py"><span class="std std-ref">Autotuning with microTVM</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_autotune.py</span></code>)</p></td>
-<td><p>00:42.333</p></td>
+<td><p>00:41.541</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="micro_aot.html#sphx-glr-how-to-work-with-microtvm-micro-aot-py"><span class="std std-ref">microTVM Host-Driven AoT</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_aot.py</span></code>)</p></td>
-<td><p>00:07.457</p></td>
+<td><p>00:07.415</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="micro_tflite.html#sphx-glr-how-to-work-with-microtvm-micro-tflite-py"><span class="std std-ref">microTVM with TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_tflite.py</span></code>)</p></td>
-<td><p>00:03.357</p></td>
+<td><p>00:03.261</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="micro_ethosu.html#sphx-glr-how-to-work-with-microtvm-micro-ethosu-py"><span class="std std-ref">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_ethosu.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_relay/sg_execution_times.html b/docs/how_to/work_with_relay/sg_execution_times.html
index 7713204724..38f718feb1 100644
--- a/docs/how_to/work_with_relay/sg_execution_times.html
+++ b/docs/how_to/work_with_relay/sg_execution_times.html
@@ -327,7 +327,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-work-with-relay-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:40.072</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
+<p><strong>00:42.533</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 84%" />
@@ -336,15 +336,15 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="using_pipeline_executor.html#sphx-glr-how-to-work-with-relay-using-pipeline-executor-py"><span class="std std-ref">Using Pipeline Executor in Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_pipeline_executor.py</span></code>)</p></td>
-<td><p>00:31.961</p></td>
+<td><p>00:31.214</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="using_external_lib.html#sphx-glr-how-to-work-with-relay-using-external-lib-py"><span class="std std-ref">Using External Libraries in Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_external_lib.py</span></code>)</p></td>
-<td><p>00:06.405</p></td>
+<td><p>00:09.844</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="build_gcn.html#sphx-glr-how-to-work-with-relay-build-gcn-py"><span class="std std-ref">Building a Graph Convolutional Network</span></a> (<code class="docutils literal notranslate"><span class="pre">build_gcn.py</span></code>)</p></td>
-<td><p>00:01.699</p></td>
+<td><p>00:01.469</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="using_relay_viz.html#sphx-glr-how-to-work-with-relay-using-relay-viz-py"><span class="std std-ref">Use Relay Visualizer to Visualize Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_relay_viz.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_schedules/intrin_math.html b/docs/how_to/work_with_schedules/intrin_math.html
index 69314ebf77..82e14a3442 100644
--- a/docs/how_to/work_with_schedules/intrin_math.html
+++ b/docs/how_to/work_with_schedules/intrin_math.html
@@ -522,7 +522,7 @@ The following example customizes CUDA lowering rule for <code class="code docuti
<a href="../../reference/api/python/ir.html#tvm.ir.register_intrin_lowering" title="tvm.ir.register_intrin_lowering" class="sphx-glr-backref-module-tvm-ir sphx-glr-backref-type-py-function"><span class="n">register_intrin_lowering</span></a><span class="p">(</span><span class="s2">"tir.exp"</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">"cuda"</span><span class="p">,</span> <span class="n">f</span><span class="o">= [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span><function my_cuda_math_rule at 0x7f1ed8bea170>
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span><function my_cuda_math_rule at 0x7f33abc0d950>
</pre></div>
</div>
<p>Register the rule to TVM with override option to override existing rule.
diff --git a/docs/how_to/work_with_schedules/sg_execution_times.html b/docs/how_to/work_with_schedules/sg_execution_times.html
index d748113c67..dfa5211aa2 100644
--- a/docs/how_to/work_with_schedules/sg_execution_times.html
+++ b/docs/how_to/work_with_schedules/sg_execution_times.html
@@ -327,7 +327,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-work-with-schedules-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:04.868</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
+<p><strong>00:07.564</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -336,19 +336,19 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="intrin_math.html#sphx-glr-how-to-work-with-schedules-intrin-math-py"><span class="std std-ref">Intrinsics and Math Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">intrin_math.py</span></code>)</p></td>
-<td><p>00:02.324</p></td>
+<td><p>00:05.338</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tensorize.html#sphx-glr-how-to-work-with-schedules-tensorize-py"><span class="std std-ref">Use Tensorize to Leverage Hardware Intrinsics</span></a> (<code class="docutils literal notranslate"><span class="pre">tensorize.py</span></code>)</p></td>
-<td><p>00:01.228</p></td>
+<td><p>00:00.990</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="reduction.html#sphx-glr-how-to-work-with-schedules-reduction-py"><span class="std std-ref">Reduction</span></a> (<code class="docutils literal notranslate"><span class="pre">reduction.py</span></code>)</p></td>
-<td><p>00:00.576</p></td>
+<td><p>00:00.539</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="scan.html#sphx-glr-how-to-work-with-schedules-scan-py"><span class="std std-ref">Scan and Recurrent Kernel</span></a> (<code class="docutils literal notranslate"><span class="pre">scan.py</span></code>)</p></td>
-<td><p>00:00.562</p></td>
+<td><p>00:00.518</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="extern_op.html#sphx-glr-how-to-work-with-schedules-extern-op-py"><span class="std std-ref">External Tensor Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">extern_op.py</span></code>)</p></td>
@@ -356,7 +356,7 @@
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="schedule_primitives.html#sphx-glr-how-to-work-with-schedules-schedule-primitives-py"><span class="std std-ref">Schedule Primitives in TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">schedule_primitives.py</span></code>)</p></td>
-<td><p>00:00.039</p></td>
+<td><p>00:00.040</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tedd.html#sphx-glr-how-to-work-with-schedules-tedd-py"><span class="std std-ref">Use Tensor Expression Debug Display (TEDD) for Visualization</span></a> (<code class="docutils literal notranslate"><span class="pre">tedd.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_schedules/tensorize.html b/docs/how_to/work_with_schedules/tensorize.html
index d48f39beb0..c7624482e6 100644
--- a/docs/how_to/work_with_schedules/tensorize.html
+++ b/docs/how_to/work_with_schedules/tensorize.html
@@ -577,7 +577,7 @@ The importing needs to happen before the tensorized GEMV being executed.</p>
C: Buffer(C_2: Pointer(float32), float32, [524288], [])}
buffer_map = {A_1: A, B_1: B, C_1: C}
preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [1024, 64], []), B_1: B_3: Buffer(B_2, float32, [512, 64], []), C_1: C_3: Buffer(C_2, float32, [1024, 512], [])} {
- attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpo8bhi268/input0.cc'\nsource_filename = \"/tmp/tmpo8bhi268/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = allo [...]
+ attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpnpp5qs6m/input0.cc'\nsource_filename = \"/tmp/tmpnpp5qs6m/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = allo [...]
for (i, 0, 1024) {
for (j.outer: int32, 0, 32) {
@tir.call_extern("gemv_update", @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_2, ((i*512) + (j.outer*16)), 16, 2, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_2, (i*64), 64, 1, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_2, (j.outer*1024), 1024, 1, dtype=handle), 16, 64, 64, dtype=int32)
diff --git a/docs/install/nnpack.html b/docs/install/nnpack.html
index aa2238b85b..3153785d75 100644
--- a/docs/install/nnpack.html
+++ b/docs/install/nnpack.html
@@ -224,17 +224,7 @@
<p class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
<ul class="current">
<li class="toctree-l1 current"><a class="reference internal" href="index.html">Installing TVM</a><ul class="current">
-<li class="toctree-l2 current"><a class="reference internal" href="from_source.html">Install from Source</a><ul class="current">
-<li class="toctree-l3"><a class="reference internal" href="from_source.html#developers-get-source-from-github">Developers: Get Source from Github</a></li>
-<li class="toctree-l3"><a class="reference internal" href="from_source.html#build-the-shared-library">Build the Shared Library</a></li>
-<li class="toctree-l3"><a class="reference internal" href="from_source.html#python-package-installation">Python Package Installation</a></li>
-<li class="toctree-l3 current"><a class="reference internal" href="from_source.html#install-contrib-libraries">Install Contrib Libraries</a><ul class="current">
-<li class="toctree-l4 current"><a class="current reference internal" href="#">NNPACK Contrib Installation</a></li>
-</ul>
-</li>
-<li class="toctree-l3"><a class="reference internal" href="from_source.html#enable-c-tests">Enable C++ Tests</a></li>
-</ul>
-</li>
+<li class="toctree-l2"><a class="reference internal" href="from_source.html">Install from Source</a></li>
<li class="toctree-l2"><a class="reference internal" href="docker.html">Docker Images</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">NNPACK Contrib Installation</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#conditions">Conditions</a></li>
diff --git a/docs/objects.inv b/docs/objects.inv
index 2dde1c0bb6..dc17e6c026 100644
Binary files a/docs/objects.inv and b/docs/objects.inv differ
diff --git a/docs/reference/api/doxygen/analyzer_8h.html b/docs/reference/api/doxygen/analyzer_8h.html
index 7fb675f2a4..891197cf9a 100644
--- a/docs/reference/api/doxygen/analyzer_8h.html
+++ b/docs/reference/api/doxygen/analyzer_8h.html
@@ -89,7 +89,7 @@ Include dependency graph for analyzer.h:</div>
</div><div class="textblock"><div class="dynheader">
This graph shows which files directly or indirectly include this file:</div>
<div class="dyncontent">
-<div class="center"><iframe scrolling="no" frameborder="0" src="analyzer_8h__dep__incl.svg" width="4750" height="752"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
+<div class="center"><iframe scrolling="no" frameborder="0" src="analyzer_8h__dep__incl.svg" width="5124" height="752"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
</div>
</div>
</div>
diff --git a/docs/reference/api/doxygen/analyzer_8h__dep__incl.svg b/docs/reference/api/doxygen/analyzer_8h__dep__incl.svg
index 899fc2b7cf..0dd4b1fd1b 100644
--- a/docs/reference/api/doxygen/analyzer_8h__dep__incl.svg
+++ b/docs/reference/api/doxygen/analyzer_8h__dep__incl.svg
@@ -4,1157 +4,1173 @@
<!-- Generated by graphviz version 2.40.1 (20161225.0304)
-->
<!-- Title: include/tvm/arith/analyzer.h Pages: 1 -->
-<svg width="3562pt" height="564pt"
- viewBox="0.00 0.00 3561.50 564.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<svg width="3843pt" height="564pt"
+ viewBox="0.00 0.00 3843.00 564.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 560)">
<title>include/tvm/arith/analyzer.h</title>
-<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-560 3557.5,-560 3557.5,4 -4,4"/>
+<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-560 3839,-560 3839,4 -4,4"/>
<!-- Node56 -->
<g id="node1" class="node">
<title>Node56</title>
-<polygon fill="#bfbfbf" stroke="#000000" points="3171,-536.5 3171,-555.5 3324,-555.5 3324,-536.5 3171,-536.5"/>
-<text text-anchor="middle" x="3247.5" y="-543.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/arith/analyzer.h</text>
+<polygon fill="#bfbfbf" stroke="#000000" points="3452.5,-536.5 3452.5,-555.5 3605.5,-555.5 3605.5,-536.5 3452.5,-536.5"/>
+<text text-anchor="middle" x="3529" y="-543.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/arith/analyzer.h</text>
</g>
<!-- Node57 -->
<g id="node2" class="node">
<title>Node57</title>
<g id="a_node2"><a xlink:href="int__solver_8h.html" target="_top" xlink:title="integer constraints data structures and solvers ">
-<polygon fill="#ffffff" stroke="#000000" points="1604.5,-469.5 1604.5,-499.5 1720.5,-499.5 1720.5,-469.5 1604.5,-469.5"/>
-<text text-anchor="start" x="1612.5" y="-487.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/arith/int</text>
-<text text-anchor="middle" x="1662.5" y="-476.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_solver.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1963,-469.5 1963,-499.5 2079,-499.5 2079,-469.5 1963,-469.5"/>
+<text text-anchor="start" x="1971" y="-487.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/arith/int</text>
+<text text-anchor="middle" x="2021" y="-476.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_solver.h</text>
</a>
</g>
</g>
<!-- Node56->Node57 -->
<g id="edge1" class="edge">
<title>Node56->Node57</title>
-<path fill="none" stroke="#191970" d="M3160.8458,-545.6179C2881.911,-544.0348 2009.2562,-536.2385 1729.5,-500 1726.6583,-499.6319 1723.7606,-499.1966 1720.8423,-498.7099"/>
-<polygon fill="#191970" stroke="#191970" points="3160.8933,-549.1181 3170.9126,-545.6738 3160.9322,-542.1182 3160.8933,-549.1181"/>
+<path fill="none" stroke="#191970" d="M3442.1696,-545.4523C3172.5498,-543.4006 2352.0414,-534.4201 2088,-500 2085.1586,-499.6296 2082.2612,-499.1924 2079.3431,-498.7042"/>
+<polygon fill="#191970" stroke="#191970" points="3442.2688,-548.953 3452.2948,-545.528 3442.3213,-541.9532 3442.2688,-548.953"/>
</g>
<!-- Node58 -->
<g id="node3" class="node">
<title>Node58</title>
<g id="a_node3"><a xlink:href="iter__affine__map_8h.html" target="_top" xlink:title="Iterator quasi-affine mapping patterns. ">
-<polygon fill="#ffffff" stroke="#000000" points="1739,-469.5 1739,-499.5 1858,-499.5 1858,-469.5 1739,-469.5"/>
-<text text-anchor="start" x="1747" y="-487.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/arith/iter</text>
-<text text-anchor="middle" x="1798.5" y="-476.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_affine_map.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2097.5,-469.5 2097.5,-499.5 2216.5,-499.5 2216.5,-469.5 2097.5,-469.5"/>
+<text text-anchor="start" x="2105.5" y="-487.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/arith/iter</text>
+<text text-anchor="middle" x="2157" y="-476.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_affine_map.h</text>
</a>
</g>
</g>
<!-- Node56->Node58 -->
<g id="edge2" class="edge">
<title>Node56->Node58</title>
-<path fill="none" stroke="#191970" d="M3160.3362,-545.2337C2898.1187,-542.59 2119.1585,-532.1707 1867.5,-500 1864.4554,-499.6108 1861.3473,-499.1498 1858.2173,-498.6347"/>
-<polygon fill="#191970" stroke="#191970" points="3160.5321,-548.7357 3170.5665,-545.3355 3160.6018,-541.7361 3160.5321,-548.7357"/>
+<path fill="none" stroke="#191970" d="M3441.9463,-545.0348C3189.8323,-541.9047 2462.1012,-530.4181 2226,-500 2222.9557,-499.6078 2219.848,-499.1444 2216.7183,-498.6274"/>
+<polygon fill="#191970" stroke="#191970" points="3442.1113,-548.5369 3452.1536,-545.1601 3442.1973,-541.5375 3442.1113,-548.5369"/>
</g>
<!-- Node59 -->
<g id="node4" class="node">
<title>Node59</title>
<g id="a_node4"><a xlink:href="operation_8h.html" target="_top" xlink:title="Operation node can generate one or multiple Tensors. ">
-<polygon fill="#ffffff" stroke="#000000" points="1876.5,-475 1876.5,-494 2022.5,-494 2022.5,-475 1876.5,-475"/>
-<text text-anchor="middle" x="1949.5" y="-482" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/te/operation.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2235,-475 2235,-494 2381,-494 2381,-475 2235,-475"/>
+<text text-anchor="middle" x="2308" y="-482" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/te/operation.h</text>
</a>
</g>
</g>
<!-- Node56->Node59 -->
<g id="edge3" class="edge">
<title>Node56->Node59</title>
-<path fill="none" stroke="#191970" d="M3160.5605,-541.8808C2919.6513,-530.4663 2246.5657,-498.5751 2022.8461,-487.9752"/>
-<polygon fill="#191970" stroke="#191970" points="3160.5169,-545.3825 3170.6714,-542.3598 3160.8483,-538.3904 3160.5169,-545.3825"/>
+<path fill="none" stroke="#191970" d="M3441.852,-541.6105C3212.2867,-530.0476 2594.0039,-498.9056 2381.1665,-488.1853"/>
+<polygon fill="#191970" stroke="#191970" points="3441.995,-545.122 3452.1584,-542.1296 3442.3472,-538.1309 3441.995,-545.122"/>
</g>
<!-- Node77 -->
<g id="node22" class="node">
<title>Node77</title>
<g id="a_node22"><a xlink:href="nn_2pooling_8h.html" target="_top" xlink:title="Pooling op constructions. ">
-<polygon fill="#ffffff" stroke="#000000" points="2953,-.5 2953,-30.5 3064,-30.5 3064,-.5 2953,-.5"/>
-<text text-anchor="start" x="2961" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
-<text text-anchor="middle" x="3008.5" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/pooling.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="3227.5,-.5 3227.5,-30.5 3338.5,-30.5 3338.5,-.5 3227.5,-.5"/>
+<text text-anchor="start" x="3235.5" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
+<text text-anchor="middle" x="3283" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/pooling.h</text>
</a>
</g>
</g>
<!-- Node56->Node77 -->
-<g id="edge112" class="edge">
+<g id="edge113" class="edge">
<title>Node56->Node77</title>
-<path fill="none" stroke="#191970" d="M3288.9226,-532.9752C3336.33,-515.1774 3407.5,-478.3289 3407.5,-417.5 3407.5,-417.5 3407.5,-417.5 3407.5,-149.5 3407.5,-77.9798 3174.2098,-37.4872 3064.3994,-22.3886"/>
-<polygon fill="#191970" stroke="#191970" points="3287.7081,-529.6925 3279.4938,-536.384 3290.0881,-536.2755 3287.7081,-529.6925"/>
+<path fill="none" stroke="#191970" d="M3570.4226,-532.9752C3617.83,-515.1774 3689,-478.3289 3689,-417.5 3689,-417.5 3689,-417.5 3689,-149.5 3689,-76.5729 3449.7945,-36.6507 3338.6013,-22.0326"/>
+<polygon fill="#191970" stroke="#191970" points="3569.2081,-529.6925 3560.9938,-536.384 3571.5881,-536.2755 3569.2081,-529.6925"/>
</g>
<!-- Node79 -->
<g id="node24" class="node">
<title>Node79</title>
<g id="a_node24"><a xlink:href="topi_2nn_8h.html" target="_top" xlink:title="NN op constructions. ">
-<polygon fill="#ffffff" stroke="#000000" points="3077.5,-73 3077.5,-92 3197.5,-92 3197.5,-73 3077.5,-73"/>
-<text text-anchor="middle" x="3137.5" y="-80" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="3361,-73 3361,-92 3481,-92 3481,-73 3361,-73"/>
+<text text-anchor="middle" x="3421" y="-80" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn.h</text>
</a>
</g>
</g>
<!-- Node56->Node79 -->
-<g id="edge113" class="edge">
+<g id="edge114" class="edge">
<title>Node56->Node79</title>
-<path fill="none" stroke="#191970" d="M3276.7754,-531.5131C3312.8705,-511.3449 3369.5,-471.1535 3369.5,-417.5 3369.5,-417.5 3369.5,-417.5 3369.5,-216.5 3369.5,-137.0782 3266.2882,-103.9901 3197.5475,-90.7603"/>
-<polygon fill="#191970" stroke="#191970" points="3274.7869,-528.6088 3267.6458,-536.4355 3278.109,-534.7703 3274.7869,-528.6088"/>
+<path fill="none" stroke="#191970" d="M3558.2754,-531.5131C3594.3705,-511.3449 3651,-471.1535 3651,-417.5 3651,-417.5 3651,-417.5 3651,-216.5 3651,-137.8221 3549.2763,-104.5268 3481.1294,-91.0492"/>
+<polygon fill="#191970" stroke="#191970" points="3556.2869,-528.6088 3549.1458,-536.4355 3559.609,-534.7703 3556.2869,-528.6088"/>
</g>
<!-- Node83 -->
<g id="node28" class="node">
<title>Node83</title>
<g id="a_node28"><a xlink:href="constant__utils_8h.html" target="_top" xlink:title="Utility functions for handling constants in TVM expressions. ">
-<polygon fill="#ffffff" stroke="#000000" points="2921,-402.5 2921,-432.5 3048,-432.5 3048,-402.5 2921,-402.5"/>
-<text text-anchor="start" x="2929" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
-<text text-anchor="middle" x="2984.5" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/constant_utils.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="3202.5,-402.5 3202.5,-432.5 3329.5,-432.5 3329.5,-402.5 3202.5,-402.5"/>
+<text text-anchor="start" x="3210.5" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
+<text text-anchor="middle" x="3266" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/constant_utils.h</text>
</a>
</g>
</g>
<!-- Node56->Node83 -->
-<g id="edge109" class="edge">
+<g id="edge110" class="edge">
<title>Node56->Node83</title>
-<path fill="none" stroke="#191970" d="M3218.4705,-531.8164C3168.5617,-507.4313 3066.9223,-457.771 3015.613,-432.7016"/>
-<polygon fill="#191970" stroke="#191970" points="3217.2108,-535.0963 3227.7323,-536.3416 3220.2839,-528.8069 3217.2108,-535.0963"/>
+<path fill="none" stroke="#191970" d="M3499.9705,-531.8164C3450.0617,-507.4313 3348.4223,-457.771 3297.113,-432.7016"/>
+<polygon fill="#191970" stroke="#191970" points="3498.7108,-535.0963 3509.2323,-536.3416 3501.7839,-528.8069 3498.7108,-535.0963"/>
</g>
<!-- Node86 -->
<g id="node31" class="node">
<title>Node86</title>
<g id="a_node31"><a xlink:href="nn_2bnn_8h.html" target="_top" xlink:title="Binary op constructions. ">
-<polygon fill="#ffffff" stroke="#000000" points="3230,-335.5 3230,-365.5 3341,-365.5 3341,-335.5 3230,-335.5"/>
-<text text-anchor="start" x="3238" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
-<text text-anchor="middle" x="3285.5" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/bnn.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="3511.5,-335.5 3511.5,-365.5 3622.5,-365.5 3622.5,-335.5 3511.5,-335.5"/>
+<text text-anchor="start" x="3519.5" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
+<text text-anchor="middle" x="3567" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/bnn.h</text>
</a>
</g>
</g>
<!-- Node56->Node86 -->
-<g id="edge110" class="edge">
+<g id="edge111" class="edge">
<title>Node56->Node86</title>
-<path fill="none" stroke="#191970" d="M3261.3216,-528.4371C3277.0457,-507.2338 3301.7978,-469.8177 3311.5,-433 3315.0109,-419.6771 3314.6915,-415.403 3311.5,-402 3308.3911,-388.9439 3301.4132,-375.4455 3295.4684,-365.5452"/>
-<polygon fill="#191970" stroke="#191970" points="3258.471,-526.4035 3255.2056,-536.4826 3264.0437,-530.6397 3258.471,-526.4035"/>
+<path fill="none" stroke="#191970" d="M3542.8216,-528.4371C3558.5457,-507.2338 3583.2978,-469.8177 3593,-433 3596.5109,-419.6771 3596.1915,-415.403 3593,-402 3589.8911,-388.9439 3582.9132,-375.4455 3576.9684,-365.5452"/>
+<polygon fill="#191970" stroke="#191970" points="3539.971,-526.4035 3536.7056,-536.4826 3545.5437,-530.6397 3539.971,-526.4035"/>
</g>
<!-- Node100 -->
<g id="node45" class="node">
<title>Node100</title>
<g id="a_node45"><a xlink:href="dilate_8h.html" target="_top" xlink:title="Dilate op constructions. ">
-<polygon fill="#ffffff" stroke="#000000" points="3192,-402.5 3192,-432.5 3303,-432.5 3303,-402.5 3192,-402.5"/>
-<text text-anchor="start" x="3200" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
-<text text-anchor="middle" x="3247.5" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/dilate.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="3473.5,-402.5 3473.5,-432.5 3584.5,-432.5 3584.5,-402.5 3473.5,-402.5"/>
+<text text-anchor="start" x="3481.5" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
+<text text-anchor="middle" x="3529" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/dilate.h</text>
</a>
</g>
</g>
<!-- Node56->Node100 -->
-<g id="edge111" class="edge">
+<g id="edge112" class="edge">
<title>Node56->Node100</title>
-<path fill="none" stroke="#191970" d="M3247.5,-526.2718C3247.5,-500.5195 3247.5,-455.9952 3247.5,-432.7016"/>
-<polygon fill="#191970" stroke="#191970" points="3244.0001,-526.3416 3247.5,-536.3416 3251.0001,-526.3416 3244.0001,-526.3416"/>
-</g>
-<!-- Node103 -->
-<g id="node48" class="node">
-<title>Node103</title>
-<g id="a_node48"><a xlink:href="greedy_8h.html" target="_top" xlink:title="This header file contains helper methods used in greedy algorithms for planning memory for USMP...">
-<polygon fill="#ffffff" stroke="#000000" points="3435.5,-469.5 3435.5,-499.5 3553.5,-499.5 3553.5,-469.5 3435.5,-469.5"/>
-<text text-anchor="start" x="3443.5" y="-487.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/usmp</text>
-<text text-anchor="middle" x="3494.5" y="-476.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/algo/greedy.h</text>
+<path fill="none" stroke="#191970" d="M3529,-526.2718C3529,-500.5195 3529,-455.9952 3529,-432.7016"/>
+<polygon fill="#191970" stroke="#191970" points="3525.5001,-526.3416 3529,-536.3416 3532.5001,-526.3416 3525.5001,-526.3416"/>
+</g>
+<!-- Node104 -->
+<g id="node49" class="node">
+<title>Node104</title>
+<g id="a_node49"><a xlink:href="greedy_8h.html" target="_top" xlink:title="This header file contains helper methods used in greedy algorithms for planning memory for USMP...">
+<polygon fill="#ffffff" stroke="#000000" points="3717,-469.5 3717,-499.5 3835,-499.5 3835,-469.5 3717,-469.5"/>
+<text text-anchor="start" x="3725" y="-487.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/usmp</text>
+<text text-anchor="middle" x="3776" y="-476.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/algo/greedy.h</text>
</a>
</g>
</g>
-<!-- Node56->Node103 -->
-<g id="edge108" class="edge">
-<title>Node56->Node103</title>
-<path fill="none" stroke="#191970" d="M3295.5975,-534.0243C3335.5664,-524.0725 3392.6053,-509.8705 3435.3613,-499.2248"/>
-<polygon fill="#191970" stroke="#191970" points="3294.6808,-530.6456 3285.8227,-536.4581 3296.3721,-537.4382 3294.6808,-530.6456"/>
+<!-- Node56->Node104 -->
+<g id="edge109" class="edge">
+<title>Node56->Node104</title>
+<path fill="none" stroke="#191970" d="M3577.0975,-534.0243C3617.0664,-524.0725 3674.1053,-509.8705 3716.8613,-499.2248"/>
+<polygon fill="#191970" stroke="#191970" points="3576.1808,-530.6456 3567.3227,-536.4581 3577.8721,-537.4382 3576.1808,-530.6456"/>
</g>
<!-- Node60 -->
<g id="node5" class="node">
<title>Node60</title>
<g id="a_node5"><a xlink:href="cublas_8h.html" target="_top" xlink:title="External function interface to cuBLAS libraries. ">
-<polygon fill="#ffffff" stroke="#000000" points="751,-335.5 751,-365.5 884,-365.5 884,-335.5 751,-335.5"/>
-<text text-anchor="start" x="759" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/contrib</text>
-<text text-anchor="middle" x="817.5" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/cublas.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2086.5,-335.5 2086.5,-365.5 2219.5,-365.5 2219.5,-335.5 2086.5,-335.5"/>
+<text text-anchor="start" x="2094.5" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/contrib</text>
+<text text-anchor="middle" x="2153" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/cublas.h</text>
</a>
</g>
</g>
<!-- Node59->Node60 -->
<g id="edge4" class="edge">
<title>Node59->Node60</title>
-<path fill="none" stroke="#191970" d="M1894.8197,-473.1647C1885.4083,-471.5364 1875.6974,-470.0574 1866.5,-469 1507.6205,-427.7412 1409.2668,-501.0978 1054.5,-433 979.4099,-418.5864 895.8914,-385.1623 850.7911,-365.5669"/>
-<polygon fill="#191970" stroke="#191970" points="1894.4136,-476.6478 1904.8759,-474.9773 1895.6553,-469.7588 1894.4136,-476.6478"/>
+<path fill="none" stroke="#191970" d="M2267.4151,-471.2425C2246.1305,-462.7944 2220.6753,-450.1743 2202,-433 2181.1547,-413.83 2166.3866,-383.7394 2158.8167,-365.6857"/>
+<polygon fill="#191970" stroke="#191970" points="2266.2705,-474.5517 2276.8617,-474.8318 2268.7569,-468.0081 2266.2705,-474.5517"/>
</g>
<!-- Node61 -->
<g id="node6" class="node">
<title>Node61</title>
<g id="a_node6"><a xlink:href="cuda_2dense_8h.html" target="_top" xlink:title="CUDA schedule for dense operation. ">
-<polygon fill="#ffffff" stroke="#000000" points="659.5,-201.5 659.5,-231.5 781.5,-231.5 781.5,-201.5 659.5,-201.5"/>
-<text text-anchor="start" x="667.5" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/cuda</text>
-<text text-anchor="middle" x="720.5" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/dense.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2005,-201.5 2005,-231.5 2127,-231.5 2127,-201.5 2005,-201.5"/>
+<text text-anchor="start" x="2013" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/cuda</text>
+<text text-anchor="middle" x="2066" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/dense.h</text>
</a>
</g>
</g>
<!-- Node59->Node61 -->
<g id="edge9" class="edge">
<title>Node59->Node61</title>
-<path fill="none" stroke="#191970" d="M1895.4621,-473.1778C1885.8531,-471.5168 1875.9098,-470.0235 1866.5,-469 1804.5003,-462.2561 790.2919,-475.3385 744.5,-433 687.1607,-379.9849 706.7098,-271.6282 716.4135,-231.7068"/>
-<polygon fill="#191970" stroke="#191970" points="1894.8558,-476.6248 1905.3185,-474.9567 1896.0992,-469.7361 1894.8558,-476.6248"/>
+<path fill="none" stroke="#191970" d="M2246.6751,-472.5491C2190.9049,-458.0052 2112.1173,-427.35 2077,-366 2052.1768,-322.6339 2058.4009,-260.2232 2063.0644,-231.7282"/>
+<polygon fill="#191970" stroke="#191970" points="2245.9066,-475.9647 2256.4573,-474.9977 2247.6065,-469.1742 2245.9066,-475.9647"/>
</g>
<!-- Node62 -->
<g id="node7" class="node">
<title>Node62</title>
<g id="a_node7"><a xlink:href="rocm_2dense_8h.html" target="_top" xlink:title="rocm schedule for dense operation ">
-<polygon fill="#ffffff" stroke="#000000" points="612.5,-134.5 612.5,-164.5 736.5,-164.5 736.5,-134.5 612.5,-134.5"/>
-<text text-anchor="start" x="620.5" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/rocm</text>
-<text text-anchor="middle" x="674.5" y="-141.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/dense.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2065,-134.5 2065,-164.5 2189,-164.5 2189,-134.5 2065,-134.5"/>
+<text text-anchor="start" x="2073" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/rocm</text>
+<text text-anchor="middle" x="2127" y="-141.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/dense.h</text>
</a>
</g>
</g>
<!-- Node59->Node62 -->
-<g id="edge98" class="edge">
+<g id="edge99" class="edge">
<title>Node59->Node62</title>
-<path fill="none" stroke="#191970" d="M1997.3621,-471.8076C2021.7406,-463.6225 2050.7436,-451.0707 2072.5,-433 2129.7769,-385.4263 2156.6237,-318.0528 2101.5,-268 1999.1825,-175.0947 997.869,-154.1792 736.5938,-150.2876"/>
-<polygon fill="#191970" stroke="#191970" points="1996.0334,-468.558 1987.5801,-474.945 1998.1713,-475.2236 1996.0334,-468.558"/>
+<path fill="none" stroke="#191970" d="M2344.1594,-470.6472C2361.9796,-462.1837 2382.4881,-449.7334 2396,-433 2424.0443,-398.2695 2439.2416,-374.2618 2418,-335 2367.2537,-241.2034 2246.3897,-188.161 2177.4493,-164.5458"/>
+<polygon fill="#191970" stroke="#191970" points="2342.2861,-467.6517 2334.5974,-474.941 2345.1536,-474.0374 2342.2861,-467.6517"/>
</g>
<!-- Node63 -->
<g id="node8" class="node">
<title>Node63</title>
<g id="a_node8"><a xlink:href="rocblas_8h.html" target="_top" xlink:title="include/tvm/topi/contrib\l/rocblas.h">
-<polygon fill="#ffffff" stroke="#000000" points="0,-335.5 0,-365.5 133,-365.5 133,-335.5 0,-335.5"/>
-<text text-anchor="start" x="8" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/contrib</text>
-<text text-anchor="middle" x="66.5" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/rocblas.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2276.5,-335.5 2276.5,-365.5 2409.5,-365.5 2409.5,-335.5 2276.5,-335.5"/>
+<text text-anchor="start" x="2284.5" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/contrib</text>
+<text text-anchor="middle" x="2343" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/rocblas.h</text>
</a>
</g>
</g>
<!-- Node59->Node63 -->
<g id="edge7" class="edge">
<title>Node59->Node63</title>
-<path fill="none" stroke="#191970" d="M1895.8027,-473.1469C1886.0901,-471.4723 1876.0235,-469.9823 1866.5,-469 1771.4831,-459.1996 235.1895,-470.8788 147.5,-433 115.069,-418.9909 88.9455,-385.2502 75.7854,-365.5303"/>
-<polygon fill="#191970" stroke="#191970" points="1895.2956,-476.6119 1905.7586,-474.9454 1896.5401,-469.7234 1895.2956,-476.6119"/>
+<path fill="none" stroke="#191970" d="M2340.5114,-470.0659C2355.4232,-461.5875 2371.6193,-449.2985 2380,-433 2391.8441,-409.966 2373.0499,-382.3521 2358.1917,-365.6308"/>
+<polygon fill="#191970" stroke="#191970" points="2338.6717,-467.0797 2331.474,-474.8543 2341.949,-473.2651 2338.6717,-467.0797"/>
</g>
<!-- Node64 -->
<g id="node9" class="node">
<title>Node64</title>
<g id="a_node9"><a xlink:href="cuda_2injective_8h.html" target="_top" xlink:title="CUDA schedule for injective operations. ">
-<polygon fill="#ffffff" stroke="#000000" points="1968.5,-335.5 1968.5,-365.5 2090.5,-365.5 2090.5,-335.5 1968.5,-335.5"/>
-<text text-anchor="start" x="1976.5" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/cuda</text>
-<text text-anchor="middle" x="2029.5" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/injective.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="454,-335.5 454,-365.5 576,-365.5 576,-335.5 454,-335.5"/>
+<text text-anchor="start" x="462" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/cuda</text>
+<text text-anchor="middle" x="515" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/injective.h</text>
</a>
</g>
</g>
<!-- Node59->Node64 -->
<g id="edge10" class="edge">
<title>Node59->Node64</title>
-<path fill="none" stroke="#191970" d="M1960.4742,-466.1182C1976.423,-439.404 2005.7096,-390.349 2020.4785,-365.611"/>
-<polygon fill="#191970" stroke="#191970" points="1957.3744,-464.4827 1955.2534,-474.8631 1963.3847,-468.071 1957.3744,-464.4827"/>
+<path fill="none" stroke="#191970" d="M2254.3018,-473.1554C2244.5893,-471.4799 2234.523,-469.9874 2225,-469 2134.6291,-459.6297 672.6789,-470.6682 590,-433 559.174,-418.9558 535.4454,-385.5604 523.5157,-365.8333"/>
+<polygon fill="#191970" stroke="#191970" points="2253.7946,-476.6204 2264.2576,-474.9543 2255.0393,-469.7319 2253.7946,-476.6204"/>
</g>
<!-- Node65 -->
<g id="node10" class="node">
<title>Node65</title>
<g id="a_node10"><a xlink:href="rocm_2injective_8h.html" target="_top" xlink:title="rocm schedule for injective operations ">
-<polygon fill="#ffffff" stroke="#000000" points="1968.5,-268.5 1968.5,-298.5 2092.5,-298.5 2092.5,-268.5 1968.5,-268.5"/>
-<text text-anchor="start" x="1976.5" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/rocm</text>
-<text text-anchor="middle" x="2030.5" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/injective.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="453,-268.5 453,-298.5 577,-298.5 577,-268.5 453,-268.5"/>
+<text text-anchor="start" x="461" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/rocm</text>
+<text text-anchor="middle" x="515" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/injective.h</text>
</a>
</g>
</g>
<!-- Node59->Node65 -->
-<g id="edge99" class="edge">
+<g id="edge100" class="edge">
<title>Node59->Node65</title>
-<path fill="none" stroke="#191970" d="M1980.6961,-470.4997C2016.948,-452.2971 2074.9298,-416.7229 2099.5,-366 2105.5064,-353.6004 2105.6759,-347.3161 2099.5,-335 2091.5935,-319.2327 2076.3158,-307.0591 2062.1421,-298.5161"/>
-<polygon fill="#191970" stroke="#191970" points="1978.9943,-467.4354 1971.5374,-474.9618 1982.0602,-473.7283 1978.9943,-467.4354"/>
+<path fill="none" stroke="#191970" d="M2254.3031,-473.1428C2244.5905,-471.4685 2234.5238,-469.9798 2225,-469 2030.099,-448.9487 653.3522,-475.1045 462,-433 389.5001,-417.0474 349.6213,-428.1339 309,-366 301.4607,-354.468 300.521,-345.8597 309,-335 326.6391,-312.4083 398.8555,-298.2605 452.8381,-290.634"/>
+<polygon fill="#191970" stroke="#191970" points="2253.7961,-476.6077 2264.259,-474.9411 2255.0404,-469.7192 2253.7961,-476.6077"/>
</g>
<!-- Node66 -->
<g id="node11" class="node">
<title>Node66</title>
<g id="a_node11"><a xlink:href="cuda_2pooling_8h.html" target="_top" xlink:title="CUDA schedule for pooling operations. ">
-<polygon fill="#ffffff" stroke="#000000" points="1714.5,-335.5 1714.5,-365.5 1836.5,-365.5 1836.5,-335.5 1714.5,-335.5"/>
-<text text-anchor="start" x="1722.5" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/cuda</text>
-<text text-anchor="middle" x="1775.5" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/pooling.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1429,-335.5 1429,-365.5 1551,-365.5 1551,-335.5 1429,-335.5"/>
+<text text-anchor="start" x="1437" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/cuda</text>
+<text text-anchor="middle" x="1490" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/pooling.h</text>
</a>
</g>
</g>
<!-- Node59->Node66 -->
<g id="edge12" class="edge">
<title>Node59->Node66</title>
-<path fill="none" stroke="#191970" d="M1903.3079,-471.7685C1879.4161,-463.5268 1850.8662,-450.935 1829.5,-433 1807.0795,-414.18 1790.5176,-383.6923 1781.9911,-365.5214"/>
-<polygon fill="#191970" stroke="#191970" points="1902.2938,-475.1194 1912.8869,-474.9299 1904.4877,-468.4721 1902.2938,-475.1194"/>
+<path fill="none" stroke="#191970" d="M2252.9819,-473.1874C2243.6743,-471.5721 2234.085,-470.0914 2225,-469 2079.1638,-451.4809 1697.6595,-496.0601 1565,-433 1534.4061,-418.4571 1510.5923,-385.2448 1498.5854,-365.6836"/>
+<polygon fill="#191970" stroke="#191970" points="2252.4716,-476.6517 2262.9338,-474.9809 2253.7131,-469.7627 2252.4716,-476.6517"/>
</g>
<!-- Node67 -->
<g id="node12" class="node">
<title>Node67</title>
<g id="a_node12"><a xlink:href="rocm_2pooling_8h.html" target="_top" xlink:title="rocm schedule for pooling operations ">
-<polygon fill="#ffffff" stroke="#000000" points="1676.5,-268.5 1676.5,-298.5 1800.5,-298.5 1800.5,-268.5 1676.5,-268.5"/>
-<text text-anchor="start" x="1684.5" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/rocm</text>
-<text text-anchor="middle" x="1738.5" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/pooling.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1465,-268.5 1465,-298.5 1589,-298.5 1589,-268.5 1465,-268.5"/>
+<text text-anchor="start" x="1473" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/rocm</text>
+<text text-anchor="middle" x="1527" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/pooling.h</text>
</a>
</g>
</g>
<!-- Node59->Node67 -->
-<g id="edge100" class="edge">
+<g id="edge101" class="edge">
<title>Node59->Node67</title>
-<path fill="none" stroke="#191970" d="M1902.339,-472.1181C1872.0579,-463.2553 1832.3885,-449.9129 1799.5,-433 1753.8753,-409.5375 1728.9987,-411.606 1705.5,-366 1693.8416,-343.3736 1710.9205,-315.6419 1724.5356,-298.7853"/>
-<polygon fill="#191970" stroke="#191970" points="1901.6478,-475.5609 1912.225,-474.9507 1903.5759,-468.8316 1901.6478,-475.5609"/>
+<path fill="none" stroke="#191970" d="M2252.6506,-473.1526C2243.4464,-471.5568 2233.9766,-470.0914 2225,-469 2155.7287,-460.5779 1654.4214,-474.0618 1598,-433 1560.2286,-405.5111 1581.1519,-376.6523 1560,-335 1553.5571,-322.3127 1544.8175,-308.7971 1537.9509,-298.7964"/>
+<polygon fill="#191970" stroke="#191970" points="2252.0362,-476.5982 2262.4979,-474.9239 2253.2755,-469.7088 2252.0362,-476.5982"/>
</g>
<!-- Node68 -->
<g id="node13" class="node">
<title>Node68</title>
<g id="a_node13"><a xlink:href="cuda_2reduction_8h.html" target="_top" xlink:title="CUDA schedule for reduction operations. ">
-<polygon fill="#ffffff" stroke="#000000" points="1498.5,-335.5 1498.5,-365.5 1620.5,-365.5 1620.5,-335.5 1498.5,-335.5"/>
-<text text-anchor="start" x="1506.5" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/cuda</text>
-<text text-anchor="middle" x="1559.5" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/reduction.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="848,-335.5 848,-365.5 970,-365.5 970,-335.5 848,-335.5"/>
+<text text-anchor="start" x="856" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/cuda</text>
+<text text-anchor="middle" x="909" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/reduction.h</text>
</a>
</g>
</g>
<!-- Node59->Node68 -->
<g id="edge14" class="edge">
<title>Node59->Node68</title>
-<path fill="none" stroke="#191970" d="M1890.0264,-473.1654C1882.1222,-471.7259 1874.1197,-470.3024 1866.5,-469 1763.6464,-451.4204 1726.3965,-482.426 1634.5,-433 1604.8373,-417.0461 1580.8492,-384.6666 1568.5252,-365.5603"/>
-<polygon fill="#191970" stroke="#191970" points="1889.4816,-476.6238 1899.9496,-474.9894 1890.7472,-469.7391 1889.4816,-476.6238"/>
+<path fill="none" stroke="#191970" d="M2253.9644,-473.1567C2244.3551,-471.498 2234.4111,-470.011 2225,-469 2089.7195,-454.4679 1127.3426,-483.4931 1001,-433 965.938,-418.9874 935.6019,-385.248 920.0658,-365.5293"/>
+<polygon fill="#191970" stroke="#191970" points="2253.3583,-476.6038 2263.8209,-474.9346 2254.601,-469.715 2253.3583,-476.6038"/>
</g>
<!-- Node69 -->
<g id="node14" class="node">
<title>Node69</title>
<g id="a_node14"><a xlink:href="rocm_2reduction_8h.html" target="_top" xlink:title="rocm schedule for reduction operations ">
-<polygon fill="#ffffff" stroke="#000000" points="1497.5,-268.5 1497.5,-298.5 1621.5,-298.5 1621.5,-268.5 1497.5,-268.5"/>
-<text text-anchor="start" x="1505.5" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/rocm</text>
-<text text-anchor="middle" x="1559.5" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/reduction.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="847,-268.5 847,-298.5 971,-298.5 971,-268.5 847,-268.5"/>
+<text text-anchor="start" x="855" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/rocm</text>
+<text text-anchor="middle" x="909" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/reduction.h</text>
</a>
</g>
</g>
<!-- Node59->Node69 -->
-<g id="edge101" class="edge">
+<g id="edge102" class="edge">
<title>Node59->Node69</title>
-<path fill="none" stroke="#191970" d="M1893.7907,-472.8593C1834.3171,-460.1819 1746.6785,-440.6864 1732.5,-433 1676.95,-402.8853 1677.9339,-375.5814 1629.5,-335 1614.0034,-322.0158 1595.5507,-308.5141 1581.4825,-298.5942"/>
-<polygon fill="#191970" stroke="#191970" points="1893.3011,-476.3334 1903.8101,-474.9875 1894.7555,-469.4861 1893.3011,-476.3334"/>
+<path fill="none" stroke="#191970" d="M2253.9654,-473.148C2244.3559,-471.4902 2234.4117,-470.0057 2225,-469 2084.4105,-453.9764 1088.8907,-472.0491 953,-433 896.5165,-416.7691 868.1874,-417.009 839,-366 832.1574,-354.0415 832.7817,-347.2947 839,-335 846.9669,-319.2478 862.273,-307.1369 876.5457,-298.6292"/>
+<polygon fill="#191970" stroke="#191970" points="2253.3594,-476.5951 2263.8218,-474.9254 2254.6018,-469.7062 2253.3594,-476.5951"/>
</g>
<!-- Node70 -->
<g id="node15" class="node">
<title>Node70</title>
<g id="a_node15"><a xlink:href="cuda_2softmax_8h.html" target="_top" xlink:title="include/tvm/topi/cuda\l/softmax.h">
-<polygon fill="#ffffff" stroke="#000000" points="189.5,-335.5 189.5,-365.5 311.5,-365.5 311.5,-335.5 189.5,-335.5"/>
-<text text-anchor="start" x="197.5" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/cuda</text>
-<text text-anchor="middle" x="250.5" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/softmax.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="670,-335.5 670,-365.5 792,-365.5 792,-335.5 670,-335.5"/>
+<text text-anchor="start" x="678" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/cuda</text>
+<text text-anchor="middle" x="731" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/softmax.h</text>
</a>
</g>
</g>
<!-- Node59->Node70 -->
<g id="edge16" class="edge">
<title>Node59->Node70</title>
-<path fill="none" stroke="#191970" d="M1895.8014,-473.1595C1886.0889,-471.4835 1876.0227,-469.9898 1866.5,-469 1778.1756,-459.8192 337.0291,-489.4735 268.5,-433 248.9279,-416.871 247.5885,-384.8653 248.7773,-365.8069"/>
-<polygon fill="#191970" stroke="#191970" points="1895.2942,-476.6244 1905.7572,-474.9585 1896.539,-469.736 1895.2942,-476.6244"/>
+<path fill="none" stroke="#191970" d="M2254.2991,-473.182C2244.5868,-471.5036 2234.5213,-470.0034 2225,-469 2146.5755,-460.7356 877.707,-465.8146 806,-433 775.1976,-418.9042 751.4603,-385.5277 739.5228,-365.8178"/>
+<polygon fill="#191970" stroke="#191970" points="2253.7915,-476.6469 2264.2547,-474.9821 2255.0371,-469.7586 2253.7915,-476.6469"/>
</g>
<!-- Node71 -->
<g id="node16" class="node">
<title>Node71</title>
<g id="a_node16"><a xlink:href="rocm_2softmax_8h.html" target="_top" xlink:title="include/tvm/topi/rocm\l/softmax.h">
-<polygon fill="#ffffff" stroke="#000000" points="188.5,-268.5 188.5,-298.5 312.5,-298.5 312.5,-268.5 188.5,-268.5"/>
-<text text-anchor="start" x="196.5" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/rocm</text>
-<text text-anchor="middle" x="250.5" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/softmax.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="669,-268.5 669,-298.5 793,-298.5 793,-268.5 669,-268.5"/>
+<text text-anchor="start" x="677" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/rocm</text>
+<text text-anchor="middle" x="731" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/softmax.h</text>
</a>
</g>
</g>
<!-- Node59->Node71 -->
-<g id="edge102" class="edge">
+<g id="edge103" class="edge">
<title>Node59->Node71</title>
-<path fill="none" stroke="#191970" d="M1895.8017,-473.1572C1886.0891,-471.4814 1876.0229,-469.9884 1866.5,-469 1777.0141,-459.7117 330.2305,-468.3506 247.5,-433 198.982,-412.2683 156.6873,-382.0824 180.5,-335 188.4669,-319.2478 203.773,-307.1369 218.0457,-298.6292"/>
-<polygon fill="#191970" stroke="#191970" points="1895.2944,-476.6221 1905.7575,-474.9561 1896.5392,-469.7337 1895.2944,-476.6221"/>
+<path fill="none" stroke="#191970" d="M2254.2997,-473.1762C2244.5874,-471.4985 2234.5217,-469.9999 2225,-469 2063.5054,-452.0412 919.0976,-481.0922 764,-433 711.8394,-416.8262 687.0398,-414.0026 661,-366 654.4304,-353.8894 654.7817,-347.2947 661,-335 668.9669,-319.2478 684.273,-307.1369 698.5457,-298.6292"/>
+<polygon fill="#191970" stroke="#191970" points="2253.7922,-476.6411 2264.2554,-474.976 2255.0376,-469.7528 2253.7922,-476.6411"/>
</g>
<!-- Node72 -->
<g id="node17" class="node">
<title>Node72</title>
<g id="a_node17"><a xlink:href="array__utils_8h.html" target="_top" xlink:title="Utility functions for handling arrays. ">
-<polygon fill="#ffffff" stroke="#000000" points="1839,-402.5 1839,-432.5 1966,-432.5 1966,-402.5 1839,-402.5"/>
-<text text-anchor="start" x="1847" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
-<text text-anchor="middle" x="1902.5" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/array_utils.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1607.5,-402.5 1607.5,-432.5 1734.5,-432.5 1734.5,-402.5 1607.5,-402.5"/>
+<text text-anchor="start" x="1615.5" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
+<text text-anchor="middle" x="1671" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/array_utils.h</text>
</a>
</g>
</g>
<!-- Node59->Node72 -->
<g id="edge18" class="edge">
<title>Node59->Node72</title>
-<path fill="none" stroke="#191970" d="M1936.8048,-466.4026C1929.3627,-455.7936 1920.0725,-442.5502 1913.0377,-432.5218"/>
-<polygon fill="#191970" stroke="#191970" points="1934.1579,-468.7239 1942.766,-474.9005 1939.8885,-464.7039 1934.1579,-468.7239"/>
+<path fill="none" stroke="#191970" d="M2251.6796,-473.1911C2242.7769,-471.6323 2233.655,-470.1683 2225,-469 2035.3828,-443.4053 1986.312,-452.7766 1796,-433 1775.9896,-430.9206 1754.2178,-428.3339 1734.5351,-425.8731"/>
+<polygon fill="#191970" stroke="#191970" points="2251.1389,-476.6499 2261.6014,-474.9806 2252.3815,-469.7611 2251.1389,-476.6499"/>
</g>
<!-- Node73 -->
<g id="node18" class="node">
<title>Node73</title>
<g id="a_node18"><a xlink:href="detail_2broadcast_8h.html" target="_top" xlink:title="Detail broadcast. ">
-<polygon fill="#ffffff" stroke="#000000" points="2689,-335.5 2689,-365.5 2816,-365.5 2816,-335.5 2689,-335.5"/>
-<text text-anchor="start" x="2697" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
-<text text-anchor="middle" x="2752.5" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/broadcast.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2970.5,-335.5 2970.5,-365.5 3097.5,-365.5 3097.5,-335.5 2970.5,-335.5"/>
+<text text-anchor="start" x="2978.5" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
+<text text-anchor="middle" x="3034" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/broadcast.h</text>
</a>
</g>
</g>
<!-- Node59->Node73 -->
<g id="edge23" class="edge">
<title>Node59->Node73</title>
-<path fill="none" stroke="#191970" d="M2032.8296,-482.7546C2215.2145,-478.3613 2638.0454,-464.8404 2694.5,-433 2721.1875,-417.9482 2738.4916,-384.9228 2746.7701,-365.5307"/>
-<polygon fill="#191970" stroke="#191970" points="2032.6613,-479.2575 2022.7473,-482.9941 2032.8276,-486.2556 2032.6613,-479.2575"/>
+<path fill="none" stroke="#191970" d="M2391.4615,-481.8544C2558.5084,-476.0067 2922.2633,-460.2919 2972,-433 2999.4878,-417.9167 3018.3672,-384.9028 3027.5625,-365.5212"/>
+<polygon fill="#191970" stroke="#191970" points="2391.1052,-478.3646 2381.2324,-482.2088 2391.3476,-485.3604 2391.1052,-478.3646"/>
</g>
<!-- Node76 -->
<g id="node21" class="node">
<title>Node76</title>
<g id="a_node21"><a xlink:href="reduction_8h.html" target="_top" xlink:title="Reduction op constructors. ">
-<polygon fill="#ffffff" stroke="#000000" points="2862.5,-140 2862.5,-159 3016.5,-159 3016.5,-140 2862.5,-140"/>
-<text text-anchor="middle" x="2939.5" y="-147" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/reduction.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="3139,-140 3139,-159 3293,-159 3293,-140 3139,-140"/>
+<text text-anchor="middle" x="3216" y="-147" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/reduction.h</text>
</a>
</g>
</g>
<!-- Node59->Node76 -->
-<g id="edge97" class="edge">
+<g id="edge98" class="edge">
<title>Node59->Node76</title>
-<path fill="none" stroke="#191970" d="M2033.0288,-483.7296C2284.8075,-481.002 3021.2859,-469.8904 3056.5,-433 3128.645,-357.4207 3085.3681,-281.2857 3018.5,-201 3002.888,-182.2553 2978.6873,-167.8845 2961.1844,-159.146"/>
-<polygon fill="#191970" stroke="#191970" points="2032.8034,-480.2317 2022.8413,-483.8383 2032.8781,-487.2313 2032.8034,-480.2317"/>
+<path fill="none" stroke="#191970" d="M2391.3565,-482.9003C2629.9161,-477.9841 3300.6869,-461.6682 3338,-433 3420.5875,-369.5468 3438.0219,-280.7188 3371,-201 3356.6755,-183.9618 3301.6233,-168.4522 3261.186,-159.0045"/>
+<polygon fill="#191970" stroke="#191970" points="2390.9335,-479.4081 2381.0072,-483.112 2391.0767,-486.4066 2390.9335,-479.4081"/>
</g>
<!-- Node78 -->
<g id="node23" class="node">
<title>Node78</title>
<g id="a_node23"><a xlink:href="nn_2softmax_8h.html" target="_top" xlink:title="Softmax op constructions. ">
-<polygon fill="#ffffff" stroke="#000000" points="2451,-67.5 2451,-97.5 2562,-97.5 2562,-67.5 2451,-67.5"/>
-<text text-anchor="start" x="2459" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
-<text text-anchor="middle" x="2506.5" y="-74.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/softmax.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2533.5,-67.5 2533.5,-97.5 2644.5,-97.5 2644.5,-67.5 2533.5,-67.5"/>
+<text text-anchor="start" x="2541.5" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
+<text text-anchor="middle" x="2589" y="-74.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/softmax.h</text>
</a>
</g>
</g>
<!-- Node59->Node78 -->
-<g id="edge95" class="edge">
+<g id="edge96" class="edge">
<title>Node59->Node78</title>
-<path fill="none" stroke="#191970" d="M2032.547,-474.7894C2092.7731,-466.3109 2167.8105,-452.3241 2192.5,-433 2224.589,-407.8844 2232.5,-391.2492 2232.5,-350.5 2232.5,-350.5 2232.5,-350.5 2232.5,-216.5 2232.5,-178.3133 2228.4406,-159.9015 2256.5,-134 2270.7051,-120.8873 2381.7032,-101.7223 2450.938,-90.8485"/>
-<polygon fill="#191970" stroke="#191970" points="2032.0665,-471.3224 2022.6376,-476.1541 2033.0216,-478.2569 2032.0665,-471.3224"/>
+<path fill="none" stroke="#191970" d="M2390.404,-472.469C2419.069,-464.9573 2449.8164,-452.7686 2473,-433 2504.1561,-406.4333 2514,-391.445 2514,-350.5 2514,-350.5 2514,-350.5 2514,-216.5 2514,-178.3133 2518.6358,-166.9128 2538,-134 2546.1675,-120.1179 2558.9175,-107.2429 2569.6256,-97.8384"/>
+<polygon fill="#191970" stroke="#191970" points="2389.2431,-469.15 2380.3675,-474.9357 2390.9139,-475.9477 2389.2431,-469.15"/>
</g>
<!-- Node59->Node79 -->
-<g id="edge96" class="edge">
+<g id="edge97" class="edge">
<title>Node59->Node79</title>
-<path fill="none" stroke="#191970" d="M2033.1796,-482.5876C2287.3585,-476.5302 3035.9574,-456.7572 3082.5,-433 3158.1399,-394.3903 3201.5,-368.4241 3201.5,-283.5 3201.5,-283.5 3201.5,-283.5 3201.5,-216.5 3201.5,-165.4534 3163.2148,-113.21 3145.8669,-92.1574"/>
-<polygon fill="#191970" stroke="#191970" points="2032.8121,-479.0953 2022.898,-482.8316 2032.9782,-486.0933 2032.8121,-479.0953"/>
+<path fill="none" stroke="#191970" d="M2391.2814,-482.3588C2633.771,-475.8728 3324.9692,-455.5022 3368,-433 3405.1423,-413.5771 3483,-325.4142 3483,-283.5 3483,-283.5 3483,-283.5 3483,-216.5 3483,-165.7459 3445.9113,-113.3425 3429.1054,-92.2025"/>
+<polygon fill="#191970" stroke="#191970" points="2391.0166,-478.8645 2381.1134,-482.6297 2391.203,-485.862 2391.0166,-478.8645"/>
</g>
<!-- Node80 -->
<g id="node25" class="node">
<title>Node80</title>
<g id="a_node25"><a xlink:href="reorg_8h.html" target="_top" xlink:title="Reorg op constructions. ">
-<polygon fill="#ffffff" stroke="#000000" points="2671.5,-67.5 2671.5,-97.5 2799.5,-97.5 2799.5,-67.5 2671.5,-67.5"/>
-<text text-anchor="start" x="2679.5" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/vision</text>
-<text text-anchor="middle" x="2735.5" y="-74.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/reorg.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2762,-67.5 2762,-97.5 2890,-97.5 2890,-67.5 2762,-67.5"/>
+<text text-anchor="start" x="2770" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/vision</text>
+<text text-anchor="middle" x="2826" y="-74.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/reorg.h</text>
</a>
</g>
</g>
<!-- Node59->Node80 -->
-<g id="edge104" class="edge">
+<g id="edge105" class="edge">
<title>Node59->Node80</title>
-<path fill="none" stroke="#191970" d="M2032.9531,-476.2741C2101.3605,-468.1717 2191.5916,-453.9474 2221.5,-433 2256.431,-408.5349 2270.5,-393.1464 2270.5,-350.5 2270.5,-350.5 2270.5,-350.5 2270.5,-216.5 2270.5,-134.0609 2543.7702,-99.2689 2671.3319,-87.5189"/>
-<polygon fill="#191970" stroke="#191970" points="2032.1324,-472.8457 2022.6017,-477.4736 2032.9382,-479.7992 2032.1324,-472.8457"/>
+<path fill="none" stroke="#191970" d="M2371.4541,-472.9834C2419.7643,-463.2484 2481.3618,-448.4531 2502,-433 2536.3205,-407.3022 2552,-393.3751 2552,-350.5 2552,-350.5 2552,-350.5 2552,-216.5 2552,-168.3124 2690.4884,-120.9137 2770.552,-97.5377"/>
+<polygon fill="#191970" stroke="#191970" points="2370.4954,-469.6052 2361.3665,-474.9823 2371.8561,-476.4717 2370.4954,-469.6052"/>
</g>
<!-- Node81 -->
<g id="node26" class="node">
<title>Node81</title>
<g id="a_node26"><a xlink:href="bias__add_8h.html" target="_top" xlink:title="bias_add op constructions ">
-<polygon fill="#ffffff" stroke="#000000" points="2666,-134.5 2666,-164.5 2777,-164.5 2777,-134.5 2666,-134.5"/>
-<text text-anchor="start" x="2674" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
-<text text-anchor="middle" x="2721.5" y="-141.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/bias_add.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2962.5,-134.5 2962.5,-164.5 3073.5,-164.5 3073.5,-134.5 2962.5,-134.5"/>
+<text text-anchor="start" x="2970.5" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
+<text text-anchor="middle" x="3018" y="-141.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/bias_add.h</text>
</a>
</g>
</g>
<!-- Node59->Node81 -->
<g id="edge86" class="edge">
<title>Node59->Node81</title>
-<path fill="none" stroke="#191970" d="M2032.6125,-481.7373C2119.4394,-477.1498 2248.0141,-465.0083 2284.5,-433 2313.206,-407.8169 2308.5,-388.6867 2308.5,-350.5 2308.5,-350.5 2308.5,-350.5 2308.5,-283.5 2308.5,-209.2497 2553.0697,-169.9234 2665.801,-155.7402"/>
-<polygon fill="#191970" stroke="#191970" points="2032.3145,-478.2478 2022.5037,-482.2474 2032.6674,-485.2389 2032.3145,-478.2478"/>
+<path fill="none" stroke="#191970" d="M2391.3813,-474.4873C2455.3658,-465.5937 2537.4303,-451.1953 2566,-433 2611.5869,-403.9669 2604.8418,-377.6804 2638,-335 2661.6359,-304.5765 2662.2581,-291.2203 2693,-268 2775.3075,-205.8306 2893.4591,-173.7425 2962.3123,-159.3554"/>
+<polygon fill="#191970" stroke="#191970" points="2390.7127,-471.0461 2381.2783,-475.8672 2391.66,-477.9817 2390.7127,-471.0461"/>
</g>
<!-- Node82 -->
<g id="node27" class="node">
<title>Node82</title>
<g id="a_node27"><a xlink:href="topi_2transform_8h.html" target="_top" xlink:title="Transform op constructors. ">
-<polygon fill="#ffffff" stroke="#000000" points="2537.5,-207 2537.5,-226 2693.5,-226 2693.5,-207 2537.5,-207"/>
-<text text-anchor="middle" x="2615.5" y="-214" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/transform.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2856,-207 2856,-226 3012,-226 3012,-207 2856,-207"/>
+<text text-anchor="middle" x="2934" y="-214" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/transform.h</text>
</a>
</g>
</g>
<!-- Node59->Node82 -->
-<g id="edge103" class="edge">
+<g id="edge104" class="edge">
<title>Node59->Node82</title>
-<path fill="none" stroke="#191970" d="M2032.686,-480.8974C2127.5572,-475.4383 2275.6072,-462.3993 2322.5,-433 2397.1735,-386.1837 2361.2092,-319.8221 2432.5,-268 2462.9345,-245.8769 2502.5527,-233.2445 2537.1402,-226.0373"/>
-<polygon fill="#191970" stroke="#191970" points="2032.3239,-477.4121 2022.5347,-481.4644 2032.7143,-484.4012 2032.3239,-477.4121"/>
+<path fill="none" stroke="#191970" d="M2391.2904,-480.6607C2558.2886,-472.5585 2919.732,-452.8223 2938,-433 2966.0879,-402.5224 2942.8225,-263.9289 2935.8298,-226.0769"/>
+<polygon fill="#191970" stroke="#191970" points="2390.8813,-477.1762 2381.0616,-481.1542 2391.2187,-484.1681 2390.8813,-477.1762"/>
</g>
<!-- Node59->Node83 -->
<g id="edge40" class="edge">
<title>Node59->Node83</title>
-<path fill="none" stroke="#191970" d="M2032.8795,-481.3845C2192.163,-475.0682 2550.6531,-459.0961 2851.5,-433 2874.1964,-431.0313 2899.0064,-428.2984 2920.9857,-425.6749"/>
-<polygon fill="#191970" stroke="#191970" points="2032.533,-477.8954 2022.6786,-481.7866 2032.8088,-484.89 2032.533,-477.8954"/>
+<path fill="none" stroke="#191970" d="M2391.5175,-483.0109C2547.4109,-479.4708 2893.0748,-468.033 3182,-433 3188.6248,-432.1967 3195.516,-431.1969 3202.3754,-430.0945"/>
+<polygon fill="#191970" stroke="#191970" points="2390.995,-479.5215 2381.0752,-483.2426 2391.1504,-486.5198 2390.995,-479.5215"/>
</g>
<!-- Node85 -->
<g id="node30" class="node">
<title>Node85</title>
<g id="a_node30"><a xlink:href="einsum_8h.html" target="_top" xlink:title="Einstein summation op. ">
-<polygon fill="#ffffff" stroke="#000000" points="2412.5,-341 2412.5,-360 2556.5,-360 2556.5,-341 2412.5,-341"/>
-<text text-anchor="middle" x="2484.5" y="-348" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/einsum.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2656,-341 2656,-360 2800,-360 2800,-341 2656,-341"/>
+<text text-anchor="middle" x="2728" y="-348" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/einsum.h</text>
</a>
</g>
</g>
<!-- Node59->Node85 -->
<g id="edge82" class="edge">
<title>Node59->Node85</title>
-<path fill="none" stroke="#191970" d="M2033.1681,-482.9075C2181.7273,-479.207 2478.9955,-467.5291 2510.5,-433 2529.6304,-412.033 2506.342,-377.0577 2492.8416,-360.1865"/>
-<polygon fill="#191970" stroke="#191970" points="2032.6468,-479.419 2022.7346,-483.1604 2032.8165,-486.417 2032.6468,-479.419"/>
+<path fill="none" stroke="#191970" d="M2391.098,-480.7797C2513.4749,-474.4372 2730.3247,-459.5665 2754,-433 2772.8835,-411.8105 2749.7128,-376.9412 2736.2909,-360.1408"/>
+<polygon fill="#191970" stroke="#191970" points="2390.8161,-477.2894 2381.0074,-481.2944 2391.1728,-484.2803 2390.8161,-477.2894"/>
</g>
<!-- Node59->Node86 -->
<g id="edge87" class="edge">
<title>Node59->Node86</title>
-<path fill="none" stroke="#191970" d="M2033.0409,-483.6502C2286.5173,-480.6872 3035.8657,-468.9545 3139.5,-433 3161.7582,-425.2778 3162.3429,-414.1965 3182.5,-402 3204.9077,-388.4417 3231.3537,-375.2739 3251.8375,-365.6472"/>
-<polygon fill="#191970" stroke="#191970" points="2032.7481,-480.1533 2022.7892,-483.7684 2032.8289,-487.1528 2032.7481,-480.1533"/>
+<path fill="none" stroke="#191970" d="M2391.5247,-483.3785C2633.6527,-479.7443 3324.8321,-466.5252 3421,-433 3443.2467,-425.2446 3443.8429,-414.1965 3464,-402 3486.4077,-388.4417 3512.8537,-375.2739 3533.3375,-365.6472"/>
+<polygon fill="#191970" stroke="#191970" points="2391.3192,-479.8811 2381.3723,-483.5292 2391.4232,-486.8803 2391.3192,-479.8811"/>
</g>
<!-- Node87 -->
<g id="node32" class="node">
<title>Node87</title>
<g id="a_node32"><a xlink:href="flatten_8h.html" target="_top" xlink:title="Softmax op constructions. ">
-<polygon fill="#ffffff" stroke="#000000" points="2834,-335.5 2834,-365.5 2945,-365.5 2945,-335.5 2834,-335.5"/>
-<text text-anchor="start" x="2842" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
-<text text-anchor="middle" x="2889.5" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/flatten.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="3115.5,-335.5 3115.5,-365.5 3226.5,-365.5 3226.5,-335.5 3115.5,-335.5"/>
+<text text-anchor="start" x="3123.5" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
+<text text-anchor="middle" x="3171" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/flatten.h</text>
</a>
</g>
</g>
<!-- Node59->Node87 -->
<g id="edge92" class="edge">
<title>Node59->Node87</title>
-<path fill="none" stroke="#191970" d="M2032.7548,-483.1151C2205.2114,-479.5701 2596.1061,-467.844 2724.5,-433 2778.5908,-418.3207 2835.8339,-385.1584 2866.6429,-365.6426"/>
-<polygon fill="#191970" stroke="#191970" points="2032.6399,-479.6166 2022.7125,-483.3173 2032.7809,-486.6152 2032.6399,-479.6166"/>
+<path fill="none" stroke="#191970" d="M2391.2347,-483.2825C2550.2801,-480.1446 2891.4706,-469.2777 3003,-433 3027.8645,-424.9122 3029.8474,-414.1495 3053,-402 3079.0272,-388.342 3109.4509,-375.1267 3132.8868,-365.5031"/>
+<polygon fill="#191970" stroke="#191970" points="2390.9575,-479.7871 2381.0265,-483.4782 2391.0918,-486.7858 2390.9575,-479.7871"/>
</g>
<!-- Node88 -->
<g id="node33" class="node">
<title>Node88</title>
<g id="a_node33"><a xlink:href="detail_2extern_8h.html" target="_top" xlink:title="Helpers for using external functions. ">
-<polygon fill="#ffffff" stroke="#000000" points="754,-402.5 754,-432.5 881,-432.5 881,-402.5 754,-402.5"/>
-<text text-anchor="start" x="762" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
-<text text-anchor="middle" x="817.5" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/extern.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2244.5,-402.5 2244.5,-432.5 2371.5,-432.5 2371.5,-402.5 2244.5,-402.5"/>
+<text text-anchor="start" x="2252.5" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
+<text text-anchor="middle" x="2308" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/extern.h</text>
</a>
</g>
</g>
<!-- Node59->Node88 -->
<g id="edge52" class="edge">
<title>Node59->Node88</title>
-<path fill="none" stroke="#191970" d="M1894.8234,-473.1315C1885.4117,-471.5071 1875.6996,-470.0379 1866.5,-469 1469.5863,-424.2206 1366.9203,-461.4072 968.5,-433 939.8479,-430.9571 908.2202,-427.8439 881.345,-424.9421"/>
-<polygon fill="#191970" stroke="#191970" points="1894.4179,-476.6147 1904.8799,-474.9421 1895.6583,-469.7255 1894.4179,-476.6147"/>
+<path fill="none" stroke="#191970" d="M2308,-464.7758C2308,-454.4641 2308,-442.0437 2308,-432.5218"/>
+<polygon fill="#191970" stroke="#191970" points="2304.5001,-464.9005 2308,-474.9005 2311.5001,-464.9006 2304.5001,-464.9005"/>
</g>
<!-- Node89 -->
<g id="node34" class="node">
<title>Node89</title>
<g id="a_node34"><a xlink:href="fuse_8h.html" target="_top" xlink:title="Fuse operation. ">
-<polygon fill="#ffffff" stroke="#000000" points="1195,-402.5 1195,-432.5 1322,-432.5 1322,-402.5 1195,-402.5"/>
-<text text-anchor="start" x="1203" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
-<text text-anchor="middle" x="1258.5" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/fuse.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1010.5,-402.5 1010.5,-432.5 1137.5,-432.5 1137.5,-402.5 1010.5,-402.5"/>
+<text text-anchor="start" x="1018.5" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
+<text text-anchor="middle" x="1074" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/fuse.h</text>
</a>
</g>
</g>
<!-- Node59->Node89 -->
<g id="edge55" class="edge">
<title>Node59->Node89</title>
-<path fill="none" stroke="#191970" d="M1893.1788,-473.197C1884.2762,-471.6374 1875.1545,-470.1716 1866.5,-469 1813.6184,-461.8409 1467.303,-434.0454 1322.2451,-422.5351"/>
-<polygon fill="#191970" stroke="#191970" points="1892.638,-476.6559 1903.1006,-474.987 1893.8809,-469.7671 1892.638,-476.6559"/>
+<path fill="none" stroke="#191970" d="M2253.9581,-473.215C2244.3494,-471.55 2234.4073,-470.0457 2225,-469 1792.7017,-420.9452 1681.0961,-460.417 1247,-433 1210.7354,-430.7096 1170.2804,-427.1412 1137.6147,-424.0053"/>
+<polygon fill="#191970" stroke="#191970" points="2253.3512,-476.662 2263.8142,-474.9958 2254.5958,-469.7735 2253.3512,-476.662"/>
</g>
<!-- Node90 -->
<g id="node35" class="node">
<title>Node90</title>
<g id="a_node35"><a xlink:href="generic_2default_8h.html" target="_top" xlink:title="Generic default schedule. ">
-<polygon fill="#ffffff" stroke="#000000" points="406,-335.5 406,-365.5 541,-365.5 541,-335.5 406,-335.5"/>
-<text text-anchor="start" x="414" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/generic</text>
-<text text-anchor="middle" x="473.5" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/default.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1140.5,-335.5 1140.5,-365.5 1275.5,-365.5 1275.5,-335.5 1140.5,-335.5"/>
+<text text-anchor="start" x="1148.5" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/generic</text>
+<text text-anchor="middle" x="1208" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/default.h</text>
</a>
</g>
</g>
<!-- Node59->Node90 -->
<g id="edge83" class="edge">
<title>Node59->Node90</title>
-<path fill="none" stroke="#191970" d="M1895.8012,-473.161C1886.0888,-471.4849 1876.0226,-469.9908 1866.5,-469 1822.7249,-464.4454 313.2651,-464.4723 282.5,-433 240.1441,-389.6705 335.3098,-367.7022 405.9555,-357.7052"/>
-<polygon fill="#191970" stroke="#191970" points="1895.294,-476.626 1905.757,-474.9602 1896.5389,-469.7376 1895.294,-476.626"/>
+<path fill="none" stroke="#191970" d="M2253.3182,-473.1775C2243.907,-471.5478 2234.1965,-470.0649 2225,-469 1878.9447,-428.9282 1783.9362,-499.6306 1442,-433 1367.7974,-418.5407 1285.3595,-385.1335 1240.851,-365.5533"/>
+<polygon fill="#191970" stroke="#191970" points="2252.9119,-476.6606 2263.3743,-474.9908 2254.1541,-469.7717 2252.9119,-476.6606"/>
</g>
<!-- Node91 -->
<g id="node36" class="node">
<title>Node91</title>
<g id="a_node36"><a xlink:href="generic_2extern_8h.html" target="_top" xlink:title="Schedule for extern followed by injective ops. ">
-<polygon fill="#ffffff" stroke="#000000" points="559,-268.5 559,-298.5 694,-298.5 694,-268.5 559,-268.5"/>
-<text text-anchor="start" x="567" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/generic</text>
-<text text-anchor="middle" x="626.5" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/extern.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1818.5,-268.5 1818.5,-298.5 1953.5,-298.5 1953.5,-268.5 1818.5,-268.5"/>
+<text text-anchor="start" x="1826.5" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/generic</text>
+<text text-anchor="middle" x="1886" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/extern.h</text>
</a>
</g>
</g>
<!-- Node59->Node91 -->
<g id="edge84" class="edge">
<title>Node59->Node91</title>
-<path fill="none" stroke="#191970" d="M1895.464,-473.161C1885.8547,-471.5018 1875.9108,-470.0135 1866.5,-469 1600.8054,-440.3862 924.9296,-492.9064 664.5,-433 631.7361,-425.4634 627.6119,-411.9548 595.5,-402 509.6505,-375.3863 454.1209,-434.9801 396.5,-366 387.6672,-355.426 387.8875,-345.7542 396.5,-335 399.1098,-331.7412 491.9218,-311.6959 558.8317,-297.5986"/>
-<polygon fill="#191970" stroke="#191970" points="1894.8578,-476.608 1905.3204,-474.9391 1896.1006,-469.7192 1894.8578,-476.608"/>
+<path fill="none" stroke="#191970" d="M2246.9361,-473.1088C2170.3408,-458.7454 2048.4015,-435.5988 2044,-433 1995.0964,-404.1257 2004.1372,-373.0755 1962,-335 1946.6266,-321.1084 1927.2684,-308.045 1911.9369,-298.5509"/>
+<polygon fill="#191970" stroke="#191970" points="2246.4905,-476.5861 2256.964,-474.9877 2247.7797,-469.7059 2246.4905,-476.5861"/>
</g>
<!-- Node92 -->
<g id="node37" class="node">
<title>Node92</title>
<g id="a_node37"><a xlink:href="generic_2injective_8h.html" target="_top" xlink:title="Generic schedule for injective operations. ">
-<polygon fill="#ffffff" stroke="#000000" points="559,-335.5 559,-365.5 694,-365.5 694,-335.5 559,-335.5"/>
-<text text-anchor="start" x="567" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/generic</text>
-<text text-anchor="middle" x="626.5" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/injective.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1683.5,-335.5 1683.5,-365.5 1818.5,-365.5 1818.5,-335.5 1683.5,-335.5"/>
+<text text-anchor="start" x="1691.5" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/generic</text>
+<text text-anchor="middle" x="1751" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/injective.h</text>
</a>
</g>
</g>
<!-- Node59->Node92 -->
<g id="edge85" class="edge">
<title>Node59->Node92</title>
-<path fill="none" stroke="#191970" d="M1895.463,-473.1699C1885.8538,-471.5098 1875.9103,-470.0188 1866.5,-469 1738.5191,-455.1436 826.5723,-484.2839 708.5,-433 676.0854,-418.9209 649.669,-385.5383 636.1975,-365.8229"/>
-<polygon fill="#191970" stroke="#191970" points="1894.8567,-476.617 1905.3194,-474.9485 1896.0998,-469.7282 1894.8567,-476.617"/>
+<path fill="none" stroke="#191970" d="M2249.457,-473.1588C2241.2561,-471.6894 2232.9251,-470.2597 2225,-469 2103.6984,-449.7193 2069.6666,-464.687 1951,-433 1888.2203,-416.2362 1818.9552,-384.4536 1780.6404,-365.6068"/>
+<polygon fill="#191970" stroke="#191970" points="2248.8946,-476.6138 2259.3598,-474.9621 2250.1487,-469.7271 2248.8946,-476.6138"/>
</g>
<!-- Node93 -->
<g id="node38" class="node">
<title>Node93</title>
<g id="a_node38"><a xlink:href="x86_2bnn_8h.html" target="_top" xlink:title="x86 schedule for binary operations ">
-<polygon fill="#ffffff" stroke="#000000" points="902,-335.5 902,-365.5 1019,-365.5 1019,-335.5 902,-335.5"/>
-<text text-anchor="start" x="910" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/x86</text>
-<text text-anchor="middle" x="960.5" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/bnn.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1293.5,-335.5 1293.5,-365.5 1410.5,-365.5 1410.5,-335.5 1293.5,-335.5"/>
+<text text-anchor="start" x="1301.5" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/x86</text>
+<text text-anchor="middle" x="1352" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/bnn.h</text>
</a>
</g>
</g>
<!-- Node59->Node93 -->
-<g id="edge105" class="edge">
+<g id="edge106" class="edge">
<title>Node59->Node93</title>
-<path fill="none" stroke="#191970" d="M1894.4866,-473.1478C1885.1785,-471.5372 1875.5877,-470.0683 1866.5,-469 1544.7644,-431.1792 1453.0337,-510.542 1138.5,-433 1080.4613,-418.6917 1018.3436,-385.2285 985.0163,-365.5981"/>
-<polygon fill="#191970" stroke="#191970" points="1893.977,-476.6122 1904.4388,-474.9388 1895.2169,-469.7229 1893.977,-476.6122"/>
+<path fill="none" stroke="#191970" d="M2252.9849,-473.1626C2243.6769,-471.5502 2234.0867,-470.0769 2225,-469 1915.202,-432.2841 1826.3239,-509.962 1524,-433 1467.6736,-418.6611 1407.7403,-385.2093 1375.6176,-365.5891"/>
+<polygon fill="#191970" stroke="#191970" points="2252.475,-476.627 2262.937,-474.9545 2253.7155,-469.7378 2252.475,-476.627"/>
</g>
<!-- Node94 -->
<g id="node39" class="node">
<title>Node94</title>
<g id="a_node39"><a xlink:href="x86_2default_8h.html" target="_top" xlink:title="default x86 schedule ">
-<polygon fill="#ffffff" stroke="#000000" points="1037,-335.5 1037,-365.5 1154,-365.5 1154,-335.5 1037,-335.5"/>
-<text text-anchor="start" x="1045" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/x86</text>
-<text text-anchor="middle" x="1095.5" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/default.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1836.5,-335.5 1836.5,-365.5 1953.5,-365.5 1953.5,-335.5 1836.5,-335.5"/>
+<text text-anchor="start" x="1844.5" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/x86</text>
+<text text-anchor="middle" x="1895" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/default.h</text>
</a>
</g>
</g>
<!-- Node59->Node94 -->
-<g id="edge106" class="edge">
+<g id="edge107" class="edge">
<title>Node59->Node94</title>
-<path fill="none" stroke="#191970" d="M1894.4835,-473.1743C1885.1757,-471.5606 1875.5859,-470.0838 1866.5,-469 1716.022,-451.0513 1325.2359,-491.6485 1185.5,-433 1151.2968,-418.6446 1121.7987,-385.3634 1106.5562,-365.7399"/>
-<polygon fill="#191970" stroke="#191970" points="1893.9734,-476.6387 1904.4355,-474.967 1895.2144,-469.7495 1893.9734,-476.6387"/>
+<path fill="none" stroke="#191970" d="M2247.9391,-473.1244C2240.2244,-471.7101 2232.4288,-470.3041 2225,-469 2127.8459,-451.9452 2096.8129,-471.507 2006,-433 1967.7575,-416.7842 1930.5106,-384.809 1910.3783,-365.7796"/>
+<polygon fill="#191970" stroke="#191970" points="2247.5275,-476.6075 2257.9967,-474.9805 2248.7979,-469.7237 2247.5275,-476.6075"/>
</g>
<!-- Node95 -->
<g id="node40" class="node">
<title>Node95</title>
<g id="a_node40"><a xlink:href="x86_2injective_8h.html" target="_top" xlink:title="x86 schedule for injective ops ">
-<polygon fill="#ffffff" stroke="#000000" points="1363,-335.5 1363,-365.5 1480,-365.5 1480,-335.5 1363,-335.5"/>
-<text text-anchor="start" x="1371" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/x86</text>
-<text text-anchor="middle" x="1421.5" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/injective.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="318.5,-335.5 318.5,-365.5 435.5,-365.5 435.5,-335.5 318.5,-335.5"/>
+<text text-anchor="start" x="326.5" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/x86</text>
+<text text-anchor="middle" x="377" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/injective.h</text>
</a>
</g>
</g>
<!-- Node59->Node95 -->
-<g id="edge107" class="edge">
+<g id="edge108" class="edge">
<title>Node59->Node95</title>
-<path fill="none" stroke="#191970" d="M1890.9565,-473.1623C1882.7556,-471.6923 1874.4248,-470.2616 1866.5,-469 1745.6381,-449.7599 1710.8023,-467.899 1593.5,-433 1538.4253,-416.6145 1478.9989,-384.5448 1446.4166,-365.5766"/>
-<polygon fill="#191970" stroke="#191970" points="1890.3939,-476.6173 1900.8592,-474.9659 1891.6483,-469.7306 1890.3939,-476.6173"/>
+<path fill="none" stroke="#191970" d="M2254.3023,-473.1511C2244.5897,-471.476 2234.5233,-469.9848 2225,-469 2039.6123,-449.8294 729.2323,-476.485 548,-433 491.588,-419.4645 431.9183,-385.5462 400.1337,-365.669"/>
+<polygon fill="#191970" stroke="#191970" points="2253.7951,-476.616 2264.2581,-474.9497 2255.0397,-469.7275 2253.7951,-476.616"/>
</g>
<!-- Node96 -->
<g id="node41" class="node">
<title>Node96</title>
<g id="a_node41"><a xlink:href="pad__utils_8h.html" target="_top" xlink:title="Padding helpers. ">
-<polygon fill="#ffffff" stroke="#000000" points="2077,-201.5 2077,-231.5 2204,-231.5 2204,-201.5 2077,-201.5"/>
-<text text-anchor="start" x="2085" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
-<text text-anchor="middle" x="2140.5" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/pad_utils.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2358.5,-201.5 2358.5,-231.5 2485.5,-231.5 2485.5,-201.5 2358.5,-201.5"/>
+<text text-anchor="start" x="2366.5" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
+<text text-anchor="middle" x="2422" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/pad_utils.h</text>
</a>
</g>
</g>
<!-- Node59->Node96 -->
<g id="edge73" class="edge">
<title>Node59->Node96</title>
-<path fill="none" stroke="#191970" d="M2032.7989,-475.8251C2087.9663,-468.1631 2153.2355,-454.7503 2170.5,-433 2218.8624,-372.0719 2170.5319,-269.8932 2149.3558,-231.5995"/>
-<polygon fill="#191970" stroke="#191970" points="2032.0751,-472.3908 2022.6294,-477.1896 2033.006,-479.3286 2032.0751,-472.3908"/>
+<path fill="none" stroke="#191970" d="M2360.6241,-472.429C2397.1066,-462.8719 2441.029,-448.5751 2452,-433 2496.6871,-369.5592 2450.8358,-269.3206 2430.5793,-231.5577"/>
+<polygon fill="#191970" stroke="#191970" points="2359.389,-469.132 2350.565,-474.9961 2361.1199,-475.9146 2359.389,-469.132"/>
</g>
<!-- Node97 -->
<g id="node42" class="node">
<title>Node97</title>
<g id="a_node42"><a xlink:href="ravel__unravel_8h.html" target="_top" xlink:title="Index ravel and unraval operations. ">
-<polygon fill="#ffffff" stroke="#000000" points="2559,-402.5 2559,-432.5 2686,-432.5 2686,-402.5 2559,-402.5"/>
-<text text-anchor="start" x="2567" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
-<text text-anchor="middle" x="2622.5" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/ravel_unravel.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2802.5,-402.5 2802.5,-432.5 2929.5,-432.5 2929.5,-402.5 2802.5,-402.5"/>
+<text text-anchor="start" x="2810.5" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
+<text text-anchor="middle" x="2866" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/ravel_unravel.h</text>
</a>
</g>
</g>
<!-- Node59->Node97 -->
<g id="edge75" class="edge">
<title>Node59->Node97</title>
-<path fill="none" stroke="#191970" d="M2032.7811,-480.1851C2148.1421,-473.5503 2362.8973,-458.8235 2544.5,-433 2549.1474,-432.3391 2553.9363,-431.5804 2558.7469,-430.7615"/>
-<polygon fill="#191970" stroke="#191970" points="2032.3962,-476.7012 2022.6112,-480.7638 2032.7939,-483.6899 2032.3962,-476.7012"/>
+<path fill="none" stroke="#191970" d="M2391.2632,-477.8383C2487.2212,-469.6352 2649.5064,-454.147 2788,-433 2792.6404,-432.2914 2797.4241,-431.4977 2802.2311,-430.6539"/>
+<polygon fill="#191970" stroke="#191970" points="2390.7105,-474.3726 2381.0424,-478.7059 2391.3027,-481.3475 2390.7105,-474.3726"/>
</g>
<!-- Node98 -->
<g id="node43" class="node">
<title>Node98</title>
<g id="a_node43"><a xlink:href="tensor__utils_8h.html" target="_top" xlink:title="Utility functions for handling tensor. ">
-<polygon fill="#ffffff" stroke="#000000" points="2375,-402.5 2375,-432.5 2502,-432.5 2502,-402.5 2375,-402.5"/>
-<text text-anchor="start" x="2383" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
-<text text-anchor="middle" x="2438.5" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/tensor_utils.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2618.5,-402.5 2618.5,-432.5 2745.5,-432.5 2745.5,-402.5 2618.5,-402.5"/>
+<text text-anchor="start" x="2626.5" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
+<text text-anchor="middle" x="2682" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/tensor_utils.h</text>
</a>
</g>
</g>
<!-- Node59->Node98 -->
<g id="edge79" class="edge">
<title>Node59->Node98</title>
-<path fill="none" stroke="#191970" d="M2033.0118,-475.8069C2116.2302,-466.7282 2247.547,-451.2756 2360.5,-433 2365.1339,-432.2502 2369.9128,-431.4262 2374.7164,-430.5609"/>
-<polygon fill="#191970" stroke="#191970" points="2032.4187,-472.3507 2022.8548,-476.9094 2033.1741,-479.3098 2032.4187,-472.3507"/>
+<path fill="none" stroke="#191970" d="M2375.2032,-473.2782C2435.2515,-463.1391 2525.5932,-447.6091 2604,-433 2608.6148,-432.1402 2613.3796,-431.2353 2618.1732,-430.3126"/>
+<polygon fill="#191970" stroke="#191970" points="2374.3729,-469.8687 2365.094,-474.9826 2375.5367,-476.7713 2374.3729,-469.8687"/>
</g>
<!-- Node99 -->
<g id="node44" class="node">
<title>Node99</title>
<g id="a_node44"><a xlink:href="nn_2dense_8h.html" target="_top" xlink:title="Dense op constructions. ">
-<polygon fill="#ffffff" stroke="#000000" points="430,-268.5 430,-298.5 541,-298.5 541,-268.5 430,-268.5"/>
-<text text-anchor="start" x="438" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
-<text text-anchor="middle" x="485.5" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/dense.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2124.5,-268.5 2124.5,-298.5 2235.5,-298.5 2235.5,-268.5 2124.5,-268.5"/>
+<text text-anchor="start" x="2132.5" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
+<text text-anchor="middle" x="2180" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/dense.h</text>
</a>
</g>
</g>
<!-- Node59->Node99 -->
<g id="edge88" class="edge">
<title>Node59->Node99</title>
-<path fill="none" stroke="#191970" d="M1895.4653,-473.1481C1885.8559,-471.4904 1875.9117,-470.0058 1866.5,-469 1726.0211,-453.9867 726.5008,-485.901 595.5,-433 576.5384,-425.3429 579.7219,-411.2809 561.5,-402 490.7426,-365.9614 436.6741,-427.5463 386.5,-366 364.2346,-338.688 406.8389,-313.7552 442.9205,-298.6121"/>
-<polygon fill="#191970" stroke="#191970" points="1894.8594,-476.5952 1905.3218,-474.9256 1896.1017,-469.7064 1894.8594,-476.5952"/>
+<path fill="none" stroke="#191970" d="M2275.8579,-470.0791C2260.782,-461.5534 2244.1545,-449.219 2235,-433 2213.5362,-394.9727 2245.4368,-375.034 2228,-335 2221.8505,-320.881 2210.1264,-308.1845 2199.805,-298.9307"/>
+<polygon fill="#191970" stroke="#191970" points="2274.4875,-473.3137 2284.9636,-474.896 2277.7608,-467.1261 2274.4875,-473.3137"/>
</g>
<!-- Node59->Node100 -->
<g id="edge91" class="edge">
<title>Node59->Node100</title>
-<path fill="none" stroke="#191970" d="M2032.7342,-482.9195C2271.4285,-478.0835 2955.3541,-462.0231 3177.5,-433 3182.1901,-432.3872 3187.0301,-431.6149 3191.8723,-430.7438"/>
-<polygon fill="#191970" stroke="#191970" points="2032.6545,-479.4203 2022.727,-483.1209 2032.7954,-486.4189 2032.6545,-479.4203"/>
+<path fill="none" stroke="#191970" d="M2391.4835,-482.6433C2620.0092,-477.2611 3252.4675,-460.2774 3459,-433 3463.6892,-432.3807 3468.5286,-431.6037 3473.3704,-430.7295"/>
+<polygon fill="#191970" stroke="#191970" points="2391.1512,-479.15 2381.2359,-482.8833 2391.3151,-486.1481 2391.1512,-479.15"/>
</g>
<!-- Node101 -->
<g id="node46" class="node">
<title>Node101</title>
-<g id="a_node46"><a xlink:href="local__response__norm_8h.html" target="_top" xlink:title="local response normalization op constructions ">
-<polygon fill="#ffffff" stroke="#000000" points="291.5,-402.5 291.5,-432.5 423.5,-432.5 423.5,-402.5 291.5,-402.5"/>
-<text text-anchor="start" x="299.5" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
-<text text-anchor="middle" x="357.5" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/local_response_norm.h</text>
+<g id="a_node46"><a xlink:href="layer__norm_8h.html" target="_top" xlink:title="layer normalization op constructions ">
+<polygon fill="#ffffff" stroke="#000000" points="3062.5,-402.5 3062.5,-432.5 3173.5,-432.5 3173.5,-402.5 3062.5,-402.5"/>
+<text text-anchor="start" x="3070.5" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
+<text text-anchor="middle" x="3118" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/layer_norm.h</text>
</a>
</g>
</g>
<!-- Node59->Node101 -->
<g id="edge93" class="edge">
<title>Node59->Node101</title>
-<path fill="none" stroke="#191970" d="M1895.7993,-473.1799C1886.087,-471.5018 1876.0215,-470.0021 1866.5,-469 1232.4677,-402.2707 1065.8633,-505.8056 432.5,-433 429.594,-432.666 426.6336,-432.2749 423.6496,-431.839"/>
-<polygon fill="#191970" stroke="#191970" points="1895.2918,-476.6448 1905.755,-474.9799 1896.5373,-469.7565 1895.2918,-476.6448"/>
+<path fill="none" stroke="#191970" d="M2391.1567,-482.7315C2528.5436,-478.9114 2811.2803,-467.2521 3048,-433 3052.6812,-432.3227 3057.515,-431.5049 3062.3529,-430.6033"/>
+<polygon fill="#191970" stroke="#191970" points="2391.0198,-479.2338 2381.1184,-483.0038 2391.2097,-486.2313 2391.0198,-479.2338"/>
</g>
<!-- Node102 -->
<g id="node47" class="node">
<title>Node102</title>
-<g id="a_node47"><a xlink:href="mapping_8h.html" target="_top" xlink:title="Mapping op constructions. ">
-<polygon fill="#ffffff" stroke="#000000" points="442,-402.5 442,-432.5 553,-432.5 553,-402.5 442,-402.5"/>
-<text text-anchor="start" x="450" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
-<text text-anchor="middle" x="497.5" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/mapping.h</text>
+<g id="a_node47"><a xlink:href="local__response__norm_8h.html" target="_top" xlink:title="local response normalization op constructions ">
+<polygon fill="#ffffff" stroke="#000000" points="0,-402.5 0,-432.5 132,-432.5 132,-402.5 0,-402.5"/>
+<text text-anchor="start" x="8" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
+<text text-anchor="middle" x="66" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/local_response_norm.h</text>
</a>
</g>
</g>
<!-- Node59->Node102 -->
<g id="edge94" class="edge">
<title>Node59->Node102</title>
-<path fill="none" stroke="#191970" d="M1895.4659,-473.1433C1885.8564,-471.4861 1875.912,-470.003 1866.5,-469 1292.1967,-407.7999 1140.7051,-503.7511 567.5,-433 562.8057,-432.4206 557.9627,-431.6717 553.1185,-430.8163"/>
-<polygon fill="#191970" stroke="#191970" points="1894.86,-476.5904 1905.3224,-474.9205 1896.1021,-469.7015 1894.86,-476.5904"/>
+<path fill="none" stroke="#191970" d="M2254.6367,-473.1749C2244.8216,-471.48 2234.6336,-469.9758 2225,-469 1303.3555,-375.6458 1061.707,-535.187 141,-433 138.0927,-432.6773 135.1313,-432.2956 132.1464,-431.8675"/>
+<polygon fill="#191970" stroke="#191970" points="2254.2273,-476.6576 2264.6911,-474.9961 2255.475,-469.7697 2254.2273,-476.6576"/>
+</g>
+<!-- Node103 -->
+<g id="node48" class="node">
+<title>Node103</title>
+<g id="a_node48"><a xlink:href="mapping_8h.html" target="_top" xlink:title="Mapping op constructions. ">
+<polygon fill="#ffffff" stroke="#000000" points="150.5,-402.5 150.5,-432.5 261.5,-432.5 261.5,-402.5 150.5,-402.5"/>
+<text text-anchor="start" x="158.5" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
+<text text-anchor="middle" x="206" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/mapping.h</text>
+</a>
+</g>
+</g>
+<!-- Node59->Node103 -->
+<g id="edge95" class="edge">
+<title>Node59->Node103</title>
+<path fill="none" stroke="#191970" d="M2254.3047,-473.1273C2244.5919,-471.4547 2234.5247,-469.9705 2225,-469 1794.0464,-425.0879 706.1629,-484.0793 276,-433 271.3031,-432.4423 266.4582,-431.7086 261.6128,-430.8635"/>
+<polygon fill="#191970" stroke="#191970" points="2253.7979,-476.5923 2264.2607,-474.9249 2255.0417,-469.7037 2253.7979,-476.5923"/>
</g>
<!-- Node60->Node61 -->
<g id="edge5" class="edge">
<title>Node60->Node61</title>
-<path fill="none" stroke="#191970" d="M800.5939,-327.1452C780.7533,-299.7365 748.4244,-255.076 731.5474,-231.7614"/>
-<polygon fill="#191970" stroke="#191970" points="797.8625,-329.3409 806.5614,-335.389 803.5328,-325.2362 797.8625,-329.3409"/>
+<path fill="none" stroke="#191970" d="M2135.0933,-327.2648C2128.5716,-318.5379 2121.2662,-308.4439 2115,-299 2099.8029,-276.0963 2083.8402,-248.4447 2074.412,-231.6802"/>
+<polygon fill="#191970" stroke="#191970" points="2132.4654,-329.5925 2141.2863,-335.4612 2138.0504,-325.3726 2132.4654,-329.5925"/>
</g>
<!-- Node61->Node62 -->
<g id="edge6" class="edge">
<title>Node61->Node62</title>
-<path fill="none" stroke="#191970" d="M704.2102,-192.7735C697.7852,-183.4154 690.5946,-172.9421 684.8968,-164.6432"/>
-<polygon fill="#191970" stroke="#191970" points="701.5164,-195.0337 710.0619,-201.2967 707.2872,-191.0716 701.5164,-195.0337"/>
+<path fill="none" stroke="#191970" d="M2086.8142,-193.6385C2095.5301,-184.0653 2105.4235,-173.1987 2113.2129,-164.6432"/>
+<polygon fill="#191970" stroke="#191970" points="2083.986,-191.5459 2079.8418,-201.2967 2089.1621,-196.2585 2083.986,-191.5459"/>
</g>
<!-- Node63->Node62 -->
<g id="edge8" class="edge">
<title>Node63->Node62</title>
-<path fill="none" stroke="#191970" d="M90.4801,-328.5729C112.2031,-309.8441 145.8696,-283.5379 179.5,-268 326.098,-200.2689 515.6482,-168.8515 612.1189,-156.4176"/>
-<polygon fill="#191970" stroke="#191970" points="87.9736,-326.1159 82.7644,-335.3416 92.5899,-331.378 87.9736,-326.1159"/>
+<path fill="none" stroke="#191970" d="M2319.3595,-328.5012C2275.117,-287.3311 2180.878,-199.6365 2143.1289,-164.5088"/>
+<polygon fill="#191970" stroke="#191970" points="2317.0599,-331.1422 2326.7649,-335.3923 2321.8285,-326.0177 2317.0599,-331.1422"/>
</g>
<!-- Node64->Node65 -->
<g id="edge11" class="edge">
<title>Node64->Node65</title>
-<path fill="none" stroke="#191970" d="M2029.8802,-325.0249C2030.013,-316.128 2030.1578,-306.4287 2030.274,-298.6432"/>
-<polygon fill="#191970" stroke="#191970" points="2026.3766,-325.2455 2029.7269,-335.2967 2033.3759,-325.35 2026.3766,-325.2455"/>
+<path fill="none" stroke="#191970" d="M515,-325.0249C515,-316.128 515,-306.4287 515,-298.6432"/>
+<polygon fill="#191970" stroke="#191970" points="511.5001,-325.2966 515,-335.2967 518.5001,-325.2967 511.5001,-325.2966"/>
</g>
<!-- Node66->Node67 -->
<g id="edge13" class="edge">
<title>Node66->Node67</title>
-<path fill="none" stroke="#191970" d="M1762.2373,-326.4837C1757.1103,-317.1996 1751.3984,-306.8565 1746.8626,-298.6432"/>
-<polygon fill="#191970" stroke="#191970" points="1759.206,-328.2348 1767.1041,-335.2967 1765.3337,-324.8508 1759.206,-328.2348"/>
+<path fill="none" stroke="#191970" d="M1503.2627,-326.4837C1508.3897,-317.1996 1514.1016,-306.8565 1518.6374,-298.6432"/>
+<polygon fill="#191970" stroke="#191970" points="1500.1663,-324.8508 1498.3959,-335.2967 1506.294,-328.2348 1500.1663,-324.8508"/>
</g>
<!-- Node68->Node69 -->
<g id="edge15" class="edge">
<title>Node68->Node69</title>
-<path fill="none" stroke="#191970" d="M1559.5,-325.0249C1559.5,-316.128 1559.5,-306.4287 1559.5,-298.6432"/>
-<polygon fill="#191970" stroke="#191970" points="1556.0001,-325.2966 1559.5,-335.2967 1563.0001,-325.2967 1556.0001,-325.2966"/>
+<path fill="none" stroke="#191970" d="M909,-325.0249C909,-316.128 909,-306.4287 909,-298.6432"/>
+<polygon fill="#191970" stroke="#191970" points="905.5001,-325.2966 909,-335.2967 912.5001,-325.2967 905.5001,-325.2966"/>
</g>
<!-- Node70->Node71 -->
<g id="edge17" class="edge">
<title>Node70->Node71</title>
-<path fill="none" stroke="#191970" d="M250.5,-325.0249C250.5,-316.128 250.5,-306.4287 250.5,-298.6432"/>
-<polygon fill="#191970" stroke="#191970" points="247.0001,-325.2966 250.5,-335.2967 254.0001,-325.2967 247.0001,-325.2966"/>
+<path fill="none" stroke="#191970" d="M731,-325.0249C731,-316.128 731,-306.4287 731,-298.6432"/>
+<polygon fill="#191970" stroke="#191970" points="727.5001,-325.2966 731,-335.2967 734.5001,-325.2967 727.5001,-325.2966"/>
</g>
<!-- Node72->Node61 -->
<g id="edge19" class="edge">
<title>Node72->Node61</title>
-<path fill="none" stroke="#191970" d="M1896.7088,-392.5668C1892.0593,-375.5341 1884.4792,-352.897 1873.5,-335 1851.9665,-299.8986 1846.9694,-285.0835 1809.5,-268 1716.0176,-225.3785 997.7372,-218.0089 781.7432,-216.7535"/>
-<polygon fill="#191970" stroke="#191970" points="1893.3461,-393.5439 1899.2291,-402.3553 1900.125,-391.7985 1893.3461,-393.5439"/>
+<path fill="none" stroke="#191970" d="M1744.9067,-410.9125C1818.9614,-403.1776 1926.8617,-388.4769 1962,-366 2014.2034,-332.6071 2047.4333,-262.7129 2060.1421,-231.7844"/>
+<polygon fill="#191970" stroke="#191970" points="1744.2847,-407.4577 1734.693,-411.9577 1744.9974,-414.4214 1744.2847,-407.4577"/>
</g>
<!-- Node72->Node62 -->
<g id="edge21" class="edge">
<title>Node72->Node62</title>
-<path fill="none" stroke="#191970" d="M1903.9588,-392.201C1904.0665,-375.4424 1902.5351,-353.2659 1895.5,-335 1882.0497,-300.0775 1877.1507,-286.2872 1844.5,-268 1747.2533,-213.5335 964.5577,-165.7626 736.5886,-152.8971"/>
-<polygon fill="#191970" stroke="#191970" points="1900.4565,-392.2474 1903.6927,-402.3359 1907.4541,-392.4312 1900.4565,-392.2474"/>
+<path fill="none" stroke="#191970" d="M1665.5841,-392.4002C1663.2405,-374.9355 1663.1212,-351.8523 1674,-335 1760.1906,-201.4825 1962.1759,-164.0458 2064.7972,-153.5647"/>
+<polygon fill="#191970" stroke="#191970" points="1662.1608,-393.1512 1667.2523,-402.4424 1669.0662,-392.004 1662.1608,-393.1512"/>
</g>
<!-- Node72->Node66 -->
<g id="edge20" class="edge">
<title>Node72->Node66</title>
-<path fill="none" stroke="#191970" d="M1865.1095,-397.7743C1845.6978,-387.5335 1822.3444,-375.2132 1804.3456,-365.7177"/>
-<polygon fill="#191970" stroke="#191970" points="1863.5209,-400.8934 1873.9987,-402.4639 1866.7872,-394.7021 1863.5209,-400.8934"/>
+<path fill="none" stroke="#191970" d="M1620.6577,-398.865C1592.3253,-388.3773 1557.4062,-375.4515 1530.7971,-365.6017"/>
+<polygon fill="#191970" stroke="#191970" points="1619.7869,-402.2747 1630.3801,-402.4639 1622.217,-395.71 1619.7869,-402.2747"/>
</g>
<!-- Node72->Node67 -->
<g id="edge22" class="edge">
<title>Node72->Node67</title>
-<path fill="none" stroke="#191970" d="M1890.139,-393.0882C1880.085,-375.1703 1864.4795,-351.2275 1845.5,-335 1834.1005,-325.2534 1801.0945,-309.9498 1774.725,-298.5544"/>
-<polygon fill="#191970" stroke="#191970" points="1887.3042,-395.2041 1895.1396,-402.3355 1893.4616,-391.8744 1887.3042,-395.2041"/>
+<path fill="none" stroke="#191970" d="M1650.9026,-394.4298C1635.6417,-377.3715 1613.8024,-353.9191 1593,-335 1578.5921,-321.8964 1561.1843,-308.4003 1547.8647,-298.5129"/>
+<polygon fill="#191970" stroke="#191970" points="1648.5782,-397.0839 1657.8324,-402.2424 1653.815,-392.4388 1648.5782,-397.0839"/>
</g>
<!-- Node74 -->
<g id="node19" class="node">
<title>Node74</title>
<g id="a_node19"><a xlink:href="broadcast_8h.html" target="_top" xlink:title="Broadcast op constructions. ">
-<polygon fill="#ffffff" stroke="#000000" points="2865.5,-274 2865.5,-293 3021.5,-293 3021.5,-274 2865.5,-274"/>
-<text text-anchor="middle" x="2943.5" y="-281" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/broadcast.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="3185,-274 3185,-293 3341,-293 3341,-274 3185,-274"/>
+<text text-anchor="middle" x="3263" y="-281" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/broadcast.h</text>
</a>
</g>
</g>
<!-- Node73->Node74 -->
<g id="edge24" class="edge">
<title>Node73->Node74</title>
-<path fill="none" stroke="#191970" d="M2805.2379,-332.0003C2841.2898,-319.3538 2887.5881,-303.1131 2916.4028,-293.0053"/>
-<polygon fill="#191970" stroke="#191970" points="2803.6418,-328.851 2795.3641,-335.4639 2805.959,-335.4564 2803.6418,-328.851"/>
+<path fill="none" stroke="#191970" d="M3095.0245,-332.6457C3138.5509,-319.9109 3195.3737,-303.2859 3230.5118,-293.0053"/>
+<polygon fill="#191970" stroke="#191970" points="3094.0069,-329.2966 3085.3921,-335.4639 3095.9726,-336.015 3094.0069,-329.2966"/>
</g>
<!-- Node73->Node82 -->
<g id="edge39" class="edge">
<title>Node73->Node82</title>
-<path fill="none" stroke="#191970" d="M2729.8496,-328.3456C2699.5194,-298.6796 2647.0169,-247.3267 2625.3192,-226.1042"/>
-<polygon fill="#191970" stroke="#191970" points="2727.4544,-330.8987 2737.0507,-335.389 2732.3491,-325.8945 2727.4544,-330.8987"/>
+<path fill="none" stroke="#191970" d="M3016.6148,-327.2038C2994.3574,-297.3789 2956.8037,-247.0569 2941.1673,-226.1042"/>
+<polygon fill="#191970" stroke="#191970" points="3013.9372,-329.4679 3022.7231,-335.389 3019.5473,-325.2813 3013.9372,-329.4679"/>
</g>
<!-- Node75 -->
<g id="node20" class="node">
<title>Node75</title>
<g id="a_node20"><a xlink:href="elemwise_8h.html" target="_top" xlink:title="Elementwise op constructions. ">
-<polygon fill="#ffffff" stroke="#000000" points="2812,-207 2812,-226 2967,-226 2967,-207 2812,-207"/>
-<text text-anchor="middle" x="2889.5" y="-214" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/elemwise.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="3185.5,-207 3185.5,-226 3340.5,-226 3340.5,-207 3185.5,-207"/>
+<text text-anchor="middle" x="3263" y="-214" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/elemwise.h</text>
</a>
</g>
</g>
<!-- Node74->Node75 -->
<g id="edge25" class="edge">
<title>Node74->Node75</title>
-<path fill="none" stroke="#191970" d="M2929.2735,-265.8486C2919.0141,-253.1194 2905.5573,-236.4229 2897.2225,-226.0817"/>
-<polygon fill="#191970" stroke="#191970" points="2926.7627,-268.3109 2935.7631,-273.9005 2932.2129,-263.9182 2926.7627,-268.3109"/>
+<path fill="none" stroke="#191970" d="M3263,-263.6079C3263,-251.214 3263,-235.8263 3263,-226.0817"/>
+<polygon fill="#191970" stroke="#191970" points="3259.5001,-263.9005 3263,-273.9005 3266.5001,-263.9006 3259.5001,-263.9005"/>
</g>
<!-- Node74->Node76 -->
<g id="edge33" class="edge">
<title>Node74->Node76</title>
-<path fill="none" stroke="#191970" d="M2958.1078,-265.561C2964.8178,-256.1684 2972.0413,-244.1199 2975.5,-232 2979.2809,-218.7512 2979.6379,-214.1417 2975.5,-201 2970.3235,-184.5597 2957.8236,-168.7955 2949.012,-159.1353"/>
-<polygon fill="#191970" stroke="#191970" points="2955.2159,-263.5823 2951.9714,-273.6682 2960.7974,-267.807 2955.2159,-263.5823"/>
+<path fill="none" stroke="#191970" d="M3221.8804,-269.7464C3204.4882,-261.6559 3185.9755,-249.4972 3176,-232 3161.4152,-206.4179 3188.7993,-174.7258 3205.1107,-159.1143"/>
+<polygon fill="#191970" stroke="#191970" points="3220.7972,-273.0906 3231.3661,-273.8306 3223.5655,-266.6612 3220.7972,-273.0906"/>
</g>
<!-- Node74->Node81 -->
<g id="edge32" class="edge">
<title>Node74->Node81</title>
-<path fill="none" stroke="#191970" d="M2893.6802,-271.197C2865.4997,-262.8363 2830.4252,-249.9932 2802.5,-232 2773.647,-213.409 2747.1991,-182.8444 2732.8168,-164.5971"/>
-<polygon fill="#191970" stroke="#191970" points="2892.7599,-274.5741 2903.3374,-273.9684 2894.6908,-267.8457 2892.7599,-274.5741"/>
+<path fill="none" stroke="#191970" d="M3236.131,-268.9904C3217.0934,-258.6962 3190.9601,-244.5373 3168,-232 3125.09,-208.5692 3075.3901,-181.1858 3045.3441,-164.6043"/>
+<polygon fill="#191970" stroke="#191970" points="3234.6616,-272.1747 3245.1231,-273.8507 3237.9901,-266.0167 3234.6616,-272.1747"/>
</g>
<!-- Node74->Node82 -->
<g id="edge34" class="edge">
<title>Node74->Node82</title>
-<path fill="none" stroke="#191970" d="M2887.0009,-271.959C2823.4736,-258.9824 2721.4797,-238.1483 2662.2504,-226.0496"/>
-<polygon fill="#191970" stroke="#191970" points="2886.3557,-275.3994 2896.8539,-273.9717 2887.7568,-268.5411 2886.3557,-275.3994"/>
+<path fill="none" stroke="#191970" d="M3206.3287,-271.959C3142.6077,-258.9824 3040.3028,-238.1483 2980.8929,-226.0496"/>
+<polygon fill="#191970" stroke="#191970" points="3205.7144,-275.4057 3216.2117,-273.9717 3207.1113,-268.5465 3205.7144,-275.4057"/>
</g>
<!-- Node75->Node76 -->
<g id="edge26" class="edge">
<title>Node75->Node76</title>
-<path fill="none" stroke="#191970" d="M2902.6727,-198.8486C2912.1721,-186.1194 2924.6321,-169.4229 2932.3495,-159.0817"/>
-<polygon fill="#191970" stroke="#191970" points="2899.8396,-196.7929 2896.6638,-206.9005 2905.4497,-200.9795 2899.8396,-196.7929"/>
+<path fill="none" stroke="#191970" d="M3250.3593,-198.4803C3241.4634,-185.7989 3229.906,-169.3235 3222.7215,-159.0817"/>
+<polygon fill="#191970" stroke="#191970" points="3247.6579,-200.7239 3256.266,-206.9005 3253.3885,-196.7039 3247.6579,-200.7239"/>
</g>
<!-- Node76->Node77 -->
<g id="edge27" class="edge">
<title>Node76->Node77</title>
-<path fill="none" stroke="#191970" d="M2949.1469,-130.7654C2962.9424,-103.9741 2988.0366,-55.2405 3000.7189,-30.611"/>
-<polygon fill="#191970" stroke="#191970" points="2945.9286,-129.3702 2944.4623,-139.8631 2952.152,-132.5748 2945.9286,-129.3702"/>
+<path fill="none" stroke="#191970" d="M3225.3673,-130.7654C3238.7629,-103.9741 3263.1298,-55.2405 3275.4445,-30.611"/>
+<polygon fill="#191970" stroke="#191970" points="3222.1602,-129.3535 3220.8185,-139.8631 3228.4212,-132.484 3222.1602,-129.3535"/>
</g>
<!-- Node76->Node78 -->
<g id="edge28" class="edge">
<title>Node76->Node78</title>
-<path fill="none" stroke="#191970" d="M2867.998,-138.4362C2782.1633,-125.1546 2640.0533,-103.1653 2562.0314,-91.0926"/>
-<polygon fill="#191970" stroke="#191970" points="2867.5039,-141.9013 2877.9215,-139.9717 2868.5743,-134.9836 2867.5039,-141.9013"/>
+<path fill="none" stroke="#191970" d="M3128.6572,-139.2553C3113.1204,-137.4734 3097.0855,-135.6604 3082,-134 2923.5382,-116.5591 2736.256,-97.3921 2644.8418,-88.13"/>
+<polygon fill="#191970" stroke="#191970" points="3128.4457,-142.754 3138.7802,-140.42 3129.2458,-135.7999 3128.4457,-142.754"/>
</g>
<!-- Node76->Node79 -->
<g id="edge29" class="edge">
<title>Node76->Node79</title>
-<path fill="none" stroke="#191970" d="M2977.6739,-136.5826C3016.1553,-123.5611 3074.6563,-103.7653 3109.1841,-92.0817"/>
-<polygon fill="#191970" stroke="#191970" points="2976.2191,-133.3798 2967.8686,-139.9005 2978.4628,-140.0105 2976.2191,-133.3798"/>
+<path fill="none" stroke="#191970" d="M3255.016,-136.7484C3294.8494,-123.7297 3355.7824,-103.815 3391.683,-92.0817"/>
+<polygon fill="#191970" stroke="#191970" points="3253.7894,-133.4671 3245.3715,-139.9005 3255.9641,-140.1207 3253.7894,-133.4671"/>
</g>
<!-- Node76->Node80 -->
<g id="edge31" class="edge">
<title>Node76->Node80</title>
-<path fill="none" stroke="#191970" d="M2900.6772,-136.7494C2866.6571,-125.5761 2817.1696,-109.3229 2781.2379,-97.5218"/>
-<polygon fill="#191970" stroke="#191970" points="2899.6789,-140.1054 2910.2718,-139.9005 2901.8632,-133.4549 2899.6789,-140.1054"/>
+<path fill="none" stroke="#191970" d="M3150.6069,-138.2658C3078.1158,-125.8122 2961.9547,-105.8563 2890.0205,-93.4984"/>
+<polygon fill="#191970" stroke="#191970" points="3150.0884,-141.7279 3160.5367,-139.9717 3151.2737,-134.829 3150.0884,-141.7279"/>
</g>
<!-- Node79->Node77 -->
<g id="edge30" class="edge">
<title>Node79->Node77</title>
-<path fill="none" stroke="#191970" d="M3110.1278,-68.2834C3088.7672,-57.1892 3059.1688,-41.8164 3037.4225,-30.5218"/>
-<polygon fill="#191970" stroke="#191970" points="3108.5298,-71.3973 3119.0174,-72.9005 3111.7563,-65.1852 3108.5298,-71.3973"/>
+<path fill="none" stroke="#191970" d="M3392.025,-68.4324C3369.1514,-57.3271 3337.3082,-41.867 3313.9403,-30.5218"/>
+<polygon fill="#191970" stroke="#191970" points="3390.7035,-71.6815 3401.228,-72.9005 3393.7608,-65.3844 3390.7035,-71.6815"/>
</g>
<!-- Node82->Node76 -->
<g id="edge37" class="edge">
<title>Node82->Node76</title>
-<path fill="none" stroke="#191970" d="M2671.6979,-204.8788C2734.4771,-191.8967 2834.9331,-171.1234 2893.3197,-159.0496"/>
-<polygon fill="#191970" stroke="#191970" points="2670.6612,-201.5191 2661.5772,-206.9717 2672.0788,-208.374 2670.6612,-201.5191"/>
+<path fill="none" stroke="#191970" d="M2983.9162,-204.6405C3038.5367,-191.6633 3125.0369,-171.1118 3175.5775,-159.1039"/>
+<polygon fill="#191970" stroke="#191970" points="2983.0243,-201.2549 2974.1042,-206.9717 2984.6425,-208.0653 2983.0243,-201.2549"/>
</g>
<!-- Node82->Node79 -->
<g id="edge36" class="edge">
<title>Node82->Node79</title>
-<path fill="none" stroke="#191970" d="M2619.4931,-197.0169C2624.5647,-177.6393 2635.3331,-148.7348 2656.5,-134 2690.3608,-110.4287 2955.6337,-92.7005 3077.115,-85.7329"/>
-<polygon fill="#191970" stroke="#191970" points="2616.0513,-196.3645 2617.1806,-206.899 2622.8671,-197.9596 2616.0513,-196.3645"/>
+<path fill="none" stroke="#191970" d="M2932.5955,-196.6029C2932.4418,-177.5668 2935.5482,-149.5245 2953,-134 2982.9258,-107.379 3241.3607,-91.3331 3360.8582,-85.2739"/>
+<polygon fill="#191970" stroke="#191970" points="2929.1082,-196.9979 2932.996,-206.8536 2936.1029,-196.7246 2929.1082,-196.9979"/>
</g>
<!-- Node82->Node80 -->
<g id="edge38" class="edge">
<title>Node82->Node80</title>
-<path fill="none" stroke="#191970" d="M2612.2411,-196.8349C2610.3649,-178.8204 2610.5149,-152.2567 2623.5,-134 2635.6562,-116.9086 2654.9184,-105.3557 2674.0647,-97.6056"/>
-<polygon fill="#191970" stroke="#191970" points="2608.7786,-197.3489 2613.5745,-206.796 2615.7167,-196.42 2608.7786,-197.3489"/>
+<path fill="none" stroke="#191970" d="M2919.7472,-198.816C2898.3491,-172.2665 2858.292,-122.566 2838.179,-97.611"/>
+<polygon fill="#191970" stroke="#191970" points="2917.2325,-201.2734 2926.2329,-206.8631 2922.6827,-196.8807 2917.2325,-201.2734"/>
</g>
<!-- Node82->Node81 -->
<g id="edge35" class="edge">
<title>Node82->Node81</title>
-<path fill="none" stroke="#191970" d="M2639.1826,-201.5308C2656.6314,-190.5019 2680.2658,-175.5631 2697.7342,-164.5218"/>
-<polygon fill="#191970" stroke="#191970" points="2637.2702,-198.599 2630.6872,-206.9005 2641.0103,-204.5161 2637.2702,-198.599"/>
+<path fill="none" stroke="#191970" d="M2953.92,-200.6115C2967.6219,-189.6825 2985.7048,-175.2592 2999.1667,-164.5218"/>
+<polygon fill="#191970" stroke="#191970" points="2951.6705,-197.9287 2946.0352,-206.9005 2956.0354,-203.4011 2951.6705,-197.9287"/>
</g>
<!-- Node83->Node73 -->
<g id="edge42" class="edge">
<title>Node83->Node73</title>
-<path fill="none" stroke="#191970" d="M2922.8096,-399.6842C2886.01,-389.0568 2839.7927,-375.7095 2804.7924,-365.6017"/>
-<polygon fill="#191970" stroke="#191970" points="2921.8562,-403.0519 2932.4347,-402.4639 2923.7984,-396.3267 2921.8562,-403.0519"/>
+<path fill="none" stroke="#191970" d="M3204.3096,-399.6842C3167.51,-389.0568 3121.2927,-375.7095 3086.2924,-365.6017"/>
+<polygon fill="#191970" stroke="#191970" points="3203.3562,-403.0519 3213.9347,-402.4639 3205.2984,-396.3267 3203.3562,-403.0519"/>
</g>
<!-- Node83->Node74 -->
<g id="edge41" class="edge">
<title>Node83->Node74</title>
-<path fill="none" stroke="#191970" d="M2990.2976,-392.2298C2992.9764,-375.4834 2994.2566,-353.3098 2987.5,-335 2981.1489,-317.7891 2966.1994,-302.4301 2955.4891,-293.057"/>
-<polygon fill="#191970" stroke="#191970" points="2986.8019,-391.8829 2988.4057,-402.3557 2993.6828,-393.1686 2986.8019,-391.8829"/>
+<path fill="none" stroke="#191970" d="M3289.786,-394.4321C3296.7103,-386.1885 3303.3807,-376.3571 3307,-366 3311.5451,-352.9935 3311.7698,-347.9258 3307,-335 3300.6489,-317.7891 3285.6994,-302.4301 3274.9891,-293.057"/>
+<polygon fill="#191970" stroke="#191970" points="3286.9673,-392.3321 3282.8807,-402.1071 3292.1711,-397.014 3286.9673,-392.3321"/>
</g>
<!-- Node83->Node76 -->
<g id="edge49" class="edge">
<title>Node83->Node76</title>
-<path fill="none" stroke="#191970" d="M3000.4892,-393.5919C3018.1407,-364.1999 3042.5658,-312.7128 3030.5,-268 3017.5247,-219.9167 2972.8248,-177.3021 2951.2859,-159"/>
-<polygon fill="#191970" stroke="#191970" points="2997.3677,-391.9833 2995.051,-402.3218 3003.3092,-395.6845 2997.3677,-391.9833"/>
+<path fill="none" stroke="#191970" d="M3294.671,-396.0679C3304.625,-387.5516 3315.228,-377.1304 3323,-366 3365.502,-305.1322 3391.4154,-261.9282 3349,-201 3337.7178,-184.7936 3288.8654,-168.7902 3253.6383,-159.028"/>
+<polygon fill="#191970" stroke="#191970" points="3292.3924,-393.4102 3286.8938,-402.4665 3296.8398,-398.8159 3292.3924,-393.4102"/>
</g>
<!-- Node83->Node79 -->
<g id="edge48" class="edge">
<title>Node83->Node79</title>
-<path fill="none" stroke="#191970" d="M3054.4009,-398.4805C3071.7979,-390.844 3089.0213,-380.3362 3101.5,-366 3173.6516,-283.1082 3147.8978,-131.4213 3139.646,-92.0424"/>
-<polygon fill="#191970" stroke="#191970" points="3052.6748,-395.4051 3044.747,-402.4337 3055.3275,-401.8831 3052.6748,-395.4051"/>
+<path fill="none" stroke="#191970" d="M3307.6633,-397.3623C3321.7429,-389.0897 3336.7326,-378.4938 3348,-366 3423.3944,-282.3999 3422.9712,-131.2304 3421.4878,-92.0038"/>
+<polygon fill="#191970" stroke="#191970" points="3305.5945,-394.51 3298.5906,-402.4596 3309.0233,-400.6128 3305.5945,-394.51"/>
</g>
<!-- Node83->Node80 -->
<g id="edge51" class="edge">
<title>Node83->Node80</title>
-<path fill="none" stroke="#191970" d="M3012.3378,-395.4742C3021.0698,-387.1539 3029.8844,-377.0107 3035.5,-366 3082.3903,-274.0601 3097.7337,-207.7153 3025.5,-134 2994.8557,-102.7271 2875.4386,-90.2483 2799.73,-85.4134"/>
-<polygon fill="#191970" stroke="#191970" points="3009.8557,-392.9995 3004.768,-402.2928 3014.5406,-398.2006 3009.8557,-392.9995"/>
+<path fill="none" stroke="#191970" d="M3192.3555,-403.8197C3188.8558,-403.1983 3185.3902,-402.5888 3182,-402 3083.9511,-384.9699 3047.6737,-414.9006 2961,-366 2892.8991,-327.5781 2879.0435,-303.3245 2847,-232 2826.4849,-186.3362 2824.8924,-125.5913 2825.4063,-97.6881"/>
+<polygon fill="#191970" stroke="#191970" points="3191.8133,-407.2782 3202.2737,-405.5958 3193.0472,-400.3878 3191.8133,-407.2782"/>
</g>
<!-- Node83->Node82 -->
<g id="edge50" class="edge">
<title>Node83->Node82</title>
-<path fill="none" stroke="#191970" d="M2910.626,-413.0798C2832.0948,-407.0052 2714.4839,-393.5339 2679.5,-366 2633.1315,-329.5059 2619.8168,-252.9552 2616.4908,-226.0768"/>
-<polygon fill="#191970" stroke="#191970" points="2910.7227,-416.5968 2920.9563,-413.8539 2911.2458,-409.6164 2910.7227,-416.5968"/>
+<path fill="none" stroke="#191970" d="M3263.2206,-392.5226C3259.7827,-373.8519 3252.1086,-349.1953 3235,-335 3163.8089,-275.9317 3112.7248,-340.3986 3030,-299 3012.0045,-289.9944 3012.1323,-281.2647 2997,-268 2979.7904,-252.9143 2959.1332,-236.2982 2946.2834,-226.1297"/>
+<polygon fill="#191970" stroke="#191970" points="3259.7671,-393.093 3264.7612,-402.4369 3266.6841,-392.0181 3259.7671,-393.093"/>
</g>
<!-- Node84 -->
<g id="node29" class="node">
<title>Node84</title>
<g id="a_node29"><a xlink:href="strided__slice_8h.html" target="_top" xlink:title="Utility functions for strided_slice op. ">
-<polygon fill="#ffffff" stroke="#000000" points="2720,-268.5 2720,-298.5 2847,-298.5 2847,-268.5 2720,-268.5"/>
-<text text-anchor="start" x="2728" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
-<text text-anchor="middle" x="2783.5" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/strided_slice.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="3039.5,-268.5 3039.5,-298.5 3166.5,-298.5 3166.5,-268.5 3039.5,-268.5"/>
+<text text-anchor="start" x="3047.5" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
+<text text-anchor="middle" x="3103" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/strided_slice.h</text>
</a>
</g>
</g>
<!-- Node83->Node84 -->
<g id="edge43" class="edge">
<title>Node83->Node84</title>
-<path fill="none" stroke="#191970" d="M2981.1074,-392.51C2977.3547,-374.21 2969.5554,-350.0401 2953.5,-335 2945.0602,-327.0939 2888.1459,-310.7717 2842.4608,-298.6103"/>
-<polygon fill="#191970" stroke="#191970" points="2977.6856,-393.2596 2982.881,-402.4931 2984.5777,-392.035 2977.6856,-393.2596"/>
+<path fill="none" stroke="#191970" d="M3274.4265,-392.7792C3278.8252,-375.0127 3281.0814,-351.3976 3269,-335 3262.3999,-326.042 3208.2977,-310.31 3163.6232,-298.5839"/>
+<polygon fill="#191970" stroke="#191970" points="3271.0408,-391.8898 3271.6813,-402.4653 3277.7756,-393.7986 3271.0408,-391.8898"/>
</g>
<!-- Node83->Node85 -->
<g id="edge45" class="edge">
<title>Node83->Node85</title>
-<path fill="none" stroke="#191970" d="M2910.7672,-407.6198C2815.2716,-394.8234 2649.9252,-372.667 2555.6894,-360.0394"/>
-<polygon fill="#191970" stroke="#191970" points="2910.5446,-411.1212 2920.9209,-408.9804 2911.4744,-404.1832 2910.5446,-411.1212"/>
+<path fill="none" stroke="#191970" d="M3192.3885,-403.6153C3188.88,-403.0484 3185.4034,-402.5064 3182,-402 3047.6353,-382.0071 2889.8764,-365.7169 2800.1488,-357.1221"/>
+<polygon fill="#191970" stroke="#191970" points="3191.8871,-407.0799 3202.3262,-405.2701 3193.0369,-400.175 3191.8871,-407.0799"/>
</g>
<!-- Node83->Node86 -->
<g id="edge46" class="edge">
<title>Node83->Node86</title>
-<path fill="none" stroke="#191970" d="M3058.3506,-401.0615C3111.3232,-389.2702 3181.5782,-373.6321 3229.8534,-362.8865"/>
-<polygon fill="#191970" stroke="#191970" points="3057.335,-397.7018 3048.3344,-403.291 3058.856,-404.5346 3057.335,-397.7018"/>
+<path fill="none" stroke="#191970" d="M3339.8506,-401.0615C3392.8232,-389.2702 3463.0782,-373.6321 3511.3534,-362.8865"/>
+<polygon fill="#191970" stroke="#191970" points="3338.835,-397.7018 3329.8344,-403.291 3340.356,-404.5346 3338.835,-397.7018"/>
</g>
<!-- Node83->Node87 -->
<g id="edge47" class="edge">
<title>Node83->Node87</title>
-<path fill="none" stroke="#191970" d="M2954.5068,-396.3469C2940.3712,-386.3776 2923.8305,-374.7121 2910.9717,-365.6432"/>
-<polygon fill="#191970" stroke="#191970" points="2952.7537,-399.3934 2962.943,-402.2967 2956.7882,-393.6729 2952.7537,-399.3934"/>
+<path fill="none" stroke="#191970" d="M3236.0068,-396.3469C3221.8712,-386.3776 3205.3305,-374.7121 3192.4717,-365.6432"/>
+<polygon fill="#191970" stroke="#191970" points="3234.2537,-399.3934 3244.443,-402.2967 3238.2882,-393.6729 3234.2537,-399.3934"/>
</g>
<!-- Node84->Node82 -->
<g id="edge44" class="edge">
<title>Node84->Node82</title>
-<path fill="none" stroke="#191970" d="M2736.3268,-264.6869C2704.8353,-252.1278 2664.7337,-236.1349 2639.5815,-226.1039"/>
-<polygon fill="#191970" stroke="#191970" points="2735.2124,-268.0105 2745.7975,-268.4639 2737.8055,-261.5085 2735.2124,-268.0105"/>
+<path fill="none" stroke="#191970" d="M3055.546,-264.6869C3023.867,-252.1278 2983.5267,-236.1349 2958.2248,-226.1039"/>
+<polygon fill="#191970" stroke="#191970" points="3054.4871,-268.032 3065.0731,-268.4639 3057.0669,-261.5248 3054.4871,-268.032"/>
</g>
<!-- Node88->Node60 -->
<g id="edge53" class="edge">
<title>Node88->Node60</title>
-<path fill="none" stroke="#191970" d="M817.5,-392.0249C817.5,-383.128 817.5,-373.4287 817.5,-365.6432"/>
-<polygon fill="#191970" stroke="#191970" points="814.0001,-392.2966 817.5,-402.2967 821.0001,-392.2967 814.0001,-392.2966"/>
+<path fill="none" stroke="#191970" d="M2263.9328,-398.4516C2239.8444,-388.0392 2210.425,-375.3224 2187.9367,-365.6017"/>
+<polygon fill="#191970" stroke="#191970" points="2262.6471,-401.7088 2273.215,-402.4639 2265.4246,-395.2834 2262.6471,-401.7088"/>
</g>
<!-- Node88->Node63 -->
<g id="edge54" class="edge">
<title>Node88->Node63</title>
-<path fill="none" stroke="#191970" d="M743.5731,-412.8355C624.7245,-405.0415 383.8777,-388.0309 180.5,-366 165.1104,-364.3329 148.5908,-362.2547 133.0358,-360.167"/>
-<polygon fill="#191970" stroke="#191970" points="743.4432,-416.3344 753.65,-413.4934 743.8993,-409.3492 743.4432,-416.3344"/>
+<path fill="none" stroke="#191970" d="M2320.6976,-393.1932C2325.5082,-383.9844 2330.8435,-373.771 2335.0894,-365.6432"/>
+<polygon fill="#191970" stroke="#191970" points="2317.47,-391.8126 2315.942,-402.2967 2323.6745,-395.0537 2317.47,-391.8126"/>
</g>
<!-- Node89->Node64 -->
<g id="edge56" class="edge">
<title>Node89->Node64</title>
-<path fill="none" stroke="#191970" d="M1332.3981,-411.0782C1484.1073,-397.8947 1828.1914,-367.9937 1968.3584,-355.8132"/>
-<polygon fill="#191970" stroke="#191970" points="1331.9095,-407.6074 1322.2501,-411.9601 1332.5156,-414.5811 1331.9095,-407.6074"/>
+<path fill="none" stroke="#191970" d="M1000.2276,-409.4695C911.733,-399.7237 758.9184,-382.5356 628,-366 611.203,-363.8785 593.0212,-361.4451 576.2532,-359.1448"/>
+<polygon fill="#191970" stroke="#191970" points="1000.1085,-412.9775 1010.4312,-410.5917 1000.8738,-406.0194 1000.1085,-412.9775"/>
</g>
<!-- Node89->Node65 -->
<g id="edge66" class="edge">
<title>Node89->Node65</title>
-<path fill="none" stroke="#191970" d="M1283.924,-395.8293C1294.6309,-386.6638 1307.2042,-375.8492 1318.5,-366 1334.1622,-352.3436 1334.3637,-343.0996 1353.5,-335 1360.5516,-332.0153 1802.54,-299.9206 1968.4681,-287.9593"/>
-<polygon fill="#191970" stroke="#191970" points="1281.4789,-393.3149 1276.1523,-402.4734 1286.0276,-398.6356 1281.4789,-393.3149"/>
+<path fill="none" stroke="#191970" d="M999.986,-411.7442C898.6166,-403.267 722.1884,-386.2227 661,-366 612.9004,-350.1031 562.7379,-317.7031 535.5375,-298.5779"/>
+<polygon fill="#191970" stroke="#191970" points="999.9019,-415.2491 1010.1568,-412.5871 1000.4801,-408.2731 999.9019,-415.2491"/>
</g>
<!-- Node89->Node66 -->
<g id="edge57" class="edge">
<title>Node89->Node66</title>
-<path fill="none" stroke="#191970" d="M1332.1313,-407.9578C1434.7907,-394.6538 1619.037,-370.7766 1714.4724,-358.4088"/>
-<polygon fill="#191970" stroke="#191970" points="1331.6206,-404.4947 1322.1534,-409.2509 1332.5203,-411.4366 1331.6206,-404.4947"/>
+<path fill="none" stroke="#191970" d="M1147.8455,-407.9297C1217.8628,-398.5124 1325.8939,-383.0925 1419,-366 1422.2264,-365.4077 1425.5298,-364.7758 1428.8613,-364.1185"/>
+<polygon fill="#191970" stroke="#191970" points="1147.2442,-404.4789 1137.7972,-409.2751 1148.1732,-411.417 1147.2442,-404.4789"/>
</g>
<!-- Node89->Node67 -->
<g id="edge67" class="edge">
<title>Node89->Node67</title>
-<path fill="none" stroke="#191970" d="M1269.8354,-393.2165C1280.1756,-374.0036 1297.3688,-348.2738 1320.5,-335 1380.6515,-300.4822 1561.7251,-307.927 1630.5,-299 1645.3398,-297.0738 1661.3055,-294.8604 1676.289,-292.7204"/>
-<polygon fill="#191970" stroke="#191970" points="1266.6245,-391.8083 1265.1858,-402.305 1272.8563,-394.9965 1266.6245,-391.8083"/>
+<path fill="none" stroke="#191970" d="M1074.3347,-392.1634C1076.0272,-373.4721 1081.5507,-348.9562 1098,-335 1125.4203,-311.7355 1352.4482,-294.429 1464.6784,-287.2073"/>
+<polygon fill="#191970" stroke="#191970" points="1070.8275,-392.1814 1073.7272,-402.3717 1077.8151,-392.5973 1070.8275,-392.1814"/>
</g>
<!-- Node89->Node68 -->
<g id="edge58" class="edge">
<title>Node89->Node68</title>
-<path fill="none" stroke="#191970" d="M1332.3082,-401.0709C1383.2981,-389.721 1450.3161,-374.8034 1498.3275,-364.1165"/>
-<polygon fill="#191970" stroke="#191970" points="1331.335,-397.7018 1322.3344,-403.291 1332.856,-404.5346 1331.335,-397.7018"/>
+<path fill="none" stroke="#191970" d="M1027.4298,-398.5897C1001.7245,-388.1518 970.2358,-375.3654 946.1907,-365.6017"/>
+<polygon fill="#191970" stroke="#191970" points="1026.3887,-401.9444 1036.9708,-402.4639 1029.0223,-395.4587 1026.3887,-401.9444"/>
</g>
<!-- Node89->Node69 -->
<g id="edge68" class="edge">
<title>Node89->Node69</title>
-<path fill="none" stroke="#191970" d="M1259.1266,-392.1379C1260.9902,-373.644 1266.6006,-349.3944 1282.5,-335 1313.3674,-307.0543 1425.6894,-293.5013 1497.4732,-287.5573"/>
-<polygon fill="#191970" stroke="#191970" points="1255.6262,-392.0219 1258.4129,-402.2436 1262.6088,-392.5151 1255.6262,-392.0219"/>
+<path fill="none" stroke="#191970" d="M1060.3392,-393.7157C1049.1384,-375.8158 1031.9214,-351.6095 1012,-335 993.5733,-319.6367 969.7898,-307.3605 949.7203,-298.6531"/>
+<polygon fill="#191970" stroke="#191970" points="1057.4478,-395.6975 1065.6305,-402.4277 1063.4308,-392.0637 1057.4478,-395.6975"/>
</g>
<!-- Node89->Node70 -->
<g id="edge59" class="edge">
<title>Node89->Node70</title>
-<path fill="none" stroke="#191970" d="M1184.4988,-414.2301C1035.2977,-407.3831 687.7634,-390.1487 396.5,-366 368.5572,-363.6833 337.6716,-360.4957 311.5313,-357.6134"/>
-<polygon fill="#191970" stroke="#191970" points="1184.7457,-417.745 1194.8951,-414.7051 1185.0653,-410.7523 1184.7457,-417.745"/>
+<path fill="none" stroke="#191970" d="M1000.1615,-403.0767C938.0423,-390.9427 850.2823,-373.8 792.0651,-362.4282"/>
+<polygon fill="#191970" stroke="#191970" points="999.8026,-406.5727 1010.2882,-405.0548 1001.1447,-399.7026 999.8026,-406.5727"/>
</g>
<!-- Node89->Node71 -->
<g id="edge69" class="edge">
<title>Node89->Node71</title>
-<path fill="none" stroke="#191970" d="M1231.8379,-395.8068C1220.804,-386.7124 1207.9388,-375.9562 1196.5,-366 1181.0752,-352.5744 1181.3834,-342.8479 1162.5,-335 1086.279,-303.3229 502.841,-304.7487 420.5,-299 384.7271,-296.5025 344.7921,-292.9032 312.6213,-289.8015"/>
-<polygon fill="#191970" stroke="#191970" points="1229.9149,-398.7565 1239.8647,-402.3968 1234.3567,-393.3462 1229.9149,-398.7565"/>
+<path fill="none" stroke="#191970" d="M1047.2637,-395.8915C1036.2182,-386.8102 1023.3644,-376.0411 1012,-366 996.9199,-352.676 996.8541,-344.2827 979,-335 947.2508,-318.493 855.6365,-302.1598 793.1067,-292.4509"/>
+<polygon fill="#191970" stroke="#191970" points="1045.3466,-398.8451 1055.3032,-402.4669 1049.7784,-393.4266 1045.3466,-398.8451"/>
</g>
<!-- Node89->Node90 -->
<g id="edge60" class="edge">
<title>Node89->Node90</title>
-<path fill="none" stroke="#191970" d="M1184.4755,-415.2587C1055.1392,-410.6444 779.8367,-397.7197 549.5,-366 546.832,-365.6326 544.1169,-365.228 541.3786,-364.7942"/>
-<polygon fill="#191970" stroke="#191970" points="1184.5419,-418.763 1194.6584,-415.6154 1184.7871,-411.7673 1184.5419,-418.763"/>
+<path fill="none" stroke="#191970" d="M1113.1736,-397.9132C1133.7099,-387.645 1158.4877,-375.2561 1177.5645,-365.7177"/>
+<polygon fill="#191970" stroke="#191970" points="1111.4512,-394.8612 1104.0722,-402.4639 1114.5818,-401.1222 1111.4512,-394.8612"/>
</g>
<!-- Node89->Node91 -->
<g id="edge61" class="edge">
<title>Node89->Node91</title>
-<path fill="none" stroke="#191970" d="M1247.2525,-393.0609C1236.9645,-373.7557 1219.7986,-347.9778 1196.5,-335 1153.5051,-311.051 835.6521,-293.325 694.1328,-286.5342"/>
-<polygon fill="#191970" stroke="#191970" points="1244.2376,-394.8548 1251.8744,-402.1985 1250.484,-391.6953 1244.2376,-394.8548"/>
+<path fill="none" stroke="#191970" d="M1083.6031,-393.3068C1092.6945,-373.9139 1108.3457,-347.8855 1131,-335 1145.855,-326.5507 1635.5733,-297.8008 1818.2199,-287.3432"/>
+<polygon fill="#191970" stroke="#191970" points="1080.3935,-391.9109 1079.5547,-402.4725 1086.7967,-394.7392 1080.3935,-391.9109"/>
</g>
<!-- Node89->Node92 -->
<g id="edge64" class="edge">
<title>Node89->Node92</title>
-<path fill="none" stroke="#191970" d="M1184.7108,-410.961C1084.32,-401.8793 899.137,-384.4691 741.5,-366 726.1932,-364.2066 709.7652,-362.1009 694.2532,-360.0298"/>
-<polygon fill="#191970" stroke="#191970" points="1184.4754,-414.4538 1194.7495,-411.8669 1185.1046,-407.4822 1184.4754,-414.4538"/>
+<path fill="none" stroke="#191970" d="M1147.7337,-410.2029C1280.0393,-397.1091 1555.8272,-369.8155 1683.4837,-357.1818"/>
+<polygon fill="#191970" stroke="#191970" points="1147.2972,-406.7289 1137.6905,-411.1968 1147.9866,-413.6948 1147.2972,-406.7289"/>
</g>
<!-- Node89->Node93 -->
<g id="edge70" class="edge">
<title>Node89->Node93</title>
-<path fill="none" stroke="#191970" d="M1184.6923,-401.2025C1139.3256,-391.1459 1080.0237,-377.9302 1027.5,-366 1024.8222,-365.3918 1022.0848,-364.7677 1019.32,-364.1356"/>
-<polygon fill="#191970" stroke="#191970" points="1184.2171,-404.682 1194.7375,-403.4281 1185.7314,-397.8477 1184.2171,-404.682"/>
+<path fill="none" stroke="#191970" d="M1146.234,-400.0911C1191.9373,-389.0763 1250.2706,-375.0175 1293.1482,-364.6837"/>
+<polygon fill="#191970" stroke="#191970" points="1145.2902,-396.7183 1136.3886,-402.4639 1146.9303,-403.5234 1145.2902,-396.7183"/>
</g>
<!-- Node89->Node94 -->
<g id="edge71" class="edge">
<title>Node89->Node94</title>
-<path fill="none" stroke="#191970" d="M1212.4942,-398.5897C1187.1006,-388.1518 1155.9935,-375.3654 1132.2399,-365.6017"/>
-<polygon fill="#191970" stroke="#191970" points="1211.3398,-401.8992 1221.9196,-402.4639 1214.0011,-395.4248 1211.3398,-401.8992"/>
+<path fill="none" stroke="#191970" d="M1148.0807,-416.2545C1283.5444,-413.1304 1579.9953,-402.3715 1827,-366 1829.9948,-365.559 1833.0532,-365.0563 1836.1341,-364.5082"/>
+<polygon fill="#191970" stroke="#191970" points="1147.7442,-412.761 1137.8249,-416.4834 1147.9005,-419.7593 1147.7442,-412.761"/>
</g>
<!-- Node89->Node95 -->
<g id="edge72" class="edge">
<title>Node89->Node95</title>
-<path fill="none" stroke="#191970" d="M1304.5058,-398.5897C1329.8994,-388.1518 1361.0065,-375.3654 1384.7601,-365.6017"/>
-<polygon fill="#191970" stroke="#191970" points="1302.9989,-395.4248 1295.0804,-402.4639 1305.6602,-401.8992 1302.9989,-395.4248"/>
+<path fill="none" stroke="#191970" d="M1000.2619,-414.8677C882.3655,-409.896 644.3952,-396.728 445,-366 441.8886,-365.5205 438.7077,-364.9785 435.5036,-364.3917"/>
+<polygon fill="#191970" stroke="#191970" points="1000.1242,-418.3649 1010.2605,-415.2818 1000.4139,-411.3709 1000.1242,-418.3649"/>
</g>
<!-- Node91->Node61 -->
<g id="edge62" class="edge">
<title>Node91->Node61</title>
-<path fill="none" stroke="#191970" d="M656.1775,-262.3469C670.1643,-252.3776 686.5308,-240.7121 699.2544,-231.6432"/>
-<polygon fill="#191970" stroke="#191970" points="653.9418,-259.6423 647.8301,-268.2967 658.0047,-265.3426 653.9418,-259.6423"/>
+<path fill="none" stroke="#191970" d="M1936.0642,-264.865C1964.24,-254.3773 1998.9663,-241.4515 2025.4283,-231.6017"/>
+<polygon fill="#191970" stroke="#191970" points="1934.5464,-261.6953 1926.3955,-268.4639 1936.9883,-268.2556 1934.5464,-261.6953"/>
</g>
<!-- Node91->Node62 -->
<g id="edge63" class="edge">
<title>Node91->Node62</title>
-<path fill="none" stroke="#191970" d="M632.7333,-258.3924C637.0943,-241.8753 643.3952,-219.8986 650.5,-201 655.2208,-188.4428 661.6992,-174.6929 666.7354,-164.5616"/>
-<polygon fill="#191970" stroke="#191970" points="629.2681,-257.8135 630.1586,-268.3708 636.0461,-259.5624 629.2681,-257.8135"/>
+<path fill="none" stroke="#191970" d="M1910.9398,-261.7034C1932.4035,-243.7262 1964.7627,-218.4333 1996,-201 2022.5672,-186.173 2054.2771,-173.5871 2079.8441,-164.5692"/>
+<polygon fill="#191970" stroke="#191970" points="1908.6368,-259.0673 1903.2721,-268.2035 1913.1634,-264.4069 1908.6368,-259.0673"/>
</g>
<!-- Node92->Node91 -->
<g id="edge65" class="edge">
<title>Node92->Node91</title>
-<path fill="none" stroke="#191970" d="M626.5,-325.0249C626.5,-316.128 626.5,-306.4287 626.5,-298.6432"/>
-<polygon fill="#191970" stroke="#191970" points="623.0001,-325.2966 626.5,-335.2967 630.0001,-325.2967 623.0001,-325.2966"/>
+<path fill="none" stroke="#191970" d="M1790.466,-330.9132C1811.1555,-320.645 1836.1182,-308.2561 1855.3374,-298.7177"/>
+<polygon fill="#191970" stroke="#191970" points="1788.6982,-327.8831 1781.2966,-335.4639 1791.8101,-334.1534 1788.6982,-327.8831"/>
</g>
<!-- Node96->Node77 -->
<g id="edge74" class="edge">
<title>Node96->Node77</title>
-<path fill="none" stroke="#191970" d="M2157.0362,-193.1975C2171.335,-174.6598 2193.5891,-149.3639 2218.5,-134 2306.5823,-79.6748 2339.8985,-86.669 2441.5,-67 2626.6058,-31.1654 2850.6588,-20.14 2952.8478,-16.8476"/>
-<polygon fill="#191970" stroke="#191970" points="2154.0618,-191.33 2150.8623,-201.4302 2159.662,-195.5298 2154.0618,-191.33"/>
+<path fill="none" stroke="#191970" d="M2430.815,-191.8191C2444.6665,-157.1083 2475.1071,-94.9882 2524,-67 2584.5707,-32.327 3062.5678,-19.8015 3227.3727,-16.4884"/>
+<polygon fill="#191970" stroke="#191970" points="2427.5156,-190.6478 2427.1961,-201.2379 2434.0499,-193.1585 2427.5156,-190.6478"/>
</g>
<!-- Node97->Node76 -->
<g id="edge77" class="edge">
<title>Node97->Node76</title>
-<path fill="none" stroke="#191970" d="M2633.1516,-392.8514C2647.5331,-361.4569 2675.4445,-306.6487 2710.5,-268 2764.3332,-208.6489 2854.8437,-174.3611 2904.4212,-159.1044"/>
-<polygon fill="#191970" stroke="#191970" points="2629.87,-391.6153 2628.9657,-402.1715 2636.2555,-394.4832 2629.87,-391.6153"/>
+<path fill="none" stroke="#191970" d="M2888.2206,-394.9396C2918.7784,-364.467 2976.128,-309.2277 3030,-268 3088.9878,-222.8573 3165.3882,-178.0818 3198.9348,-159.0416"/>
+<polygon fill="#191970" stroke="#191970" points="2885.6324,-392.5783 2881.0473,-402.1296 2890.5879,-397.5223 2885.6324,-392.5783"/>
</g>
<!-- Node97->Node82 -->
<g id="edge78" class="edge">
<title>Node97->Node82</title>
-<path fill="none" stroke="#191970" d="M2614.7439,-392.2943C2606.5807,-362.5885 2595.662,-311.7303 2601.5,-268 2603.4862,-253.122 2608.565,-236.4507 2612.0269,-226.2137"/>
-<polygon fill="#191970" stroke="#191970" points="2611.4714,-393.58 2617.5907,-402.2291 2618.2006,-391.6518 2611.4714,-393.58"/>
+<path fill="none" stroke="#191970" d="M2855.5596,-392.7211C2844.1642,-361.5525 2830.244,-307.3548 2852,-268 2863.3053,-247.5497 2886.5086,-234.1241 2905.3564,-226.124"/>
+<polygon fill="#191970" stroke="#191970" points="2852.41,-394.2815 2859.2721,-402.3538 2858.9416,-391.7641 2852.41,-394.2815"/>
</g>
<!-- Node97->Node85 -->
<g id="edge76" class="edge">
<title>Node97->Node85</title>
-<path fill="none" stroke="#191970" d="M2582.4064,-398.0343C2556.7503,-385.5781 2524.5857,-369.9619 2504.2812,-360.1039"/>
-<polygon fill="#191970" stroke="#191970" points="2581.0056,-401.2448 2591.5301,-402.4639 2584.0629,-394.9478 2581.0056,-401.2448"/>
+<path fill="none" stroke="#191970" d="M2825.9064,-398.0343C2800.2503,-385.5781 2768.0857,-369.9619 2747.7812,-360.1039"/>
+<polygon fill="#191970" stroke="#191970" points="2824.5056,-401.2448 2835.0301,-402.4639 2827.5629,-394.9478 2824.5056,-401.2448"/>
</g>
<!-- Node98->Node82 -->
<g id="edge81" class="edge">
<title>Node98->Node82</title>
-<path fill="none" stroke="#191970" d="M2417.3878,-394.5621C2404.4235,-377.5706 2392.269,-354.144 2403.5,-335 2440.3043,-272.2641 2523.7665,-240.2704 2574.2969,-226.0738"/>
-<polygon fill="#191970" stroke="#191970" points="2414.691,-396.7933 2423.7183,-402.3394 2420.12,-392.3743 2414.691,-396.7933"/>
+<path fill="none" stroke="#191970" d="M2660.4752,-394.4627C2647.0891,-377.2374 2634.5842,-353.5588 2647,-335 2676.2248,-291.3154 2827.4493,-245.426 2897.8579,-226.0372"/>
+<polygon fill="#191970" stroke="#191970" points="2657.9359,-396.8838 2667.012,-402.3496 2663.3254,-392.4169 2657.9359,-396.8838"/>
</g>
<!-- Node98->Node85 -->
<g id="edge80" class="edge">
<title>Node98->Node85</title>
-<path fill="none" stroke="#191970" d="M2454.7423,-393.8428C2462.7308,-382.2073 2471.9129,-368.8334 2477.9399,-360.055"/>
-<polygon fill="#191970" stroke="#191970" points="2451.7128,-392.0716 2448.9381,-402.2967 2457.4836,-396.0337 2451.7128,-392.0716"/>
+<path fill="none" stroke="#191970" d="M2698.2423,-393.8428C2706.2308,-382.2073 2715.4129,-368.8334 2721.4399,-360.055"/>
+<polygon fill="#191970" stroke="#191970" points="2695.2128,-392.0716 2692.4381,-402.2967 2700.9836,-396.0337 2695.2128,-392.0716"/>
</g>
<!-- Node99->Node61 -->
<g id="edge89" class="edge">
<title>Node99->Node61</title>
-<path fill="none" stroke="#191970" d="M547.9881,-265.6842C585.2636,-255.0568 632.0785,-241.7095 667.5314,-231.6017"/>
-<polygon fill="#191970" stroke="#191970" points="546.8957,-262.3561 538.2386,-268.4639 548.815,-269.0879 546.8957,-262.3561"/>
+<path fill="none" stroke="#191970" d="M2145.7253,-263.3561C2128.4439,-253.1995 2107.83,-241.0843 2091.8929,-231.7177"/>
+<polygon fill="#191970" stroke="#191970" points="2144.0214,-266.4144 2154.4162,-268.4639 2147.5683,-260.3795 2144.0214,-266.4144"/>
</g>
<!-- Node99->Node62 -->
<g id="edge90" class="edge">
<title>Node99->Node62</title>
-<path fill="none" stroke="#191970" d="M515.018,-262.5719C553.39,-235.3664 619.4778,-188.5105 653.2646,-164.5558"/>
-<polygon fill="#191970" stroke="#191970" points="512.9467,-259.75 506.8133,-268.389 516.9954,-265.4604 512.9467,-259.75"/>
+<path fill="none" stroke="#191970" d="M2170.3369,-259.0686C2159.4645,-231.58 2142.1379,-187.7732 2133.0362,-164.7614"/>
+<polygon fill="#191970" stroke="#191970" points="2167.0905,-260.3772 2174.0233,-268.389 2173.5999,-257.8026 2167.0905,-260.3772"/>
</g>
</g>
</svg>
diff --git a/docs/reference/api/doxygen/array_8h__dep__incl.svg b/docs/reference/api/doxygen/array_8h__dep__incl.svg
index e791e5ce81..cf7a47a605 100644
--- a/docs/reference/api/doxygen/array_8h__dep__incl.svg
+++ b/docs/reference/api/doxygen/array_8h__dep__incl.svg
@@ -189,9 +189,9 @@
<path fill="none" stroke="#191970" d="M1922.9813,-797.1864C1823.2369,-778.64 1649,-737.9491 1649,-680 1649,-680 1649,-680 1649,-484.5 1649,-440.2885 1555.5645,-337.5363 1518.7013,-298.7178"/>
<polygon fill="#191970" stroke="#191970" points="1922.5174,-800.6595 1932.9844,-799.0187 1923.7787,-793.7741 1922.5174,-800.6595"/>
</g>
-<!-- Node145 -->
+<!-- Node146 -->
<g id="node32" class="node">
-<title>Node145</title>
+<title>Node146</title>
<g id="a_node32"><a xlink:href="meta__schedule_2cost__model_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/cost_model.h">
<polygon fill="#ffffff" stroke="#000000" points="2535,-268.5 2535,-298.5 2687,-298.5 2687,-268.5 2535,-268.5"/>
<text text-anchor="start" x="2543" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
@@ -199,15 +199,15 @@
</a>
</g>
</g>
-<!-- Node20->Node145 -->
+<!-- Node20->Node146 -->
<g id="edge105" class="edge">
-<title>Node20->Node145</title>
+<title>Node20->Node146</title>
<path fill="none" stroke="#191970" d="M2059.2745,-805.014C2191.5045,-797.8435 2477.9694,-780.1516 2574,-757 2662.5776,-735.6452 2764,-771.1154 2764,-680 2764,-680 2764,-680 2764,-417.5 2764,-379.8735 2769.6718,-363.4087 2745,-335 2729.9653,-317.6882 2708.4307,-306.168 2687.0097,-298.5125"/>
<polygon fill="#191970" stroke="#191970" points="2059.0575,-801.5205 2049.2601,-805.5529 2059.4337,-808.5104 2059.0575,-801.5205"/>
</g>
-<!-- Node146 -->
+<!-- Node147 -->
<g id="node33" class="node">
-<title>Node146</title>
+<title>Node147</title>
<g id="a_node33"><a xlink:href="measure__candidate_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/measure_candidate.h">
<polygon fill="#ffffff" stroke="#000000" points="2584,-335.5 2584,-365.5 2736,-365.5 2736,-335.5 2584,-335.5"/>
<text text-anchor="start" x="2592" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
@@ -215,15 +215,15 @@
</a>
</g>
</g>
-<!-- Node20->Node146 -->
+<!-- Node20->Node147 -->
<g id="edge110" class="edge">
-<title>Node20->Node146</title>
+<title>Node20->Node147</title>
<path fill="none" stroke="#191970" d="M2059.5575,-804.5949C2195.6124,-796.465 2490.2624,-776.824 2532,-757 2575.434,-736.3703 2608,-728.0843 2608,-680 2608,-680 2608,-680 2608,-484.5 2608,-438.8339 2634.5721,-389.9818 2649.7265,-365.8342"/>
<polygon fill="#191970" stroke="#191970" points="2059.0291,-801.12 2049.2542,-805.2066 2059.4441,-808.1077 2059.0291,-801.12"/>
</g>
-<!-- Node147 -->
+<!-- Node148 -->
<g id="node34" class="node">
-<title>Node147</title>
+<title>Node148</title>
<g id="a_node34"><a xlink:href="feature__extractor_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/feature_extractor.h">
<polygon fill="#ffffff" stroke="#000000" points="2895,-268.5 2895,-298.5 3047,-298.5 3047,-268.5 2895,-268.5"/>
<text text-anchor="start" x="2903" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
@@ -231,15 +231,15 @@
</a>
</g>
</g>
-<!-- Node20->Node147 -->
+<!-- Node20->Node148 -->
<g id="edge108" class="edge">
-<title>Node20->Node147</title>
+<title>Node20->Node148</title>
<path fill="none" stroke="#191970" d="M2059.2922,-806.938C2229.6646,-802.6055 2670.0215,-788.6108 2813,-757 2904.7895,-736.7065 3010,-774.006 3010,-680 3010,-680 3010,-680 3010,-417.5 3010,-372.9183 2989.8525,-323.1982 2978.5196,-298.7571"/>
<polygon fill="#191970" stroke="#191970" points="2059.1027,-803.4416 2049.1937,-807.1915 2059.2784,-810.4394 2059.1027,-803.4416"/>
</g>
-<!-- Node148 -->
+<!-- Node149 -->
<g id="node35" class="node">
-<title>Node148</title>
+<title>Node149</title>
<g id="a_node35"><a xlink:href="runner_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/runner.h">
<polygon fill="#ffffff" stroke="#000000" points="2792,-335.5 2792,-365.5 2944,-365.5 2944,-335.5 2792,-335.5"/>
<text text-anchor="start" x="2800" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
@@ -247,15 +247,15 @@
</a>
</g>
</g>
-<!-- Node20->Node148 -->
+<!-- Node20->Node149 -->
<g id="edge112" class="edge">
-<title>Node20->Node148</title>
+<title>Node20->Node149</title>
<path fill="none" stroke="#191970" d="M2059.3517,-807.2975C2239.1476,-803.6763 2715.766,-790.9543 2778,-757 2815.3774,-736.6072 2835,-722.5786 2835,-680 2835,-680 2835,-680 2835,-484.5 2835,-440.4155 2852.0479,-390.4779 2861.6373,-365.8751"/>
<polygon fill="#191970" stroke="#191970" points="2059.1502,-803.8007 2049.2216,-807.4981 2059.2889,-810.7993 2059.1502,-803.8007"/>
</g>
-<!-- Node149 -->
+<!-- Node150 -->
<g id="node36" class="node">
-<title>Node149</title>
+<title>Node150</title>
<g id="a_node36"><a xlink:href="space__generator_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/space_generator.h">
<polygon fill="#ffffff" stroke="#000000" points="1750,-335.5 1750,-365.5 1902,-365.5 1902,-335.5 1750,-335.5"/>
<text text-anchor="start" x="1758" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
@@ -263,45 +263,45 @@
</a>
</g>
</g>
-<!-- Node20->Node149 -->
+<!-- Node20->Node150 -->
<g id="edge116" class="edge">
-<title>Node20->Node149</title>
+<title>Node20->Node150</title>
<path fill="none" stroke="#191970" d="M1954.7584,-787.8593C1921.3792,-765.8519 1877,-727.635 1877,-680 1877,-680 1877,-680 1877,-484.5 1877,-438.9479 1850.9389,-390.0469 1836.076,-365.862"/>
<polygon fill="#191970" stroke="#191970" points="1953.0891,-790.9458 1963.4038,-793.366 1956.8497,-785.0417 1953.0891,-790.9458"/>
</g>
-<!-- Node155 -->
+<!-- Node156 -->
<g id="node38" class="node">
-<title>Node155</title>
+<title>Node156</title>
<g id="a_node38"><a xlink:href="ir_2function_8h.html" target="_top" xlink:title="Function nodes. ">
<polygon fill="#ffffff" stroke="#000000" points="2019,-670.5 2019,-689.5 2155,-689.5 2155,-670.5 2019,-670.5"/>
<text text-anchor="middle" x="2087" y="-677.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/ir/function.h</text>
</a>
</g>
</g>
-<!-- Node20->Node155 -->
+<!-- Node20->Node156 -->
<g id="edge85" class="edge">
-<title>Node20->Node155</title>
+<title>Node20->Node156</title>
<path fill="none" stroke="#191970" d="M2008.5165,-785.0535C2029.788,-756.5807 2064.6846,-709.8701 2079.7266,-689.7358"/>
<polygon fill="#191970" stroke="#191970" points="2005.5706,-783.1488 2002.3894,-793.2548 2011.1784,-787.3383 2005.5706,-783.1488"/>
</g>
-<!-- Node162 -->
+<!-- Node163 -->
<g id="node43" class="node">
-<title>Node162</title>
+<title>Node163</title>
<g id="a_node43"><a xlink:href="ir_2type_8h.html" target="_top" xlink:title="IR/AST nodes for the unified type system in TVM. ">
<polygon fill="#ffffff" stroke="#ff0000" points="355,-732 355,-751 473,-751 473,-732 355,-732"/>
<text text-anchor="middle" x="414" y="-739" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/ir/type.h</text>
</a>
</g>
</g>
-<!-- Node20->Node162 -->
+<!-- Node20->Node163 -->
<g id="edge100" class="edge">
-<title>Node20->Node162</title>
+<title>Node20->Node163</title>
<path fill="none" stroke="#191970" d="M1922.404,-807.6427C1667.3447,-804.2221 773.8667,-790.0987 491,-757 479.2169,-755.6212 466.5496,-753.3895 454.96,-751.0437"/>
<polygon fill="#191970" stroke="#191970" points="1922.5934,-811.1455 1932.6391,-807.7788 1922.6865,-804.1461 1922.5934,-811.1455"/>
</g>
-<!-- Node154 -->
+<!-- Node155 -->
<g id="node44" class="node">
-<title>Node154</title>
+<title>Node155</title>
<g id="a_node44"><a xlink:href="schedule__rule_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/schedule_rule.h">
<polygon fill="#ffffff" stroke="#000000" points="1958,-335.5 1958,-365.5 2110,-365.5 2110,-335.5 1958,-335.5"/>
<text text-anchor="start" x="1966" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
@@ -309,15 +309,15 @@
</a>
</g>
</g>
-<!-- Node20->Node154 -->
+<!-- Node20->Node155 -->
<g id="edge113" class="edge">
-<title>Node20->Node154</title>
+<title>Node20->Node155</title>
<path fill="none" stroke="#191970" d="M1991,-783.3849C1991,-757.4823 1991,-715.9175 1991,-680 1991,-680 1991,-680 1991,-484.5 1991,-439.5445 2013.2139,-389.9879 2025.7092,-365.6684"/>
<polygon fill="#191970" stroke="#191970" points="1987.5001,-783.4649 1991,-793.4649 1994.5001,-783.465 1987.5001,-783.4649"/>
</g>
-<!-- Node201 -->
+<!-- Node202 -->
<g id="node45" class="node">
-<title>Node201</title>
+<title>Node202</title>
<g id="a_node45"><a xlink:href="structural__equal_8h.html" target="_top" xlink:title="Structural equality comparison. ">
<polygon fill="#ffffff" stroke="#ff0000" points="3155.5,-726.5 3155.5,-756.5 3306.5,-756.5 3306.5,-726.5 3155.5,-726.5"/>
<text text-anchor="start" x="3163.5" y="-744.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/node/structural</text>
@@ -325,15 +325,15 @@
</a>
</g>
</g>
-<!-- Node20->Node201 -->
+<!-- Node20->Node202 -->
<g id="edge119" class="edge">
-<title>Node20->Node201</title>
+<title>Node20->Node202</title>
<path fill="none" stroke="#191970" d="M2059.4914,-806.5657C2273.7238,-800.3368 2929.3774,-779.8698 3141,-757 3145.729,-756.4889 3150.5856,-755.8848 3155.4749,-755.2164"/>
<polygon fill="#191970" stroke="#191970" points="2059.2193,-803.0721 2049.3249,-806.8603 2059.4221,-810.0691 2059.2193,-803.0721"/>
</g>
-<!-- Node213 -->
+<!-- Node214 -->
<g id="node46" class="node">
-<title>Node213</title>
+<title>Node214</title>
<g id="a_node46"><a xlink:href="papi_8h.html" target="_top" xlink:title="include/tvm/runtime\l/contrib/papi.h">
<polygon fill="#ffffff" stroke="#000000" points="3325,-726.5 3325,-756.5 3441,-756.5 3441,-726.5 3325,-726.5"/>
<text text-anchor="start" x="3333" y="-744.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -341,15 +341,15 @@
</a>
</g>
</g>
-<!-- Node20->Node213 -->
+<!-- Node20->Node214 -->
<g id="edge120" class="edge">
-<title>Node20->Node213</title>
+<title>Node20->Node214</title>
<path fill="none" stroke="#191970" d="M2059.3601,-807.5454C2294.5482,-803.9949 3069.3896,-790.0181 3316,-757 3318.8401,-756.6198 3321.7365,-756.1746 3324.6537,-755.68"/>
<polygon fill="#191970" stroke="#191970" points="2059.1789,-804.0476 2049.2324,-807.6969 2059.2836,-811.0469 2059.1789,-804.0476"/>
</g>
-<!-- Node214 -->
+<!-- Node215 -->
<g id="node47" class="node">
-<title>Node214</title>
+<title>Node215</title>
<g id="a_node47"><a xlink:href="packed__func_8h.html" target="_top" xlink:title="Type-erased function used across TVM API. ">
<polygon fill="#ffffff" stroke="#ff0000" points="2180,-402.5 2180,-432.5 2296,-432.5 2296,-402.5 2180,-402.5"/>
<text text-anchor="start" x="2188" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -357,45 +357,45 @@
</a>
</g>
</g>
-<!-- Node20->Node214 -->
+<!-- Node20->Node215 -->
<g id="edge121" class="edge">
-<title>Node20->Node214</title>
+<title>Node20->Node215</title>
<path fill="none" stroke="#191970" d="M2059.1271,-801.5331C2131.01,-789.9744 2234,-759.5356 2234,-680 2234,-680 2234,-680 2234,-551.5 2234,-508.4426 2236.0889,-457.6375 2237.2476,-432.7685"/>
<polygon fill="#191970" stroke="#191970" points="2058.5647,-798.0783 2049.2046,-803.0419 2059.6171,-804.9988 2058.5647,-798.0783"/>
</g>
-<!-- Node191 -->
+<!-- Node192 -->
<g id="node48" class="node">
-<title>Node191</title>
+<title>Node192</title>
<g id="a_node48"><a xlink:href="buffer_8h.html" target="_top" xlink:title="Symbolic n-dimensional array, to represent a memory buffer. ">
<polygon fill="#ffffff" stroke="#ff0000" points="1493,-609 1493,-628 1621,-628 1621,-609 1493,-609"/>
<text text-anchor="middle" x="1557" y="-616" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/buffer.h</text>
</a>
</g>
</g>
-<!-- Node20->Node191 -->
+<!-- Node20->Node192 -->
<g id="edge137" class="edge">
-<title>Node20->Node191</title>
+<title>Node20->Node192</title>
<path fill="none" stroke="#191970" d="M1922.7702,-805.7139C1832.7867,-800.8569 1680.51,-788.1872 1635,-757 1587.8787,-724.7086 1565.9631,-654.0839 1559.2999,-628.2309"/>
<polygon fill="#191970" stroke="#191970" points="1922.7181,-809.2158 1932.8868,-806.2412 1923.0826,-802.2253 1922.7181,-809.2158"/>
</g>
-<!-- Node192 -->
+<!-- Node193 -->
<g id="node49" class="node">
-<title>Node192</title>
+<title>Node193</title>
<g id="a_node49"><a xlink:href="tir_2expr_8h.html" target="_top" xlink:title="TIR expressions. ">
<polygon fill="#ffffff" stroke="#ff0000" points="1404.5,-542 1404.5,-561 1525.5,-561 1525.5,-542 1404.5,-542"/>
<text text-anchor="middle" x="1465" y="-549" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/expr.h</text>
</a>
</g>
</g>
-<!-- Node20->Node192 -->
+<!-- Node20->Node193 -->
<g id="edge143" class="edge">
-<title>Node20->Node192</title>
+<title>Node20->Node193</title>
<path fill="none" stroke="#191970" d="M1922.4931,-804.9924C1825.7704,-799.1139 1654.8435,-785.0534 1600,-757 1533.1014,-722.7802 1519.4461,-700.257 1484,-634 1471.1042,-609.8947 1466.9176,-577.2662 1465.5923,-561.2249"/>
<polygon fill="#191970" stroke="#191970" points="1922.5712,-808.5031 1932.7612,-805.6027 1922.9865,-801.5155 1922.5712,-808.5031"/>
</g>
-<!-- Node197 -->
+<!-- Node198 -->
<g id="node50" class="node">
-<title>Node197</title>
+<title>Node198</title>
<g id="a_node50"><a xlink:href="index__map_8h.html" target="_top" xlink:title="Defines a remapping of buffer indices. ">
<polygon fill="#ffffff" stroke="#ff0000" points="3459,-726.5 3459,-756.5 3577,-756.5 3577,-726.5 3459,-726.5"/>
<text text-anchor="start" x="3467" y="-744.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/index</text>
@@ -403,9 +403,9 @@
</a>
</g>
</g>
-<!-- Node20->Node197 -->
+<!-- Node20->Node198 -->
<g id="edge144" class="edge">
-<title>Node20->Node197</title>
+<title>Node20->Node198</title>
<path fill="none" stroke="#191970" d="M2059.4385,-807.7985C2309.857,-804.9647 3175.925,-792.7476 3450,-757 3452.8816,-756.6242 3455.8206,-756.1825 3458.7808,-755.6908"/>
<polygon fill="#191970" stroke="#191970" points="2059.3433,-804.2992 2049.3831,-807.911 2059.4217,-811.2988 2059.3433,-804.2992"/>
</g>
@@ -636,24 +636,24 @@
<path fill="none" stroke="#191970" d="M976.8853,-614.5319C855.7583,-607.5829 631.8619,-591.5712 605,-567 586.3325,-549.9244 585.1317,-518.2514 586.3154,-499.5084"/>
<polygon fill="#191970" stroke="#191970" points="977.0251,-618.0453 987.207,-615.1163 977.4209,-611.0565 977.0251,-618.0453"/>
</g>
-<!-- Node142 -->
+<!-- Node143 -->
<g id="node29" class="node">
-<title>Node142</title>
+<title>Node143</title>
<g id="a_node29"><a xlink:href="error_8h.html" target="_top" xlink:title="Utilities for error tracking and reporting. ">
<polygon fill="#ffffff" stroke="#ff0000" points="849.5,-542 849.5,-561 968.5,-561 968.5,-542 849.5,-542"/>
<text text-anchor="middle" x="909" y="-549" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/ir/error.h</text>
</a>
</g>
</g>
-<!-- Node22->Node142 -->
+<!-- Node22->Node143 -->
<g id="edge45" class="edge">
-<title>Node22->Node142</title>
+<title>Node22->Node143</title>
<path fill="none" stroke="#191970" d="M1023.8577,-604.5722C995.6873,-591.5555 954.3763,-572.467 929.7364,-561.0817"/>
<polygon fill="#191970" stroke="#191970" points="1022.6792,-607.8831 1033.225,-608.9005 1025.6154,-601.5287 1022.6792,-607.8831"/>
</g>
-<!-- Node143 -->
+<!-- Node144 -->
<g id="node30" class="node">
-<title>Node143</title>
+<title>Node144</title>
<g id="a_node30"><a xlink:href="global__var__supply_8h.html" target="_top" xlink:title="GlobalVarSupply that can be used to generate unique. ">
<polygon fill="#ffffff" stroke="#000000" points="965.5,-469.5 965.5,-499.5 1082.5,-499.5 1082.5,-469.5 965.5,-469.5"/>
<text text-anchor="start" x="973.5" y="-487.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/ir/global</text>
@@ -661,15 +661,15 @@
</a>
</g>
</g>
-<!-- Node22->Node143 -->
+<!-- Node22->Node144 -->
<g id="edge47" class="edge">
-<title>Node22->Node143</title>
+<title>Node22->Node144</title>
<path fill="none" stroke="#191970" d="M1054.9467,-598.3197C1055.2287,-581.4017 1054.4367,-556.7517 1049,-536 1045.6248,-523.1169 1038.9333,-509.6104 1033.3221,-499.663"/>
<polygon fill="#191970" stroke="#191970" points="1051.4379,-598.5496 1054.6238,-608.6541 1058.4345,-598.7683 1051.4379,-598.5496"/>
</g>
-<!-- Node144 -->
+<!-- Node145 -->
<g id="node31" class="node">
-<title>Node144</title>
+<title>Node145</title>
<g id="a_node31"><a xlink:href="arg__info_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/arg_info.h">
<polygon fill="#ffffff" stroke="#000000" points="2338,-402.5 2338,-432.5 2490,-432.5 2490,-402.5 2338,-402.5"/>
<text text-anchor="start" x="2346" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
@@ -677,21 +677,21 @@
</a>
</g>
</g>
-<!-- Node22->Node144 -->
+<!-- Node22->Node145 -->
<g id="edge51" class="edge">
-<title>Node22->Node144</title>
+<title>Node22->Node145</title>
<path fill="none" stroke="#191970" d="M1130.8889,-614.809C1284.4531,-607.0935 1624.2162,-588.2062 1739,-567 1888.6136,-539.359 1918.4832,-502.0302 2067,-469 2080.0977,-466.0871 2239.6385,-442.7741 2337.7688,-428.5317"/>
<polygon fill="#191970" stroke="#191970" points="1130.4162,-611.3282 1120.6034,-615.323 1130.7657,-618.3194 1130.4162,-611.3282"/>
</g>
-<!-- Node22->Node149 -->
+<!-- Node22->Node150 -->
<g id="edge72" class="edge">
-<title>Node22->Node149</title>
+<title>Node22->Node150</title>
<path fill="none" stroke="#191970" d="M1130.907,-615.1091C1291.0369,-607.6546 1649.6718,-588.761 1701,-567 1706.1822,-564.803 1773.9036,-504.7005 1777,-500 1805.8484,-456.2061 1818.9867,-394.0093 1823.7528,-365.648"/>
<polygon fill="#191970" stroke="#191970" points="1130.4795,-611.6251 1120.652,-615.5835 1130.803,-618.6176 1130.4795,-611.6251"/>
</g>
-<!-- Node150 -->
+<!-- Node151 -->
<g id="node37" class="node">
-<title>Node150</title>
+<title>Node151</title>
<g id="a_node37"><a xlink:href="state_8h.html" target="_top" xlink:title="This file defines ScheduleState, the core data structure of TensorIR scheduling. ">
<polygon fill="#ffffff" stroke="#ff0000" points="1411,-402.5 1411,-432.5 1545,-432.5 1545,-402.5 1411,-402.5"/>
<text text-anchor="start" x="1419" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/schedule</text>
@@ -699,9 +699,9 @@
</a>
</g>
</g>
-<!-- Node22->Node150 -->
+<!-- Node22->Node151 -->
<g id="edge83" class="edge">
-<title>Node22->Node150</title>
+<title>Node22->Node151</title>
<path fill="none" stroke="#191970" d="M1130.9831,-607.1693C1172.5302,-599.2754 1223.9687,-586.5702 1267,-567 1346.8951,-530.6645 1427.817,-462.6382 1461.603,-432.5145"/>
<polygon fill="#191970" stroke="#191970" points="1130.1178,-603.7698 1120.9181,-609.0249 1131.387,-610.6537 1130.1178,-603.7698"/>
</g>
@@ -983,135 +983,135 @@
<path fill="none" stroke="#191970" d="M249.3004,-131.8486C239.421,-119.1194 226.4626,-102.4229 218.4365,-92.0817"/>
<polygon fill="#191970" stroke="#191970" points="246.6534,-134.1466 255.5497,-139.9005 252.1834,-129.8547 246.6534,-134.1466"/>
</g>
-<!-- Node142->Node26 -->
+<!-- Node143->Node26 -->
<g id="edge46" class="edge">
-<title>Node142->Node26</title>
+<title>Node143->Node26</title>
<path fill="none" stroke="#191970" d="M891.2042,-534.94C877.393,-522.0879 858.7303,-504.7212 847.2967,-494.0817"/>
<polygon fill="#191970" stroke="#191970" points="888.9791,-537.6504 898.6842,-541.9005 893.7478,-532.5259 888.9791,-537.6504"/>
</g>
-<!-- Node143->Node23 -->
+<!-- Node144->Node23 -->
<g id="edge48" class="edge">
-<title>Node143->Node23</title>
+<title>Node144->Node23</title>
<path fill="none" stroke="#191970" d="M991.7159,-463.2983C981.7832,-455.0691 971.9167,-444.7725 966,-433 943.6544,-388.5386 947.4161,-327.2019 950.7669,-298.9207"/>
<polygon fill="#191970" stroke="#191970" points="989.5784,-466.0698 999.6218,-469.4433 993.8743,-460.5429 989.5784,-466.0698"/>
</g>
-<!-- Node144->Node49 -->
+<!-- Node145->Node49 -->
<g id="edge55" class="edge">
-<title>Node144->Node49</title>
+<title>Node145->Node49</title>
<path fill="none" stroke="#191970" d="M2381.0107,-397.3728C2348.3843,-378.382 2296.4512,-350.5681 2248,-335 2167.5978,-309.1656 2071.4079,-295.89 2006.028,-289.346"/>
<polygon fill="#191970" stroke="#191970" points="2379.2888,-400.4206 2389.682,-402.4778 2382.8401,-394.3883 2379.2888,-400.4206"/>
</g>
-<!-- Node144->Node50 -->
+<!-- Node145->Node50 -->
<g id="edge67" class="edge">
-<title>Node144->Node50</title>
+<title>Node145->Node50</title>
<path fill="none" stroke="#191970" d="M2420.8087,-392.6171C2432.3705,-350.364 2455.5,-265.8364 2464.8931,-231.5088"/>
<polygon fill="#191970" stroke="#191970" points="2417.3974,-391.8231 2418.1339,-402.3923 2424.1492,-393.6707 2417.3974,-391.8231"/>
</g>
-<!-- Node144->Node145 -->
+<!-- Node145->Node146 -->
<g id="edge52" class="edge">
-<title>Node144->Node145</title>
+<title>Node145->Node146</title>
<path fill="none" stroke="#191970" d="M2446.6561,-396.9168C2461.2367,-387.6141 2478.5735,-376.3975 2494,-366 2528.0067,-343.0794 2566.7808,-315.4327 2590.0498,-298.6738"/>
<polygon fill="#191970" stroke="#191970" points="2444.5355,-394.1175 2437.9756,-402.4372 2448.292,-400.0242 2444.5355,-394.1175"/>
</g>
-<!-- Node144->Node146 -->
+<!-- Node145->Node147 -->
<g id="edge56" class="edge">
-<title>Node144->Node146</title>
+<title>Node145->Node147</title>
<path fill="none" stroke="#191970" d="M2478.9155,-399.8198C2518.016,-389.1704 2567.2817,-375.7525 2604.552,-365.6017"/>
<polygon fill="#191970" stroke="#191970" points="2477.936,-396.459 2469.2072,-402.4639 2479.7755,-403.213 2477.936,-396.459"/>
</g>
-<!-- Node144->Node148 -->
+<!-- Node145->Node149 -->
<g id="edge61" class="edge">
-<title>Node144->Node148</title>
+<title>Node145->Node149</title>
<path fill="none" stroke="#191970" d="M2500.2951,-404.7648C2584.4856,-392.3402 2711.3131,-373.6234 2791.8527,-361.7376"/>
<polygon fill="#191970" stroke="#191970" points="2499.6271,-401.3254 2490.2453,-406.2479 2500.6492,-408.2504 2499.6271,-401.3254"/>
</g>
-<!-- Node145->Node47 -->
+<!-- Node146->Node47 -->
<g id="edge54" class="edge">
-<title>Node145->Node47</title>
+<title>Node146->Node47</title>
<path fill="none" stroke="#191970" d="M2626.3313,-259.7498C2661.2057,-205.7246 2745.6649,-74.8861 2774.3105,-30.5103"/>
<polygon fill="#191970" stroke="#191970" points="2623.2798,-258.0235 2620.7969,-268.3233 2629.1609,-261.8199 2623.2798,-258.0235"/>
</g>
-<!-- Node145->Node50 -->
+<!-- Node146->Node50 -->
<g id="edge53" class="edge">
-<title>Node145->Node50</title>
+<title>Node146->Node50</title>
<path fill="none" stroke="#191970" d="M2570.0747,-264.1902C2548.1989,-253.8685 2521.6504,-241.3421 2501.2525,-231.7177"/>
<polygon fill="#191970" stroke="#191970" points="2568.595,-267.362 2579.1324,-268.4639 2571.5821,-261.0313 2568.595,-267.362"/>
</g>
-<!-- Node146->Node46 -->
+<!-- Node147->Node46 -->
<g id="edge59" class="edge">
-<title>Node146->Node46</title>
+<title>Node147->Node46</title>
<path fill="none" stroke="#191970" d="M2680.8315,-327.126C2686.9469,-318.8268 2692.8389,-309.0395 2696,-299 2700.1379,-285.8583 2699.1438,-281.4143 2696,-268 2680.0872,-200.1004 2635.4585,-128.5823 2614.7748,-97.9233"/>
<polygon fill="#191970" stroke="#191970" points="2677.9223,-325.163 2674.4829,-335.184 2683.4208,-329.4951 2677.9223,-325.163"/>
</g>
-<!-- Node146->Node50 -->
+<!-- Node147->Node50 -->
<g id="edge60" class="edge">
-<title>Node146->Node50</title>
+<title>Node147->Node50</title>
<path fill="none" stroke="#191970" d="M2589.107,-332.1526C2567.4088,-324.294 2544.4282,-313.4969 2526,-299 2502.6027,-280.594 2485.0303,-249.9672 2475.9406,-231.6583"/>
<polygon fill="#191970" stroke="#191970" points="2588.042,-335.4877 2598.6367,-335.4509 2590.3315,-328.8726 2588.042,-335.4877"/>
</g>
-<!-- Node146->Node145 -->
+<!-- Node147->Node146 -->
<g id="edge57" class="edge">
-<title>Node146->Node145</title>
+<title>Node147->Node146</title>
<path fill="none" stroke="#191970" d="M2642.8592,-327.0626C2635.9619,-317.6315 2628.2068,-307.0276 2622.0749,-298.6432"/>
<polygon fill="#191970" stroke="#191970" points="2640.1528,-329.2911 2648.8811,-335.2967 2645.803,-325.1588 2640.1528,-329.2911"/>
</g>
-<!-- Node146->Node147 -->
+<!-- Node147->Node148 -->
<g id="edge58" class="edge">
-<title>Node146->Node147</title>
+<title>Node147->Node148</title>
<path fill="none" stroke="#191970" d="M2739.5832,-333.3551C2789.4924,-322.6029 2853.2281,-308.8721 2901.17,-298.5438"/>
<polygon fill="#191970" stroke="#191970" points="2738.8331,-329.9363 2729.7945,-335.4639 2740.3073,-336.7793 2738.8331,-329.9363"/>
</g>
-<!-- Node148->Node46 -->
+<!-- Node149->Node46 -->
<g id="edge63" class="edge">
-<title>Node148->Node46</title>
+<title>Node149->Node46</title>
<path fill="none" stroke="#191970" d="M2847.1736,-327.9185C2810.2132,-288.0497 2730.4069,-202.8874 2660,-134 2647.1601,-121.4372 2632.1579,-107.6877 2620.9146,-97.5579"/>
<polygon fill="#191970" stroke="#191970" points="2844.8239,-330.5325 2854.1854,-335.4936 2849.9609,-325.7774 2844.8239,-330.5325"/>
</g>
-<!-- Node148->Node47 -->
+<!-- Node149->Node47 -->
<g id="edge65" class="edge">
-<title>Node148->Node47</title>
+<title>Node149->Node47</title>
<path fill="none" stroke="#191970" d="M2867.6978,-325.1096C2867.4034,-298.1602 2867,-254.3276 2867,-216.5 2867,-216.5 2867,-216.5 2867,-149.5 2867,-99.9692 2825.5089,-53.4923 2801.1977,-30.5111"/>
<polygon fill="#191970" stroke="#191970" points="2864.1992,-325.2675 2867.8119,-335.2273 2871.1988,-325.1885 2864.1992,-325.2675"/>
</g>
-<!-- Node148->Node48 -->
+<!-- Node149->Node48 -->
<g id="edge66" class="edge">
-<title>Node148->Node48</title>
+<title>Node149->Node48</title>
<path fill="none" stroke="#191970" d="M2840.8722,-329.1349C2816.9468,-311.0478 2780.5993,-285.3656 2746,-268 2665.2236,-227.4579 2642.2556,-220.4793 2554,-201 2398.7507,-166.7343 2212.3685,-155.2484 2110.2343,-151.4117"/>
<polygon fill="#191970" stroke="#191970" points="2839.0288,-332.1313 2849.0985,-335.4254 2843.2809,-326.5707 2839.0288,-332.1313"/>
</g>
-<!-- Node148->Node50 -->
+<!-- Node149->Node50 -->
<g id="edge64" class="edge">
-<title>Node148->Node50</title>
+<title>Node149->Node50</title>
<path fill="none" stroke="#191970" d="M2832.3026,-330.4564C2798.1951,-312.017 2744.8852,-285.0271 2696,-268 2646.8657,-250.8862 2589.7366,-237.9601 2545.0823,-229.3383"/>
<polygon fill="#191970" stroke="#191970" points="2830.9388,-333.6993 2841.3938,-335.4151 2834.2908,-327.554 2830.9388,-333.6993"/>
</g>
-<!-- Node148->Node145 -->
+<!-- Node149->Node146 -->
<g id="edge62" class="edge">
-<title>Node148->Node145</title>
+<title>Node149->Node146</title>
<path fill="none" stroke="#191970" d="M2800.4412,-332.8874C2759.551,-322.2273 2707.9467,-308.774 2668.9274,-298.6017"/>
<polygon fill="#191970" stroke="#191970" points="2799.7646,-336.3279 2810.3242,-335.4639 2801.5306,-329.5543 2799.7646,-336.3279"/>
</g>
-<!-- Node149->Node48 -->
+<!-- Node150->Node48 -->
<g id="edge73" class="edge">
-<title>Node149->Node48</title>
+<title>Node150->Node48</title>
<path fill="none" stroke="#191970" d="M1826.82,-325.3425C1828.5074,-307.8507 1833.0721,-284.7587 1845,-268 1881.7178,-216.4114 1948.6278,-182.3828 1992.1869,-164.5272"/>
<polygon fill="#191970" stroke="#191970" points="1823.3229,-325.1792 1826.1011,-335.4032 1830.3051,-325.6782 1823.3229,-325.1792"/>
</g>
-<!-- Node155->Node22 -->
+<!-- Node156->Node22 -->
<g id="edge86" class="edge">
-<title>Node155->Node22</title>
+<title>Node156->Node22</title>
<path fill="none" stroke="#191970" d="M2008.601,-675.3325C1812.5104,-663.6582 1303.4505,-633.3511 1120.6119,-622.4658"/>
<polygon fill="#191970" stroke="#191970" points="2008.6321,-678.8405 2018.8225,-675.941 2009.0482,-671.8528 2008.6321,-678.8405"/>
</g>
-<!-- Node155->Node91 -->
+<!-- Node156->Node91 -->
<g id="edge87" class="edge">
-<title>Node155->Node91</title>
+<title>Node156->Node91</title>
<path fill="none" stroke="#191970" d="M2008.4524,-679.5509C1694.2667,-677.1076 538.6481,-661.7032 408,-567 265.734,-463.8752 261.9638,-211.1694 262.751,-159.0114"/>
<polygon fill="#191970" stroke="#191970" points="2008.8034,-683.0535 2018.8296,-679.6292 2008.8563,-676.0537 2008.8034,-683.0535"/>
</g>
-<!-- Node156 -->
+<!-- Node157 -->
<g id="node39" class="node">
-<title>Node156</title>
+<title>Node157</title>
<g id="a_node39"><a xlink:href="script_2ir__builder_2base_8h.html" target="_top" xlink:title="include/tvm/script\l/ir_builder/base.h">
<polygon fill="#ffffff" stroke="#ff0000" points="2019,-603.5 2019,-633.5 2123,-633.5 2123,-603.5 2019,-603.5"/>
<text text-anchor="start" x="2027" y="-621.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/script</text>
@@ -1119,15 +1119,15 @@
</a>
</g>
</g>
-<!-- Node155->Node156 -->
+<!-- Node156->Node157 -->
<g id="edge88" class="edge">
-<title>Node155->Node156</title>
+<title>Node156->Node157</title>
<path fill="none" stroke="#191970" d="M2081.968,-660.6584C2079.6907,-651.9047 2077.0394,-641.7139 2074.914,-633.5446"/>
<polygon fill="#191970" stroke="#191970" points="2078.5949,-661.594 2084.5,-670.3906 2085.3694,-659.8315 2078.5949,-661.594"/>
</g>
-<!-- Node157 -->
+<!-- Node158 -->
<g id="node40" class="node">
-<title>Node157</title>
+<title>Node158</title>
<g id="a_node40"><a xlink:href="ir__builder_2ir_2frame_8h.html" target="_top" xlink:title="include/tvm/script\l/ir_builder/ir/frame.h">
<polygon fill="#ffffff" stroke="#ff0000" points="2045,-536.5 2045,-566.5 2161,-566.5 2161,-536.5 2045,-536.5"/>
<text text-anchor="start" x="2053" y="-554.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/script</text>
@@ -1135,15 +1135,15 @@
</a>
</g>
</g>
-<!-- Node155->Node157 -->
+<!-- Node156->Node158 -->
<g id="edge91" class="edge">
-<title>Node155->Node157</title>
+<title>Node156->Node158</title>
<path fill="none" stroke="#191970" d="M2109.2638,-663.6617C2118.1073,-655.7992 2127.2844,-645.5637 2132,-634 2141.4591,-610.8042 2126.6776,-583.237 2114.9737,-566.5695"/>
<polygon fill="#191970" stroke="#191970" points="2106.8504,-661.1158 2101.3737,-670.1853 2111.3109,-666.5106 2106.8504,-661.1158"/>
</g>
-<!-- Node158 -->
+<!-- Node159 -->
<g id="node41" class="node">
-<title>Node158</title>
+<title>Node159</title>
<g id="a_node41"><a xlink:href="ir_2ir_8h.html" target="_top" xlink:title="include/tvm/script\l/ir_builder/ir/ir.h">
<polygon fill="#ffffff" stroke="#000000" points="2076,-469.5 2076,-499.5 2180,-499.5 2180,-469.5 2076,-469.5"/>
<text text-anchor="start" x="2084" y="-487.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/script</text>
@@ -1151,204 +1151,204 @@
</a>
</g>
</g>
-<!-- Node155->Node158 -->
+<!-- Node156->Node159 -->
<g id="edge92" class="edge">
-<title>Node155->Node158</title>
+<title>Node156->Node159</title>
<path fill="none" stroke="#191970" d="M2113.1237,-664.9342C2124.8765,-657.0255 2138.0811,-646.3958 2147,-634 2165.3876,-608.4441 2164.8182,-598.0541 2170,-567 2172.2676,-553.4101 2174.6216,-548.9795 2170,-536 2165.0463,-522.0879 2154.5463,-509.2129 2145.2847,-499.8159"/>
<polygon fill="#191970" stroke="#191970" points="2111.0561,-662.1017 2104.5142,-670.4357 2114.8254,-668.0003 2111.0561,-662.1017"/>
</g>
-<!-- Node160 -->
+<!-- Node161 -->
<g id="node42" class="node">
-<title>Node160</title>
+<title>Node161</title>
<g id="a_node42"><a xlink:href="tir_2function_8h.html" target="_top" xlink:title="TIR Function. ">
<polygon fill="#ffffff" stroke="#ff0000" points="1481,-475 1481,-494 1621,-494 1621,-475 1481,-475"/>
<text text-anchor="middle" x="1551" y="-482" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/function.h</text>
</a>
</g>
</g>
-<!-- Node155->Node160 -->
+<!-- Node156->Node161 -->
<g id="edge93" class="edge">
-<title>Node155->Node160</title>
+<title>Node156->Node161</title>
<path fill="none" stroke="#191970" d="M2051.3438,-666.9948C1950.5511,-630.2318 1665.1647,-526.1403 1577.1648,-494.0433"/>
<polygon fill="#191970" stroke="#191970" points="2050.1908,-670.2998 2060.7848,-670.4383 2052.5895,-663.7235 2050.1908,-670.2998"/>
</g>
-<!-- Node156->Node157 -->
+<!-- Node157->Node158 -->
<g id="edge89" class="edge">
-<title>Node156->Node157</title>
+<title>Node157->Node158</title>
<path fill="none" stroke="#191970" d="M2082.6092,-594.1932C2087.0075,-584.9844 2091.8855,-574.771 2095.7674,-566.6432"/>
<polygon fill="#191970" stroke="#191970" points="2079.4129,-592.7646 2078.2613,-603.2967 2085.7294,-595.7815 2079.4129,-592.7646"/>
</g>
-<!-- Node157->Node158 -->
+<!-- Node158->Node159 -->
<g id="edge90" class="edge">
-<title>Node157->Node158</title>
+<title>Node158->Node159</title>
<path fill="none" stroke="#191970" d="M2112.1783,-526.9021C2115.586,-517.7696 2119.3487,-507.6854 2122.3496,-499.6432"/>
<polygon fill="#191970" stroke="#191970" points="2108.8897,-525.704 2108.6729,-536.2967 2115.448,-528.1512 2108.8897,-525.704"/>
</g>
-<!-- Node160->Node23 -->
+<!-- Node161->Node23 -->
<g id="edge94" class="edge">
-<title>Node160->Node23</title>
+<title>Node161->Node23</title>
<path fill="none" stroke="#191970" d="M1558.6118,-465.3061C1564.6136,-446.6276 1569.6356,-418.6905 1554,-402 1470.1102,-312.4503 1107.8914,-429.6744 1003,-366 978.0891,-350.8778 963.9427,-318.1994 957.436,-298.8163"/>
<polygon fill="#191970" stroke="#191970" points="1555.2743,-464.245 1555.2077,-474.8396 1561.8667,-466.5989 1555.2743,-464.245"/>
</g>
-<!-- Node160->Node95 -->
+<!-- Node161->Node95 -->
<g id="edge96" class="edge">
-<title>Node160->Node95</title>
+<title>Node161->Node95</title>
<path fill="none" stroke="#191970" d="M1508.7279,-472.0779C1464.4547,-459.0678 1395.8731,-438.9145 1355.6062,-427.0817"/>
<polygon fill="#191970" stroke="#191970" points="1507.752,-475.4391 1518.3332,-474.9005 1509.7256,-468.7231 1507.752,-475.4391"/>
</g>
-<!-- Node160->Node144 -->
+<!-- Node161->Node145 -->
<g id="edge95" class="edge">
-<title>Node160->Node144</title>
+<title>Node161->Node145</title>
<path fill="none" stroke="#191970" d="M1631.1179,-480.9539C1769.0348,-474.4192 2060.1006,-458.7414 2305,-433 2315.6452,-431.8811 2326.8489,-430.51 2337.8575,-429.0506"/>
<polygon fill="#191970" stroke="#191970" points="1630.879,-477.4612 1621.0546,-481.4273 1631.208,-484.4534 1630.879,-477.4612"/>
</g>
-<!-- Node160->Node150 -->
+<!-- Node161->Node151 -->
<g id="edge97" class="edge">
-<title>Node160->Node150</title>
+<title>Node161->Node151</title>
<path fill="none" stroke="#191970" d="M1533.0106,-467.9892C1521.1894,-457.1395 1505.8453,-443.0566 1494.367,-432.5218"/>
<polygon fill="#191970" stroke="#191970" points="1530.8069,-470.7173 1540.5409,-474.9005 1535.5401,-465.5601 1530.8069,-470.7173"/>
</g>
-<!-- Node162->Node21 -->
+<!-- Node163->Node21 -->
<g id="edge101" class="edge">
-<title>Node162->Node21</title>
+<title>Node163->Node21</title>
<path fill="none" stroke="#191970" d="M391.4883,-726.286C374.4725,-714.7864 351.6391,-699.355 337.1189,-689.5419"/>
<polygon fill="#191970" stroke="#191970" points="389.5361,-729.191 399.7813,-731.8906 393.4557,-723.3913 389.5361,-729.191"/>
</g>
-<!-- Node162->Node22 -->
+<!-- Node163->Node22 -->
<g id="edge102" class="edge">
-<title>Node162->Node22</title>
+<title>Node163->Node22</title>
<path fill="none" stroke="#191970" d="M473.8825,-729.9913C599.448,-705.8592 888.2541,-650.3543 1004.3445,-628.0432"/>
<polygon fill="#191970" stroke="#191970" points="472.8469,-726.6262 463.6872,-731.9507 474.1681,-733.5004 472.8469,-726.6262"/>
</g>
-<!-- Node162->Node111 -->
+<!-- Node163->Node111 -->
<g id="edge103" class="edge">
-<title>Node162->Node111</title>
+<title>Node163->Node111</title>
<path fill="none" stroke="#191970" d="M416.6636,-721.6825C422.924,-682.0149 441.9298,-591.8794 491,-536 505.2165,-519.8108 525.7819,-507.8794 544.3214,-499.6012"/>
<polygon fill="#191970" stroke="#191970" points="413.1875,-721.2648 415.1934,-731.668 420.1128,-722.2845 413.1875,-721.2648"/>
</g>
-<!-- Node154->Node48 -->
+<!-- Node155->Node48 -->
<g id="edge114" class="edge">
-<title>Node154->Node48</title>
+<title>Node155->Node48</title>
<path fill="none" stroke="#191970" d="M2034,-325.348C2034,-283.0061 2034,-198.7637 2034,-164.5088"/>
<polygon fill="#191970" stroke="#191970" points="2030.5001,-325.3923 2034,-335.3923 2037.5001,-325.3924 2030.5001,-325.3923"/>
</g>
-<!-- Node214->Node23 -->
+<!-- Node215->Node23 -->
<g id="edge122" class="edge">
-<title>Node214->Node23</title>
+<title>Node215->Node23</title>
<path fill="none" stroke="#191970" d="M2169.5498,-415.8208C1923.7259,-409.6247 1097.2521,-387.3001 1045,-366 1027.0209,-358.671 988.4752,-320.4645 967.433,-298.7069"/>
<polygon fill="#191970" stroke="#191970" points="2169.6944,-419.3254 2179.7791,-416.0777 2169.8702,-412.3276 2169.6944,-419.3254"/>
</g>
-<!-- Node214->Node27 -->
+<!-- Node215->Node27 -->
<g id="edge135" class="edge">
-<title>Node214->Node27</title>
+<title>Node215->Node27</title>
<path fill="none" stroke="#191970" d="M2169.7428,-415.0482C1983.8761,-408.2821 1453.1196,-388.3616 1012,-366 940.4854,-362.3747 858.0011,-357.3317 805.2216,-353.9948"/>
<polygon fill="#191970" stroke="#191970" points="2169.7912,-418.5522 2179.9117,-415.4178 2170.0455,-411.5568 2169.7912,-418.5522"/>
</g>
-<!-- Node214->Node45 -->
+<!-- Node215->Node45 -->
<g id="edge123" class="edge">
-<title>Node214->Node45</title>
+<title>Node215->Node45</title>
<path fill="none" stroke="#191970" d="M2230.502,-392.7514C2223.3323,-373.6336 2210.5248,-348.2693 2190,-335 2125.267,-293.15 1921.6171,-307.4616 1845,-299 1826.6307,-296.9713 1806.844,-294.7078 1788.2918,-292.551"/>
<polygon fill="#191970" stroke="#191970" points="2227.2473,-394.0484 2233.8335,-402.3474 2233.8601,-391.7526 2227.2473,-394.0484"/>
</g>
-<!-- Node214->Node46 -->
+<!-- Node215->Node46 -->
<g id="edge127" class="edge">
-<title>Node214->Node46</title>
+<title>Node215->Node46</title>
<path fill="none" stroke="#191970" d="M2264.5997,-395.7509C2291.1413,-371.4389 2328,-329.4113 2328,-283.5 2328,-283.5 2328,-283.5 2328,-216.5 2328,-168.095 2467.1163,-120.8793 2547.8146,-97.5625"/>
<polygon fill="#191970" stroke="#191970" points="2262.2422,-393.1636 2257.0867,-402.4195 2266.889,-398.3988 2262.2422,-393.1636"/>
</g>
-<!-- Node214->Node47 -->
+<!-- Node215->Node47 -->
<g id="edge133" class="edge">
-<title>Node214->Node47</title>
+<title>Node215->Node47</title>
<path fill="none" stroke="#191970" d="M2306.1627,-405.031C2313.8487,-403.8808 2321.581,-402.8358 2329,-402 2471.8744,-385.9033 2841.1279,-418.4448 2975,-366 3059.1946,-333.0165 3113,-306.9248 3113,-216.5 3113,-216.5 3113,-216.5 3113,-149.5 3113,-95.1498 2957.054,-52.1213 2859.8923,-30.587"/>
<polygon fill="#191970" stroke="#191970" points="2305.4583,-401.5984 2296.1157,-406.595 2306.5351,-408.5151 2305.4583,-401.5984"/>
</g>
-<!-- Node214->Node48 -->
+<!-- Node215->Node48 -->
<g id="edge134" class="edge">
-<title>Node214->Node48</title>
+<title>Node215->Node48</title>
<path fill="none" stroke="#191970" d="M2247.6536,-392.7277C2252.7021,-375.7674 2256.2872,-353.1508 2248,-335 2209.9148,-251.585 2113.7436,-191.2569 2064.3327,-164.6487"/>
<polygon fill="#191970" stroke="#191970" points="2244.2494,-391.8729 2244.4141,-402.4665 2250.8916,-394.0824 2244.2494,-391.8729"/>
</g>
-<!-- Node214->Node49 -->
+<!-- Node215->Node49 -->
<g id="edge125" class="edge">
-<title>Node214->Node49</title>
+<title>Node215->Node49</title>
<path fill="none" stroke="#191970" d="M2242.1597,-392.5067C2243.6857,-374.2049 2242.2935,-350.0342 2228,-335 2198.3357,-303.7983 2084.3795,-291.4382 2006.3046,-286.584"/>
<polygon fill="#191970" stroke="#191970" points="2238.6772,-392.1536 2240.9994,-402.4908 2245.6304,-392.9617 2238.6772,-392.1536"/>
</g>
-<!-- Node214->Node50 -->
+<!-- Node215->Node50 -->
<g id="edge131" class="edge">
-<title>Node214->Node50</title>
+<title>Node215->Node50</title>
<path fill="none" stroke="#191970" d="M2286.8235,-398.3127C2305.1042,-389.8847 2325.4374,-378.9393 2342,-366 2393.7954,-325.5356 2439.6766,-261.1119 2459.1219,-231.854"/>
<polygon fill="#191970" stroke="#191970" points="2285.1833,-395.2121 2277.4817,-402.4878 2288.0396,-401.6028 2285.1833,-395.2121"/>
</g>
-<!-- Node214->Node52 -->
+<!-- Node215->Node52 -->
<g id="edge128" class="edge">
-<title>Node214->Node52</title>
+<title>Node215->Node52</title>
<path fill="none" stroke="#191970" d="M2215.8404,-394.6988C2193.0375,-371.4912 2159.777,-338.4409 2152,-335 2045.0588,-287.6852 1743.3919,-310.315 1627,-299 1611.7903,-297.5214 1595.5562,-295.6707 1580.0357,-293.7681"/>
<polygon fill="#191970" stroke="#191970" points="2213.624,-397.4379 2223.1208,-402.1349 2218.6258,-392.5408 2213.624,-397.4379"/>
</g>
-<!-- Node214->Node56 -->
+<!-- Node215->Node56 -->
<g id="edge136" class="edge">
-<title>Node214->Node56</title>
+<title>Node215->Node56</title>
<path fill="none" stroke="#191970" d="M2200.6858,-397.4918C2185.1411,-388.5687 2167.2076,-377.4857 2152,-366 2135.9422,-353.8721 2137.5128,-342.8876 2119,-335 1975.7031,-273.9465 1573.0969,-321.7085 1419,-299 1415.9913,-298.5566 1412.9165,-298.0344 1409.8229,-297.4548"/>
<polygon fill="#191970" stroke="#191970" points="2199.095,-400.6129 2209.5235,-402.4827 2202.5371,-394.5177 2199.095,-400.6129"/>
</g>
-<!-- Node214->Node145 -->
+<!-- Node215->Node146 -->
<g id="edge124" class="edge">
-<title>Node214->Node145</title>
+<title>Node215->Node146</title>
<path fill="none" stroke="#191970" d="M2289.7181,-398.9203C2364.2566,-372.1424 2500.4499,-323.215 2569.0909,-298.5558"/>
<polygon fill="#191970" stroke="#191970" points="2288.2906,-395.7141 2280.0628,-402.389 2290.6573,-402.3019 2288.2906,-395.7141"/>
</g>
-<!-- Node214->Node147 -->
+<!-- Node215->Node148 -->
<g id="edge126" class="edge">
-<title>Node214->Node147</title>
+<title>Node215->Node148</title>
<path fill="none" stroke="#191970" d="M2306.1644,-405.0456C2313.85,-403.8927 2321.5818,-402.8431 2329,-402 2398.0044,-394.1573 2900.1884,-411.1004 2953,-366 2972.286,-349.53 2973.726,-317.6441 2972.6325,-298.6994"/>
<polygon fill="#191970" stroke="#191970" points="2305.459,-401.6132 2296.1177,-406.6121 2306.5375,-408.5296 2305.459,-401.6132"/>
</g>
-<!-- Node214->Node148 -->
+<!-- Node215->Node149 -->
<g id="edge129" class="edge">
-<title>Node214->Node148</title>
+<title>Node215->Node149</title>
<path fill="none" stroke="#191970" d="M2306.1902,-405.2591C2313.8711,-404.0666 2321.5947,-402.9495 2329,-402 2513.0731,-378.3995 2560.4586,-385.605 2745,-366 2760.1959,-364.3856 2776.4239,-362.4745 2791.9426,-360.5554"/>
<polygon fill="#191970" stroke="#191970" points="2305.4711,-401.8295 2296.1479,-406.8622 2306.5746,-408.7419 2305.4711,-401.8295"/>
</g>
-<!-- Node214->Node149 -->
+<!-- Node215->Node150 -->
<g id="edge132" class="edge">
-<title>Node214->Node149</title>
+<title>Node215->Node150</title>
<path fill="none" stroke="#191970" d="M2169.7334,-406.3984C2095.971,-394.4031 1978.7282,-375.3369 1902.0623,-362.8694"/>
<polygon fill="#191970" stroke="#191970" points="2169.4121,-409.892 2179.8443,-408.0426 2170.5358,-402.9828 2169.4121,-409.892"/>
</g>
-<!-- Node214->Node154 -->
+<!-- Node215->Node155 -->
<g id="edge130" class="edge">
-<title>Node214->Node154</title>
+<title>Node215->Node155</title>
<path fill="none" stroke="#191970" d="M2182.5117,-399.2759C2150.3608,-388.7165 2110.3645,-375.5805 2079.9813,-365.6017"/>
<polygon fill="#191970" stroke="#191970" points="2181.6256,-402.6687 2192.2184,-402.4639 2183.8098,-396.0182 2181.6256,-402.6687"/>
</g>
-<!-- Node191->Node160 -->
+<!-- Node192->Node161 -->
<g id="edge142" class="edge">
-<title>Node191->Node160</title>
+<title>Node192->Node161</title>
<path fill="none" stroke="#191970" d="M1556.11,-598.624C1554.7996,-569.3572 1552.4011,-515.7914 1551.4276,-494.0496"/>
<polygon fill="#191970" stroke="#191970" points="1552.6246,-599.0297 1556.5685,-608.8631 1559.6176,-598.7165 1552.6246,-599.0297"/>
</g>
-<!-- Node191->Node192 -->
+<!-- Node192->Node193 -->
<g id="edge138" class="edge">
-<title>Node191->Node192</title>
+<title>Node192->Node193</title>
<path fill="none" stroke="#191970" d="M1535.7304,-603.0102C1517.9577,-590.067 1493.1761,-572.0195 1478.1569,-561.0817"/>
<polygon fill="#191970" stroke="#191970" points="1533.6746,-605.8428 1543.8186,-608.9005 1537.7955,-600.1843 1533.6746,-605.8428"/>
</g>
-<!-- Node192->Node56 -->
+<!-- Node193->Node56 -->
<g id="edge139" class="edge">
-<title>Node192->Node56</title>
+<title>Node193->Node56</title>
<path fill="none" stroke="#191970" d="M1461.3852,-531.8099C1459.2082,-513.7779 1458.981,-487.2025 1472,-469 1495.1547,-436.6263 1532.2357,-466.3244 1554,-433 1561.5339,-421.4645 1560.795,-413.9856 1554,-402 1523.0796,-347.4596 1455.9672,-315.3173 1408.4689,-298.5709"/>
<polygon fill="#191970" stroke="#191970" points="1457.9371,-532.4172 1462.8916,-541.7822 1464.8586,-531.3715 1457.9371,-532.4172"/>
</g>
-<!-- Node192->Node95 -->
+<!-- Node193->Node95 -->
<g id="edge140" class="edge">
-<title>Node192->Node95</title>
+<title>Node193->Node95</title>
<path fill="none" stroke="#191970" d="M1447.1314,-534.6381C1416.9843,-506.1894 1356.9751,-449.561 1333.1198,-427.0496"/>
<polygon fill="#191970" stroke="#191970" points="1445.1126,-537.5453 1454.7877,-541.8631 1449.9169,-532.4542 1445.1126,-537.5453"/>
</g>
-<!-- Node192->Node160 -->
+<!-- Node193->Node161 -->
<g id="edge141" class="edge">
-<title>Node192->Node160</title>
+<title>Node193->Node161</title>
<path fill="none" stroke="#191970" d="M1485.3372,-535.6559C1501.9165,-522.7395 1524.7891,-504.9201 1538.7012,-494.0817"/>
<polygon fill="#191970" stroke="#191970" points="1483.0593,-532.9937 1477.3217,-541.9005 1487.3613,-538.5158 1483.0593,-532.9937"/>
</g>
diff --git a/docs/reference/api/doxygen/attr__registry__map_8h__dep__incl.svg b/docs/reference/api/doxygen/attr__registry__map_8h__dep__incl.svg
index 6ebd733f14..47adafa5d0 100644
--- a/docs/reference/api/doxygen/attr__registry__map_8h__dep__incl.svg
+++ b/docs/reference/api/doxygen/attr__registry__map_8h__dep__incl.svg
@@ -31,39 +31,39 @@
<path fill="none" stroke="#191970" d="M1768.0451,-404.3114C1622.155,-377.384 1293.0628,-316.6424 1165.4458,-293.0878"/>
<polygon fill="#191970" stroke="#191970" points="1767.7581,-407.8174 1778.2273,-406.1907 1769.0287,-400.9337 1767.7581,-407.8174"/>
</g>
-<!-- Node112 -->
+<!-- Node113 -->
<g id="node33" class="node">
-<title>Node112</title>
+<title>Node113</title>
<g id="a_node33"><a xlink:href="executor_8h.html" target="_top" xlink:title="Object representation of Executor configuration and registry. ">
<polygon fill="#ffffff" stroke="#000000" points="1867.5,-341 1867.5,-360 2023.5,-360 2023.5,-341 1867.5,-341"/>
<text text-anchor="middle" x="1945.5" y="-348" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/relay/executor.h</text>
</a>
</g>
</g>
-<!-- Node24->Node112 -->
+<!-- Node24->Node113 -->
<g id="edge53" class="edge">
-<title>Node24->Node112</title>
+<title>Node24->Node113</title>
<path fill="none" stroke="#191970" d="M1871.8625,-397.0445C1891.2974,-384.7601 1915.1199,-369.7025 1930.3057,-360.1039"/>
<polygon fill="#191970" stroke="#191970" points="1869.8714,-394.1624 1863.2885,-402.4639 1873.6115,-400.0795 1869.8714,-394.1624"/>
</g>
-<!-- Node113 -->
+<!-- Node114 -->
<g id="node34" class="node">
-<title>Node113</title>
+<title>Node114</title>
<g id="a_node34"><a xlink:href="runtime_8h.html" target="_top" xlink:title="Object representation of Runtime configuration and registry. ">
<polygon fill="#ffffff" stroke="#000000" points="2041.5,-341 2041.5,-360 2193.5,-360 2193.5,-341 2041.5,-341"/>
<text text-anchor="middle" x="2117.5" y="-348" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/relay/runtime.h</text>
</a>
</g>
</g>
-<!-- Node24->Node113 -->
+<!-- Node24->Node114 -->
<g id="edge54" class="edge">
-<title>Node24->Node113</title>
+<title>Node24->Node114</title>
<path fill="none" stroke="#191970" d="M1910.233,-400.4528C1963.4792,-387.6201 2034.5049,-370.5024 2078.0682,-360.0033"/>
<polygon fill="#191970" stroke="#191970" points="1909.4099,-397.0509 1900.5083,-402.7966 1911.05,-403.8561 1909.4099,-397.0509"/>
</g>
-<!-- Node114 -->
+<!-- Node115 -->
<g id="node35" class="node">
-<title>Node114</title>
+<title>Node115</title>
<g id="a_node35"><a xlink:href="tag_8h.html" target="_top" xlink:title="Target tag registry. ">
<polygon fill="#ffffff" stroke="#000000" points="2738,-201.5 2738,-231.5 2845,-231.5 2845,-201.5 2738,-201.5"/>
<text text-anchor="start" x="2746" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/target</text>
@@ -71,15 +71,15 @@
</a>
</g>
</g>
-<!-- Node24->Node114 -->
+<!-- Node24->Node115 -->
<g id="edge55" class="edge">
-<title>Node24->Node114</title>
+<title>Node24->Node115</title>
<path fill="none" stroke="#191970" d="M1838.1745,-392.2978C1838.6678,-373.678 1842.677,-349.1928 1858.5,-335 2003.4249,-205.006 2536.9459,-266.7663 2728.5,-232 2731.6031,-231.4368 2734.7811,-230.8114 2737.9805,-230.1441"/>
<polygon fill="#191970" stroke="#191970" points="1834.6753,-392.4793 1838.2275,-402.4609 1841.6752,-392.4427 1834.6753,-392.4793"/>
</g>
-<!-- Node115 -->
+<!-- Node116 -->
<g id="node36" class="node">
-<title>Node115</title>
+<title>Node116</title>
<g id="a_node36"><a xlink:href="target__kind_8h.html" target="_top" xlink:title="Target kind registry. ">
<polygon fill="#ffffff" stroke="#000000" points="1704,-335.5 1704,-365.5 1811,-365.5 1811,-335.5 1704,-335.5"/>
<text text-anchor="start" x="1712" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/target</text>
@@ -87,9 +87,9 @@
</a>
</g>
</g>
-<!-- Node24->Node115 -->
+<!-- Node24->Node116 -->
<g id="edge56" class="edge">
-<title>Node24->Node115</title>
+<title>Node24->Node116</title>
<path fill="none" stroke="#191970" d="M1812.9183,-395.7808C1800.8722,-385.9383 1786.9233,-374.541 1776.0334,-365.6432"/>
<polygon fill="#191970" stroke="#191970" points="1810.9346,-398.6797 1820.8929,-402.2967 1815.3637,-393.2591 1810.9346,-398.6797"/>
</g>
@@ -627,9 +627,9 @@
<path fill="none" stroke="#191970" d="M673.7357,-196.3529C675.025,-179.4548 675.5286,-154.8155 670.5,-134 659.9764,-90.4385 631.2521,-44.4847 618.1607,-25.0582"/>
<polygon fill="#191970" stroke="#191970" points="670.2171,-196.3915 672.7759,-206.6727 677.187,-197.0398 670.2171,-196.3915"/>
</g>
-<!-- Node111 -->
+<!-- Node112 -->
<g id="node32" class="node">
-<title>Node111</title>
+<title>Node112</title>
<g id="a_node32"><a xlink:href="data__layout_8h.html" target="_top" xlink:title="Layout expression to describe the data organization of a tensor. And BijectiveLayout to mapping two d...">
<polygon fill="#ffffff" stroke="#ff0000" points="904,-134.5 904,-164.5 1017,-164.5 1017,-134.5 904,-134.5"/>
<text text-anchor="start" x="912" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/data</text>
@@ -637,9 +637,9 @@
</a>
</g>
</g>
-<!-- Node59->Node111 -->
+<!-- Node59->Node112 -->
<g id="edge49" class="edge">
-<title>Node59->Node111</title>
+<title>Node59->Node112</title>
<path fill="none" stroke="#191970" d="M722.3863,-204.7028C773.2048,-192.9214 851.1444,-174.8523 903.89,-162.6241"/>
<polygon fill="#191970" stroke="#191970" points="721.5509,-201.3036 712.5997,-206.9717 723.1319,-208.1227 721.5509,-201.3036"/>
</g>
@@ -679,15 +679,15 @@
<path fill="none" stroke="#191970" d="M489.9167,-68.7424C519.6489,-55.7224 563.5066,-36.5167 589.6195,-25.0817"/>
<polygon fill="#191970" stroke="#191970" points="488.1774,-65.6831 480.4212,-72.9005 490.9854,-72.0952 488.1774,-65.6831"/>
</g>
-<!-- Node111->Node44 -->
+<!-- Node112->Node44 -->
<g id="edge50" class="edge">
-<title>Node111->Node44</title>
+<title>Node112->Node44</title>
<path fill="none" stroke="#191970" d="M939.273,-126.9253C930.2062,-117.2828 919.8643,-106.2843 911.7391,-97.6432"/>
<polygon fill="#191970" stroke="#191970" points="936.8042,-129.409 946.2043,-134.2967 941.9039,-124.6138 936.8042,-129.409"/>
</g>
-<!-- Node116 -->
+<!-- Node117 -->
<g id="node37" class="node">
-<title>Node116</title>
+<title>Node117</title>
<g id="a_node37"><a xlink:href="target_8h.html" target="_top" xlink:title="Compilation target object. ">
<polygon fill="#ffffff" stroke="#000000" points="1704,-268.5 1704,-298.5 1811,-298.5 1811,-268.5 1704,-268.5"/>
<text text-anchor="start" x="1712" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/target</text>
@@ -695,45 +695,45 @@
</a>
</g>
</g>
-<!-- Node115->Node116 -->
+<!-- Node116->Node117 -->
<g id="edge57" class="edge">
-<title>Node115->Node116</title>
+<title>Node116->Node117</title>
<path fill="none" stroke="#191970" d="M1757.5,-325.0249C1757.5,-316.128 1757.5,-306.4287 1757.5,-298.6432"/>
<polygon fill="#191970" stroke="#191970" points="1754.0001,-325.2966 1757.5,-335.2967 1761.0001,-325.2967 1754.0001,-325.2966"/>
</g>
-<!-- Node116->Node35 -->
+<!-- Node117->Node35 -->
<g id="edge72" class="edge">
-<title>Node116->Node35</title>
+<title>Node117->Node35</title>
<path fill="none" stroke="#191970" d="M1821.2748,-277.0155C1850.5753,-274.1151 1885.7885,-270.74 1917.5,-268 2022.7446,-258.9066 2291.7149,-266.6705 2391.5,-232 2414.788,-223.9085 2415.5115,-212.1495 2437.5,-201 2523.8683,-157.2062 2585.5855,-184.307 2629.5,-98 2635.7481,-85.7204 2638.7046,-77.2519 2629.5,-67 2603.5895,-38.1414 2362.8984,-24.0147 2233.6641,-18.4591"/>
<polygon fill="#191970" stroke="#191970" points="1820.7775,-273.5476 1811.1728,-278.0198 1821.4701,-280.5133 1820.7775,-273.5476"/>
</g>
-<!-- Node116->Node42 -->
+<!-- Node117->Node42 -->
<g id="edge69" class="edge">
-<title>Node116->Node42</title>
+<title>Node117->Node42</title>
<path fill="none" stroke="#191970" d="M1821.0792,-268.6849C1822.5647,-268.4431 1824.0399,-268.2142 1825.5,-268 1943.3961,-250.7059 2251.1865,-279.4249 2360.5,-232 2379.2458,-223.8673 2479.3585,-123.947 2510.945,-92.1499"/>
<polygon fill="#191970" stroke="#191970" points="1820.4072,-265.2494 1811.1749,-270.447 1821.6334,-272.1411 1820.4072,-265.2494"/>
</g>
-<!-- Node116->Node44 -->
+<!-- Node117->Node44 -->
<g id="edge70" class="edge">
-<title>Node116->Node44</title>
+<title>Node117->Node44</title>
<path fill="none" stroke="#191970" d="M1693.3655,-282.8344C1557.3581,-280.6591 1244.4395,-271.3938 1146.5,-232 1082.2955,-206.1753 1085.1518,-169.0827 1025.5,-134 1000.0006,-119.0032 969.3146,-106.4903 944.3807,-97.555"/>
<polygon fill="#191970" stroke="#191970" points="1693.5762,-286.3379 1703.6282,-282.9904 1693.6826,-279.3387 1693.5762,-286.3379"/>
</g>
-<!-- Node116->Node45 -->
+<!-- Node117->Node45 -->
<g id="edge71" class="edge">
-<title>Node116->Node45</title>
+<title>Node117->Node45</title>
<path fill="none" stroke="#191970" d="M1693.5821,-280.5186C1550.6301,-273.5273 1208.7597,-254.9228 1095.5,-232 1003.2057,-213.3204 974.2247,-215.1111 894.5,-165 876.9371,-153.9608 837.293,-117.2247 829.5,-98 820.2537,-75.1903 832.8966,-47.5115 843.0532,-30.7159"/>
<polygon fill="#191970" stroke="#191970" points="1693.7757,-284.032 1703.9338,-281.0215 1694.1154,-277.0403 1693.7757,-284.032"/>
</g>
-<!-- Node116->Node114 -->
+<!-- Node117->Node115 -->
<g id="edge76" class="edge">
-<title>Node116->Node114</title>
+<title>Node117->Node115</title>
<path fill="none" stroke="#191970" d="M1821.0716,-268.6305C1822.5594,-268.405 1824.0371,-268.1942 1825.5,-268 2223.6591,-215.1441 2330.8081,-288.2638 2728.5,-232 2731.5112,-231.574 2734.588,-231.0654 2737.6831,-230.4965"/>
<polygon fill="#191970" stroke="#191970" points="1820.4312,-265.189 1811.1557,-270.3091 1821.5996,-272.0908 1820.4312,-265.189"/>
</g>
-<!-- Node101 -->
+<!-- Node102 -->
<g id="node38" class="node">
-<title>Node101</title>
+<title>Node102</title>
<g id="a_node38"><a xlink:href="search__task_8h.html" target="_top" xlink:title="Meta information and hardware parameters for a search task. ">
<polygon fill="#ffffff" stroke="#ff0000" points="1401.5,-201.5 1401.5,-231.5 1553.5,-231.5 1553.5,-201.5 1401.5,-201.5"/>
<text text-anchor="start" x="1409.5" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/auto_scheduler</text>
@@ -741,15 +741,15 @@
</a>
</g>
</g>
-<!-- Node116->Node101 -->
+<!-- Node117->Node102 -->
<g id="edge58" class="edge">
-<title>Node116->Node101</title>
+<title>Node117->Node102</title>
<path fill="none" stroke="#191970" d="M1693.9883,-268.3026C1647.9893,-257.2957 1586.0944,-242.4851 1540.268,-231.5195"/>
<polygon fill="#191970" stroke="#191970" points="1693.3045,-271.7377 1703.8445,-270.661 1694.9336,-264.9299 1693.3045,-271.7377"/>
</g>
-<!-- Node109 -->
+<!-- Node110 -->
<g id="node39" class="node">
-<title>Node109</title>
+<title>Node110</title>
<g id="a_node39"><a xlink:href="driver__api_8h.html" target="_top" xlink:title="Compiler driver APIs to drive the compilation. ">
<polygon fill="#ffffff" stroke="#000000" points="1571.5,-201.5 1571.5,-231.5 1677.5,-231.5 1677.5,-201.5 1571.5,-201.5"/>
<text text-anchor="start" x="1579.5" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/driver</text>
@@ -757,15 +757,15 @@
</a>
</g>
</g>
-<!-- Node116->Node109 -->
+<!-- Node117->Node110 -->
<g id="edge59" class="edge">
-<title>Node116->Node109</title>
+<title>Node117->Node110</title>
<path fill="none" stroke="#191970" d="M1718.6187,-263.9132C1698.2357,-253.645 1673.6428,-241.2561 1654.7083,-231.7177"/>
<polygon fill="#191970" stroke="#191970" points="1717.1467,-267.0906 1727.6522,-268.4639 1720.2961,-260.8391 1717.1467,-267.0906"/>
</g>
-<!-- Node117 -->
+<!-- Node118 -->
<g id="node40" class="node">
-<title>Node117</title>
+<title>Node118</title>
<g id="a_node40"><a xlink:href="memory__pools_8h.html" target="_top" xlink:title="The object definition for relay.build argument type of memory pools. ">
<polygon fill="#ffffff" stroke="#000000" points="321.5,-201.5 321.5,-231.5 449.5,-231.5 449.5,-201.5 321.5,-201.5"/>
<text text-anchor="start" x="329.5" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/ir/memory</text>
@@ -773,15 +773,15 @@
</a>
</g>
</g>
-<!-- Node116->Node117 -->
+<!-- Node117->Node118 -->
<g id="edge60" class="edge">
-<title>Node116->Node117</title>
+<title>Node117->Node118</title>
<path fill="none" stroke="#191970" d="M1693.7211,-282.162C1512.4306,-278.0673 978.6556,-263.9891 536.5,-232 507.9815,-229.9367 476.5127,-226.8382 449.7173,-223.9525"/>
<polygon fill="#191970" stroke="#191970" points="1693.8015,-285.6646 1703.8773,-282.3893 1693.9582,-278.6664 1693.8015,-285.6646"/>
</g>
-<!-- Node118 -->
+<!-- Node119 -->
<g id="node41" class="node">
-<title>Node118</title>
+<title>Node119</title>
<g id="a_node41"><a xlink:href="tir_2usmp_2utils_8h.html" target="_top" xlink:title="Utilities for Unified Static Memory Planner. ">
<polygon fill="#ffffff" stroke="#ff0000" points="211.5,-134.5 211.5,-164.5 329.5,-164.5 329.5,-134.5 211.5,-134.5"/>
<text text-anchor="start" x="219.5" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/usmp</text>
@@ -789,15 +789,15 @@
</a>
</g>
</g>
-<!-- Node116->Node118 -->
+<!-- Node117->Node119 -->
<g id="edge82" class="edge">
-<title>Node116->Node118</title>
+<title>Node117->Node119</title>
<path fill="none" stroke="#191970" d="M1693.6764,-282.2921C1484.2038,-278.0692 818.9718,-262.5366 606.5,-232 552.6188,-224.2562 406.6058,-186.1225 325.9304,-164.5133"/>
<polygon fill="#191970" stroke="#191970" points="1693.8232,-285.7957 1703.8912,-282.4964 1693.9632,-278.7971 1693.8232,-285.7957"/>
</g>
-<!-- Node122 -->
+<!-- Node123 -->
<g id="node42" class="node">
-<title>Node122</title>
+<title>Node123</title>
<g id="a_node42"><a xlink:href="builder_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/builder.h">
<polygon fill="#ffffff" stroke="#ff0000" points="1155.5,-201.5 1155.5,-231.5 1307.5,-231.5 1307.5,-201.5 1155.5,-201.5"/>
<text text-anchor="start" x="1163.5" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
@@ -805,15 +805,15 @@
</a>
</g>
</g>
-<!-- Node116->Node122 -->
+<!-- Node117->Node123 -->
<g id="edge63" class="edge">
-<title>Node116->Node122</title>
+<title>Node117->Node123</title>
<path fill="none" stroke="#191970" d="M1693.6606,-277.0299C1608.5829,-268.1627 1453.4219,-251.1375 1321.5,-232 1316.9497,-231.3399 1312.2753,-230.6304 1307.5635,-229.8913"/>
<polygon fill="#191970" stroke="#191970" points="1693.3742,-280.5189 1703.6822,-278.0704 1694.0972,-273.5563 1693.3742,-280.5189"/>
</g>
-<!-- Node125 -->
+<!-- Node126 -->
<g id="node43" class="node">
-<title>Node125</title>
+<title>Node126</title>
<g id="a_node43"><a xlink:href="tune__context_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/tune_context.h">
<polygon fill="#ffffff" stroke="#ff0000" points="1207.5,-134.5 1207.5,-164.5 1359.5,-164.5 1359.5,-134.5 1207.5,-134.5"/>
<text text-anchor="start" x="1215.5" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
@@ -821,15 +821,15 @@
</a>
</g>
</g>
-<!-- Node116->Node125 -->
+<!-- Node117->Node126 -->
<g id="edge68" class="edge">
-<title>Node116->Node125</title>
+<title>Node117->Node126</title>
<path fill="none" stroke="#191970" d="M1693.8852,-278.4011C1604.2097,-270.5568 1445.6293,-254.0898 1392.5,-232 1354.3693,-216.1462 1317.7172,-183.7307 1298.1492,-164.5912"/>
<polygon fill="#191970" stroke="#191970" points="1693.6722,-281.8957 1703.9364,-279.2695 1694.2748,-274.9216 1693.6722,-281.8957"/>
</g>
-<!-- Node126 -->
+<!-- Node127 -->
<g id="node44" class="node">
-<title>Node126</title>
+<title>Node127</title>
<g id="a_node44"><a xlink:href="database_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/database.h">
<polygon fill="#ffffff" stroke="#ff0000" points="1733.5,-201.5 1733.5,-231.5 1885.5,-231.5 1885.5,-201.5 1733.5,-201.5"/>
<text text-anchor="start" x="1741.5" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
@@ -837,15 +837,15 @@
</a>
</g>
</g>
-<!-- Node116->Node126 -->
+<!-- Node117->Node127 -->
<g id="edge65" class="edge">
-<title>Node116->Node126</title>
+<title>Node117->Node127</title>
<path fill="none" stroke="#191970" d="M1775.4665,-260.3509C1782.8417,-250.8482 1791.1734,-240.1132 1797.7471,-231.6432"/>
<polygon fill="#191970" stroke="#191970" points="1772.6659,-258.2508 1769.2996,-268.2967 1778.1958,-262.5427 1772.6659,-258.2508"/>
</g>
-<!-- Node128 -->
+<!-- Node129 -->
<g id="node45" class="node">
-<title>Node128</title>
+<title>Node129</title>
<g id="a_node45"><a xlink:href="extracted__task_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/extracted_task.h">
<polygon fill="#ffffff" stroke="#000000" points="1903.5,-201.5 1903.5,-231.5 2055.5,-231.5 2055.5,-201.5 1903.5,-201.5"/>
<text text-anchor="start" x="1911.5" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
@@ -853,15 +853,15 @@
</a>
</g>
</g>
-<!-- Node116->Node128 -->
+<!-- Node117->Node129 -->
<g id="edge66" class="edge">
-<title>Node116->Node128</title>
+<title>Node117->Node129</title>
<path fill="none" stroke="#191970" d="M1816.9814,-265.5484C1852.1209,-254.9432 1896.1125,-241.6665 1929.4616,-231.6017"/>
<polygon fill="#191970" stroke="#191970" points="1815.8834,-262.2238 1807.3211,-268.4639 1817.9059,-268.9252 1815.8834,-262.2238"/>
</g>
-<!-- Node129 -->
+<!-- Node130 -->
<g id="node46" class="node">
-<title>Node129</title>
+<title>Node130</title>
<g id="a_node46"><a xlink:href="profiler_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/profiler.h">
<polygon fill="#ffffff" stroke="#000000" points="2073.5,-201.5 2073.5,-231.5 2225.5,-231.5 2225.5,-201.5 2073.5,-201.5"/>
<text text-anchor="start" x="2081.5" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
@@ -869,15 +869,15 @@
</a>
</g>
</g>
-<!-- Node116->Node129 -->
+<!-- Node117->Node130 -->
<g id="edge67" class="edge">
-<title>Node116->Node129</title>
+<title>Node117->Node130</title>
<path fill="none" stroke="#191970" d="M1821.1066,-268.8478C1822.5839,-268.5572 1824.05,-268.2741 1825.5,-268 1931.0516,-248.0495 1958.5374,-249.6374 2064.5,-232 2067.3114,-231.532 2070.1737,-231.0482 2073.0638,-230.5533"/>
<polygon fill="#191970" stroke="#191970" points="1820.3427,-265.4315 1811.2443,-270.8601 1821.7421,-272.2902 1820.3427,-265.4315"/>
</g>
-<!-- Node130 -->
+<!-- Node131 -->
<g id="node47" class="node">
-<title>Node130</title>
+<title>Node131</title>
<g id="a_node47"><a xlink:href="codegen_8h.html" target="_top" xlink:title="Translates IRModule to runtime::Module. ">
<polygon fill="#ffffff" stroke="#000000" points="2446,-201.5 2446,-231.5 2553,-231.5 2553,-201.5 2446,-201.5"/>
<text text-anchor="start" x="2454" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/target</text>
@@ -885,15 +885,15 @@
</a>
</g>
</g>
-<!-- Node116->Node130 -->
+<!-- Node117->Node131 -->
<g id="edge73" class="edge">
-<title>Node116->Node130</title>
+<title>Node117->Node131</title>
<path fill="none" stroke="#191970" d="M1821.0769,-268.6693C1822.5632,-268.4322 1824.0391,-268.2084 1825.5,-268 2092.603,-229.8895 2164.397,-270.1105 2431.5,-232 2436.1749,-231.333 2441.0036,-230.5095 2445.8308,-229.5926"/>
<polygon fill="#191970" stroke="#191970" points="1820.414,-265.232 1811.1692,-270.4074 1821.6236,-272.1267 1820.414,-265.232"/>
</g>
-<!-- Node131 -->
+<!-- Node132 -->
<g id="node48" class="node">
-<title>Node131</title>
+<title>Node132</title>
<g id="a_node48"><a xlink:href="generic__func_8h.html" target="_top" xlink:title="Generic function that can be specialzied on a per target basis. ">
<polygon fill="#ffffff" stroke="#ff0000" points="1425,-134.5 1425,-164.5 1532,-164.5 1532,-134.5 1425,-134.5"/>
<text text-anchor="start" x="1433" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/target</text>
@@ -901,15 +901,15 @@
</a>
</g>
</g>
-<!-- Node116->Node131 -->
+<!-- Node117->Node132 -->
<g id="edge74" class="edge">
-<title>Node116->Node131</title>
+<title>Node117->Node132</title>
<path fill="none" stroke="#191970" d="M1742.9237,-259.5916C1730.2789,-240.8729 1710.2913,-215.634 1686.5,-201 1660.9127,-185.2613 1584.8288,-168.8843 1532.1441,-158.938"/>
<polygon fill="#191970" stroke="#191970" points="1740.1316,-261.7176 1748.5257,-268.182 1745.995,-257.8938 1740.1316,-261.7176"/>
</g>
-<!-- Node132 -->
+<!-- Node133 -->
<g id="node49" class="node">
-<title>Node132</title>
+<title>Node133</title>
<g id="a_node49"><a xlink:href="virtual__device_8h.html" target="_top" xlink:title="A compile time representation for where data is to be stored at runtime, and how to compile code to c...">
<polygon fill="#ffffff" stroke="#ff0000" points="2244,-201.5 2244,-231.5 2351,-231.5 2351,-201.5 2244,-201.5"/>
<text text-anchor="start" x="2252" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/target</text>
@@ -917,66 +917,66 @@
</a>
</g>
</g>
-<!-- Node116->Node132 -->
+<!-- Node117->Node133 -->
<g id="edge77" class="edge">
-<title>Node116->Node132</title>
+<title>Node117->Node133</title>
<path fill="none" stroke="#191970" d="M1821.0854,-268.7258C1822.5691,-268.4717 1824.0422,-268.2292 1825.5,-268 2005.766,-239.6569 2054.4833,-261.8859 2234.5,-232 2237.6112,-231.4835 2240.7956,-230.8945 2243.9998,-230.2552"/>
<polygon fill="#191970" stroke="#191970" points="1820.39,-265.2949 1811.1907,-270.5506 1821.6597,-272.1788 1820.39,-265.2949"/>
</g>
-<!-- Node136 -->
+<!-- Node137 -->
<g id="node50" class="node">
-<title>Node136</title>
+<title>Node137</title>
<g id="a_node50"><a xlink:href="tir_2transform_8h.html" target="_top" xlink:title="TIR specific transformation passes. ">
<polygon fill="#ffffff" stroke="#000000" points="2571.5,-207 2571.5,-226 2719.5,-226 2719.5,-207 2571.5,-207"/>
<text text-anchor="middle" x="2645.5" y="-214" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/transform.h</text>
</a>
</g>
</g>
-<!-- Node116->Node136 -->
+<!-- Node117->Node137 -->
<g id="edge81" class="edge">
-<title>Node116->Node136</title>
+<title>Node117->Node137</title>
<path fill="none" stroke="#191970" d="M1821.074,-268.6483C1822.5611,-268.4175 1824.038,-268.2008 1825.5,-268 2150.3974,-223.3872 2236.785,-270.1891 2562.5,-232 2574.8752,-230.549 2588.1829,-228.3528 2600.4394,-226.0662"/>
<polygon fill="#191970" stroke="#191970" points="1820.4232,-265.2088 1811.1618,-270.3543 1821.6106,-272.1074 1820.4232,-265.2088"/>
</g>
-<!-- Node117->Node118 -->
+<!-- Node118->Node119 -->
<g id="edge61" class="edge">
-<title>Node117->Node118</title>
+<title>Node118->Node119</title>
<path fill="none" stroke="#191970" d="M350.9247,-196.3561C333.4917,-186.1995 312.6969,-174.0843 296.62,-164.7177"/>
<polygon fill="#191970" stroke="#191970" points="349.2893,-199.454 359.6918,-201.4639 352.8132,-193.4056 349.2893,-199.454"/>
</g>
-<!-- Node118->Node47 -->
+<!-- Node119->Node47 -->
<g id="edge62" class="edge">
-<title>Node118->Node47</title>
+<title>Node119->Node47</title>
<path fill="none" stroke="#191970" d="M270.5,-124.0249C270.5,-115.128 270.5,-105.4287 270.5,-97.6432"/>
<polygon fill="#191970" stroke="#191970" points="267.0001,-124.2966 270.5,-134.2967 274.0001,-124.2967 267.0001,-124.2966"/>
</g>
-<!-- Node122->Node125 -->
+<!-- Node123->Node126 -->
<g id="edge64" class="edge">
-<title>Node122->Node125</title>
+<title>Node123->Node126</title>
<path fill="none" stroke="#191970" d="M1249.4665,-193.3509C1256.8417,-183.8482 1265.1734,-173.1132 1271.7471,-164.6432"/>
<polygon fill="#191970" stroke="#191970" points="1246.6659,-191.2508 1243.2996,-201.2967 1252.1958,-195.5427 1246.6659,-191.2508"/>
</g>
-<!-- Node131->Node44 -->
+<!-- Node132->Node44 -->
<g id="edge75" class="edge">
-<title>Node131->Node44</title>
+<title>Node132->Node44</title>
<path fill="none" stroke="#191970" d="M1414.5862,-139.9456C1399.4925,-137.8414 1383.4435,-135.7276 1368.5,-134 1192.0641,-113.6026 1146.1242,-124.4917 970.5,-98 965.888,-97.3043 961.1295,-96.4978 956.3569,-95.6253"/>
<polygon fill="#191970" stroke="#191970" points="1414.4666,-143.4634 1424.8578,-141.396 1415.4454,-136.5321 1414.4666,-143.4634"/>
</g>
-<!-- Node132->Node26 -->
+<!-- Node133->Node26 -->
<g id="edge79" class="edge">
-<title>Node132->Node26</title>
+<title>Node133->Node26</title>
<path fill="none" stroke="#191970" d="M2233.7721,-201.0494C2038.1577,-169.3283 1817.7498,-156.6839 1709.0664,-151.9675"/>
<polygon fill="#191970" stroke="#191970" points="2233.2223,-204.506 2243.6572,-202.6723 2234.3564,-197.5984 2233.2223,-204.506"/>
</g>
-<!-- Node132->Node33 -->
+<!-- Node133->Node33 -->
<g id="edge78" class="edge">
-<title>Node132->Node33</title>
+<title>Node133->Node33</title>
<path fill="none" stroke="#191970" d="M2308.0748,-192.0686C2319.9728,-164.58 2338.934,-120.7732 2348.8943,-97.7614"/>
<polygon fill="#191970" stroke="#191970" points="2304.8009,-190.8214 2304.0406,-201.389 2311.2249,-193.602 2304.8009,-190.8214"/>
</g>
-<!-- Node132->Node35 -->
+<!-- Node133->Node35 -->
<g id="edge80" class="edge">
-<title>Node132->Node35</title>
+<title>Node133->Node35</title>
<path fill="none" stroke="#191970" d="M2361.1741,-201.6056C2444.4245,-180.6939 2583.1397,-140.4142 2612.5,-98 2620.3418,-86.6716 2621.6844,-77.27 2612.5,-67 2587.7412,-39.3146 2358.8402,-24.7576 2233.5593,-18.8058"/>
<polygon fill="#191970" stroke="#191970" points="2360.1677,-198.2492 2351.3077,-204.0586 2361.8567,-205.0423 2360.1677,-198.2492"/>
</g>
diff --git a/docs/reference/api/doxygen/bound_8h.html b/docs/reference/api/doxygen/bound_8h.html
index bf1e1d53a0..e8e05a5ee4 100644
--- a/docs/reference/api/doxygen/bound_8h.html
+++ b/docs/reference/api/doxygen/bound_8h.html
@@ -86,7 +86,7 @@ Include dependency graph for bound.h:</div>
</div><div class="textblock"><div class="dynheader">
This graph shows which files directly or indirectly include this file:</div>
<div class="dyncontent">
-<div class="center"><iframe scrolling="no" frameborder="0" src="bound_8h__dep__incl.svg" width="5047" height="812"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
+<div class="center"><iframe scrolling="no" frameborder="0" src="bound_8h__dep__incl.svg" width="5311" height="812"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
</div>
</div>
</div>
diff --git a/docs/reference/api/doxygen/bound_8h__dep__incl.svg b/docs/reference/api/doxygen/bound_8h__dep__incl.svg
index c8dab31813..9e83bc95cc 100644
--- a/docs/reference/api/doxygen/bound_8h__dep__incl.svg
+++ b/docs/reference/api/doxygen/bound_8h__dep__incl.svg
@@ -4,1139 +4,1139 @@
<!-- Generated by graphviz version 2.40.1 (20161225.0304)
-->
<!-- Title: include/tvm/arith/bound.h Pages: 1 -->
-<svg width="3785pt" height="609pt"
- viewBox="0.00 0.00 3785.00 609.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<svg width="3983pt" height="609pt"
+ viewBox="0.00 0.00 3982.50 609.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 605)">
<title>include/tvm/arith/bound.h</title>
-<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-605 3781,-605 3781,4 -4,4"/>
+<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-605 3978.5,-605 3978.5,4 -4,4"/>
<!-- Node55 -->
<g id="node1" class="node">
<title>Node55</title>
-<polygon fill="#bfbfbf" stroke="#000000" points="306,-581.5 306,-600.5 448,-600.5 448,-581.5 306,-581.5"/>
-<text text-anchor="middle" x="377" y="-588.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/arith/bound.h</text>
+<polygon fill="#bfbfbf" stroke="#000000" points="244.5,-581.5 244.5,-600.5 386.5,-600.5 386.5,-581.5 244.5,-581.5"/>
+<text text-anchor="middle" x="315.5" y="-588.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/arith/bound.h</text>
</g>
<!-- Node56 -->
<g id="node2" class="node">
<title>Node56</title>
<g id="a_node2"><a xlink:href="tensor_8h.html" target="_top" xlink:title="Dataflow tensor object. ">
-<polygon fill="#ffffff" stroke="#000000" points="312.5,-525.5 312.5,-544.5 441.5,-544.5 441.5,-525.5 312.5,-525.5"/>
-<text text-anchor="middle" x="377" y="-532.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/te/tensor.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="251,-525.5 251,-544.5 380,-544.5 380,-525.5 251,-525.5"/>
+<text text-anchor="middle" x="315.5" y="-532.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/te/tensor.h</text>
</a>
</g>
</g>
<!-- Node55->Node56 -->
<g id="edge1" class="edge">
<title>Node55->Node56</title>
-<path fill="none" stroke="#191970" d="M377,-571.1575C377,-562.155 377,-551.9199 377,-544.6427"/>
-<polygon fill="#191970" stroke="#191970" points="373.5001,-571.2455 377,-581.2455 380.5001,-571.2456 373.5001,-571.2455"/>
+<path fill="none" stroke="#191970" d="M315.5,-571.1575C315.5,-562.155 315.5,-551.9199 315.5,-544.6427"/>
+<polygon fill="#191970" stroke="#191970" points="312.0001,-571.2455 315.5,-581.2455 319.0001,-571.2456 312.0001,-571.2455"/>
</g>
<!-- Node57 -->
<g id="node3" class="node">
<title>Node57</title>
<g id="a_node3"><a xlink:href="relay_2op__attr__types_8h.html" target="_top" xlink:title="The Expr and related elements in DataFlow construction. ">
-<polygon fill="#ffffff" stroke="#000000" points="136.5,-335.5 136.5,-365.5 253.5,-365.5 253.5,-335.5 136.5,-335.5"/>
-<text text-anchor="start" x="144.5" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/relay/op</text>
-<text text-anchor="middle" x="195" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_attr_types.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="276,-335.5 276,-365.5 393,-365.5 393,-335.5 276,-335.5"/>
+<text text-anchor="start" x="284" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/relay/op</text>
+<text text-anchor="middle" x="334.5" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_attr_types.h</text>
</a>
</g>
</g>
<!-- Node56->Node57 -->
<g id="edge2" class="edge">
<title>Node56->Node57</title>
-<path fill="none" stroke="#191970" d="M327.7957,-522.2545C307.4632,-515.0842 284.7762,-504.3962 268,-489 229.5509,-453.7135 207.7129,-393.4505 199.197,-365.5244"/>
-<polygon fill="#191970" stroke="#191970" points="326.9583,-525.6644 337.5518,-525.4971 329.1661,-519.0217 326.9583,-525.6644"/>
+<path fill="none" stroke="#191970" d="M317.5285,-515.3025C321.3414,-478.277 329.5017,-399.0364 332.9309,-365.7363"/>
+<polygon fill="#191970" stroke="#191970" points="314.0377,-515.0343 316.4948,-525.3402 321.0008,-515.7514 314.0377,-515.0343"/>
</g>
<!-- Node58 -->
<g id="node4" class="node">
<title>Node58</title>
<g id="a_node4"><a xlink:href="op__strategy_8h.html" target="_top" xlink:title="The Relay operator Strategy and related data structure. ">
-<polygon fill="#ffffff" stroke="#000000" points="180.5,-268.5 180.5,-298.5 297.5,-298.5 297.5,-268.5 180.5,-268.5"/>
-<text text-anchor="start" x="188.5" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/relay/op</text>
-<text text-anchor="middle" x="239" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_strategy.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="0,-268.5 0,-298.5 117,-298.5 117,-268.5 0,-268.5"/>
+<text text-anchor="start" x="8" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/relay/op</text>
+<text text-anchor="middle" x="58.5" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_strategy.h</text>
</a>
</g>
</g>
<!-- Node56->Node58 -->
<g id="edge5" class="edge">
<title>Node56->Node58</title>
-<path fill="none" stroke="#191970" d="M364.2324,-517.075C358.4397,-508.6902 351.6395,-498.4825 346,-489 304.7762,-419.6838 262.491,-332.8897 246.094,-298.5143"/>
-<polygon fill="#191970" stroke="#191970" points="361.4161,-519.1549 370.0201,-525.3373 367.1494,-515.1388 361.4161,-519.1549"/>
+<path fill="none" stroke="#191970" d="M249.0263,-523.5709C206.0431,-515.1115 154.968,-502.7145 137.5,-489 75.719,-440.4944 62.2192,-337.4424 59.2973,-298.6727"/>
+<polygon fill="#191970" stroke="#191970" points="248.4221,-527.0187 258.9038,-525.4747 249.747,-520.1453 248.4221,-527.0187"/>
</g>
<!-- Node61 -->
<g id="node6" class="node">
<title>Node61</title>
<g id="a_node6"><a xlink:href="autodiff_8h.html" target="_top" xlink:title="Automatic differentiation of tensor expressions. ">
-<polygon fill="#ffffff" stroke="#000000" points="544.5,-464 544.5,-483 681.5,-483 681.5,-464 544.5,-464"/>
-<text text-anchor="middle" x="613" y="-471" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/te/autodiff.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="401,-464 401,-483 538,-483 538,-464 401,-464"/>
+<text text-anchor="middle" x="469.5" y="-471" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/te/autodiff.h</text>
</a>
</g>
</g>
<!-- Node56->Node61 -->
<g id="edge6" class="edge">
<title>Node56->Node61</title>
-<path fill="none" stroke="#191970" d="M423.3774,-522.9144C468.1657,-511.2428 535.0972,-493.8009 576.298,-483.0643"/>
-<polygon fill="#191970" stroke="#191970" points="422.4103,-519.5494 413.6161,-525.4581 424.1755,-526.3232 422.4103,-519.5494"/>
+<path fill="none" stroke="#191970" d="M348.935,-521.6477C378.0959,-510.0023 419.6812,-493.3952 445.6065,-483.0419"/>
+<polygon fill="#191970" stroke="#191970" points="347.5513,-518.4315 339.5625,-525.3906 350.1474,-524.9323 347.5513,-518.4315"/>
</g>
<!-- Node62 -->
<g id="node7" class="node">
<title>Node62</title>
<g id="a_node7"><a xlink:href="operation_8h.html" target="_top" xlink:title="Operation node can generate one or multiple Tensors. ">
-<polygon fill="#ffffff" stroke="#000000" points="2064,-341 2064,-360 2210,-360 2210,-341 2064,-341"/>
-<text text-anchor="middle" x="2137" y="-348" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/te/operation.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2240.5,-341 2240.5,-360 2386.5,-360 2386.5,-341 2240.5,-341"/>
+<text text-anchor="middle" x="2313.5" y="-348" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/te/operation.h</text>
</a>
</g>
</g>
<!-- Node56->Node62 -->
<g id="edge7" class="edge">
<title>Node56->Node62</title>
-<path fill="none" stroke="#191970" d="M415.8661,-522.022C438.6432,-513.7922 467.6334,-502.2212 492,-489 513.026,-477.5915 513.451,-465.9868 536,-458 681.0822,-406.6126 1768.8234,-363.8339 2063.5407,-353.0971"/>
-<polygon fill="#191970" stroke="#191970" points="414.4528,-518.8096 406.1976,-525.4506 416.7924,-525.4071 414.4528,-518.8096"/>
+<path fill="none" stroke="#191970" d="M327.3686,-517.0139C340.7319,-498.494 364.3027,-470.6141 392.5,-458 563.4874,-381.5083 1909.7016,-356.5293 2240.3487,-351.5175"/>
+<polygon fill="#191970" stroke="#191970" points="324.3197,-515.2687 321.4965,-525.4804 330.0716,-519.2581 324.3197,-515.2687"/>
</g>
-<!-- Node106 -->
-<g id="node47" class="node">
-<title>Node106</title>
-<g id="a_node47"><a xlink:href="te_2schedule_8h.html" target="_top" xlink:title="Define a schedule. ">
-<polygon fill="#ffffff" stroke="#ff0000" points="344.5,-402.5 344.5,-421.5 487.5,-421.5 487.5,-402.5 344.5,-402.5"/>
-<text text-anchor="middle" x="416" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/te/schedule.h</text>
+<!-- Node107 -->
+<g id="node48" class="node">
+<title>Node107</title>
+<g id="a_node48"><a xlink:href="te_2schedule_8h.html" target="_top" xlink:title="Define a schedule. ">
+<polygon fill="#ffffff" stroke="#ff0000" points="149,-402.5 149,-421.5 292,-421.5 292,-402.5 149,-402.5"/>
+<text text-anchor="middle" x="220.5" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/te/schedule.h</text>
</a>
</g>
</g>
-<!-- Node56->Node106 -->
-<g id="edge100" class="edge">
-<title>Node56->Node106</title>
-<path fill="none" stroke="#191970" d="M363.7539,-516.9043C353.7611,-500.8661 343.2979,-477.0971 353,-458 361.5263,-441.2172 379.3669,-429.098 393.8934,-421.5258"/>
-<polygon fill="#191970" stroke="#191970" points="360.9492,-519.006 369.4271,-525.3602 366.7621,-515.106 360.9492,-519.006"/>
+<!-- Node56->Node107 -->
+<g id="edge101" class="edge">
+<title>Node56->Node107</title>
+<path fill="none" stroke="#191970" d="M308.3358,-515.6533C301.6727,-499.2497 290.6511,-475.6442 276.5,-458 264.6922,-443.2776 247.4956,-430.0151 235.2791,-421.5563"/>
+<polygon fill="#191970" stroke="#191970" points="305.1852,-517.2084 312.0741,-525.2578 311.7085,-514.6693 305.1852,-517.2084"/>
</g>
-<!-- Node119 -->
+<!-- Node120 -->
<g id="node50" class="node">
-<title>Node119</title>
+<title>Node120</title>
<g id="a_node50"><a xlink:href="tensor__intrin_8h.html" target="_top" xlink:title="Tensor intrinsic operations. ">
-<polygon fill="#ffffff" stroke="#000000" points="361.5,-458.5 361.5,-488.5 482.5,-488.5 482.5,-458.5 361.5,-458.5"/>
-<text text-anchor="start" x="369.5" y="-476.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/te/tensor</text>
-<text text-anchor="middle" x="422" y="-465.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_intrin.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="147,-458.5 147,-488.5 268,-488.5 268,-458.5 147,-458.5"/>
+<text text-anchor="start" x="155" y="-476.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/te/tensor</text>
+<text text-anchor="middle" x="207.5" y="-465.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_intrin.h</text>
</a>
</g>
</g>
-<!-- Node56->Node119 -->
+<!-- Node56->Node120 -->
<g id="edge106" class="edge">
-<title>Node56->Node119</title>
-<path fill="none" stroke="#191970" d="M390.1066,-517.0877C396.7336,-508.0307 404.6858,-497.1627 410.9918,-488.5446"/>
-<polygon fill="#191970" stroke="#191970" points="387.1118,-515.2535 384.0313,-525.3906 392.761,-519.3871 387.1118,-515.2535"/>
+<title>Node56->Node120</title>
+<path fill="none" stroke="#191970" d="M289.8456,-520.3912C272.9723,-510.7829 250.9459,-498.24 233.9198,-488.5446"/>
+<polygon fill="#191970" stroke="#191970" points="288.2032,-523.4836 298.625,-525.3906 291.6671,-517.4007 288.2032,-523.4836"/>
</g>
<!-- Node57->Node58 -->
<g id="edge3" class="edge">
<title>Node57->Node58</title>
-<path fill="none" stroke="#191970" d="M210.5816,-326.7735C216.7272,-317.4154 223.6052,-306.9421 229.0552,-298.6432"/>
-<polygon fill="#191970" stroke="#191970" points="207.5481,-325.0167 204.9843,-335.2967 213.3992,-328.8592 207.5481,-325.0167"/>
+<path fill="none" stroke="#191970" d="M266.1482,-333.9074C220.4752,-322.8201 161.0074,-308.3841 117.4232,-297.8038"/>
+<polygon fill="#191970" stroke="#191970" points="265.424,-337.3331 275.9675,-336.291 267.0754,-330.5307 265.424,-337.3331"/>
</g>
<!-- Node59 -->
<g id="node5" class="node">
<title>Node59</title>
<g id="a_node5"><a xlink:href="relay_2transform_8h.html" target="_top" xlink:title="Relay specific transformation passes. ">
-<polygon fill="#ffffff" stroke="#ff0000" points="0,-274 0,-293 162,-293 162,-274 0,-274"/>
-<text text-anchor="middle" x="81" y="-281" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/relay/transform.h</text>
+<polygon fill="#ffffff" stroke="#ff0000" points="135.5,-274 135.5,-293 297.5,-293 297.5,-274 135.5,-274"/>
+<text text-anchor="middle" x="216.5" y="-281" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/relay/transform.h</text>
</a>
</g>
</g>
<!-- Node57->Node59 -->
<g id="edge4" class="edge">
<title>Node57->Node59</title>
-<path fill="none" stroke="#191970" d="M160.7588,-330.3758C139.7558,-318.0319 113.82,-302.789 97.341,-293.1039"/>
-<polygon fill="#191970" stroke="#191970" points="159.0214,-333.4144 169.4162,-335.4639 162.5683,-327.3795 159.0214,-333.4144"/>
+<path fill="none" stroke="#191970" d="M299.0574,-330.3758C277.3174,-318.0319 250.4716,-302.789 233.4144,-293.1039"/>
+<polygon fill="#191970" stroke="#191970" points="297.5943,-333.5699 308.0185,-335.4639 301.0506,-327.4827 297.5943,-333.5699"/>
</g>
<!-- Node63 -->
<g id="node8" class="node">
<title>Node63</title>
<g id="a_node8"><a xlink:href="cublas_8h.html" target="_top" xlink:title="External function interface to cuBLAS libraries. ">
-<polygon fill="#ffffff" stroke="#000000" points="2159.5,-201.5 2159.5,-231.5 2292.5,-231.5 2292.5,-201.5 2159.5,-201.5"/>
-<text text-anchor="start" x="2167.5" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/contrib</text>
-<text text-anchor="middle" x="2226" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/cublas.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="544,-201.5 544,-231.5 677,-231.5 677,-201.5 544,-201.5"/>
+<text text-anchor="start" x="552" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/contrib</text>
+<text text-anchor="middle" x="610.5" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/cublas.h</text>
</a>
</g>
</g>
<!-- Node62->Node63 -->
<g id="edge8" class="edge">
<title>Node62->Node63</title>
-<path fill="none" stroke="#191970" d="M2138.0333,-330.7094C2139.7867,-312.9897 2144.4808,-286.9539 2157,-268 2167.0508,-252.7832 2183.1927,-240.3737 2197.3374,-231.5516"/>
-<polygon fill="#191970" stroke="#191970" points="2134.5372,-330.5249 2137.2653,-340.7625 2141.5169,-331.0582 2134.5372,-330.5249"/>
+<path fill="none" stroke="#191970" d="M2230.3786,-349.2627C1912.8769,-344.3494 790.8497,-325.2026 719.5,-299 698.7152,-291.3669 698.0873,-281.452 680.5,-268 664.2019,-255.5341 645.4647,-241.7785 631.4797,-231.6218"/>
+<polygon fill="#191970" stroke="#191970" points="2230.3819,-352.763 2240.4347,-349.4177 2230.4899,-345.7639 2230.3819,-352.763"/>
</g>
<!-- Node64 -->
<g id="node9" class="node">
<title>Node64</title>
<g id="a_node9"><a xlink:href="cuda_2dense_8h.html" target="_top" xlink:title="CUDA schedule for dense operation. ">
-<polygon fill="#ffffff" stroke="#000000" points="1780,-67.5 1780,-97.5 1902,-97.5 1902,-67.5 1780,-67.5"/>
... 55808 lines suppressed ...