You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by tq...@apache.org on 2022/06/22 21:28:37 UTC
[tvm-site] branch asf-site updated: deploying docs (apache/tvm@c334790bf88694db8d748d2299f50f2b04c46486)
This is an automated email from the ASF dual-hosted git repository.
tqchen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/tvm-site.git
The following commit(s) were added to refs/heads/asf-site by this push:
new ad7a1837c deploying docs (apache/tvm@c334790bf88694db8d748d2299f50f2b04c46486)
ad7a1837c is described below
commit ad7a1837cc5a6272774c07c7647945d3a39329a0
Author: tvm-bot <95...@users.noreply.github.com>
AuthorDate: Wed Jun 22 21:28:32 2022 +0000
deploying docs (apache/tvm@c334790bf88694db8d748d2299f50f2b04c46486)
---
.../how_to/compile_models/from_mxnet.rst.txt | 2 +-
.../how_to/compile_models/from_oneflow.rst.txt | 2 +-
.../how_to/compile_models/from_paddle.rst.txt | 2 +-
.../how_to/compile_models/from_pytorch.rst.txt | 2 +-
.../how_to/compile_models/from_tensorflow.rst.txt | 2 +-
.../compile_models/sg_execution_times.rst.txt | 22 +-
.../deploy_models/deploy_model_on_android.rst.txt | 2 +-
.../deploy_object_detection_pytorch.rst.txt | 4 +-
.../deploy_models/deploy_prequantized.rst.txt | 6 +-
.../deploy_prequantized_tflite.rst.txt | 4 +-
.../how_to/deploy_models/deploy_quantized.rst.txt | 2 +-
.../deploy_models/deploy_ssd_gluoncv.rst.txt | 4 +-
.../deploy_models/sg_execution_times.rst.txt | 16 +-
.../extend_tvm/bring_your_own_datatypes.rst.txt | 2 +-
.../how_to/extend_tvm/sg_execution_times.rst.txt | 8 +-
.../how_to/extend_tvm/use_pass_instrument.rst.txt | 16 +-
.../optimize_operators/opt_conv_cuda.rst.txt | 2 +-
.../optimize_operators/opt_conv_tensorcore.rst.txt | 2 +-
.../how_to/optimize_operators/opt_gemm.rst.txt | 16 +-
.../optimize_operators/sg_execution_times.rst.txt | 8 +-
.../sg_execution_times.rst.txt | 14 +-
.../tune_conv2d_layer_cuda.rst.txt | 2790 +++++++++++++++++++-
.../tune_network_cuda.rst.txt | 2 +-
.../tune_network_x86.rst.txt | 4 +-
.../tune_sparse_x86.rst.txt | 119 +-
.../tune_with_autotvm/sg_execution_times.rst.txt | 6 +-
.../tune_with_autotvm/tune_conv2d_cuda.rst.txt | 34 +-
.../work_with_microtvm/micro_autotune.rst.txt | 16 +-
.../how_to/work_with_microtvm/micro_train.rst.txt | 16 +-
.../work_with_microtvm/sg_execution_times.rst.txt | 8 +-
.../work_with_relay/sg_execution_times.rst.txt | 6 +-
.../how_to/work_with_schedules/intrin_math.rst.txt | 2 +-
.../work_with_schedules/sg_execution_times.rst.txt | 14 +-
.../how_to/work_with_schedules/tensorize.rst.txt | 2 +-
.../tutorials/autotvm/sg_execution_times.rst.txt | 4 +-
.../frontend/deploy_classification.rst.txt | 2 +-
.../tutorials/frontend/deploy_detection.rst.txt | 2 +-
.../tutorials/frontend/sg_execution_times.rst.txt | 6 +-
.../tutorials/optimize/sg_execution_times.rst.txt | 4 +-
.../topic/vta/tutorials/sg_execution_times.rst.txt | 6 +-
.../tutorial/auto_scheduler_matmul_x86.rst.txt | 11 +-
docs/_sources/tutorial/autotvm_matmul_x86.rst.txt | 20 +-
docs/_sources/tutorial/autotvm_relay_x86.rst.txt | 54 +-
.../tutorial/cross_compilation_and_rpc.rst.txt | 2 +-
docs/_sources/tutorial/intro_topi.rst.txt | 2 +-
docs/_sources/tutorial/sg_execution_times.rst.txt | 22 +-
.../tutorial/tensor_expr_get_started.rst.txt | 44 +-
docs/commit_hash | 2 +-
docs/how_to/compile_models/from_mxnet.html | 2 +-
docs/how_to/compile_models/from_oneflow.html | 236 +-
docs/how_to/compile_models/from_paddle.html | 2 +-
docs/how_to/compile_models/from_pytorch.html | 24 +-
docs/how_to/compile_models/from_tensorflow.html | 2 +-
docs/how_to/compile_models/sg_execution_times.html | 22 +-
.../deploy_models/deploy_model_on_android.html | 2 +-
.../deploy_object_detection_pytorch.html | 22 +-
docs/how_to/deploy_models/deploy_prequantized.html | 7 +-
.../deploy_models/deploy_prequantized_tflite.html | 4 +-
docs/how_to/deploy_models/deploy_quantized.html | 2 +-
docs/how_to/deploy_models/deploy_ssd_gluoncv.html | 37 +-
docs/how_to/deploy_models/sg_execution_times.html | 16 +-
.../extend_tvm/bring_your_own_datatypes.html | 2 +-
docs/how_to/extend_tvm/sg_execution_times.html | 8 +-
docs/how_to/extend_tvm/use_pass_instrument.html | 16 +-
docs/how_to/optimize_operators/opt_conv_cuda.html | 2 +-
.../optimize_operators/opt_conv_tensorcore.html | 2 +-
docs/how_to/optimize_operators/opt_gemm.html | 16 +-
.../optimize_operators/sg_execution_times.html | 8 +-
.../sg_execution_times.html | 14 +-
.../tune_conv2d_layer_cuda.html | 2790 +++++++++++++++++++-
.../tune_with_autoscheduler/tune_network_cuda.html | 2 +-
.../tune_with_autoscheduler/tune_network_x86.html | 4 +-
.../tune_with_autoscheduler/tune_sparse_x86.html | 119 +-
.../tune_with_autotvm/sg_execution_times.html | 6 +-
.../how_to/tune_with_autotvm/tune_conv2d_cuda.html | 34 +-
docs/how_to/work_with_microtvm/micro_autotune.html | 16 +-
docs/how_to/work_with_microtvm/micro_train.html | 16 +-
.../work_with_microtvm/sg_execution_times.html | 8 +-
.../how_to/work_with_relay/sg_execution_times.html | 6 +-
docs/how_to/work_with_schedules/intrin_math.html | 2 +-
.../work_with_schedules/sg_execution_times.html | 14 +-
docs/how_to/work_with_schedules/tensorize.html | 2 +-
docs/reference/api/python/auto_scheduler.html | 4 +-
.../api/typedoc/classes/bytestreamreader.html | 12 +-
.../api/typedoc/classes/cachedcallstack.html | 34 +-
docs/reference/api/typedoc/classes/dldatatype.html | 12 +-
docs/reference/api/typedoc/classes/dldevice.html | 10 +-
.../reference/api/typedoc/classes/environment.html | 12 +-
docs/reference/api/typedoc/classes/ffilibrary.html | 20 +-
.../api/typedoc/classes/graphexecutor.html | 16 +-
docs/reference/api/typedoc/classes/instance.html | 40 +-
docs/reference/api/typedoc/classes/memory.html | 34 +-
docs/reference/api/typedoc/classes/module.html | 10 +-
docs/reference/api/typedoc/classes/ndarray.html | 22 +-
.../api/typedoc/classes/packedfunccell.html | 6 +-
docs/reference/api/typedoc/classes/rpcserver.html | 14 +-
docs/reference/api/typedoc/classes/scalar.html | 6 +-
.../api/typedoc/classes/webgpucontext.html | 12 +-
docs/reference/api/typedoc/enums/argtypecode.html | 30 +-
.../api/typedoc/enums/aynccallbackcode.html | 4 +-
.../api/typedoc/enums/dldatatypecode.html | 8 +-
.../api/typedoc/enums/rpcserverstate.html | 12 +-
docs/reference/api/typedoc/enums/sizeof.html | 18 +-
docs/reference/api/typedoc/index.html | 112 +-
.../api/typedoc/interfaces/disposable.html | 2 +-
.../api/typedoc/interfaces/functioninfo.html | 6 +-
.../api/typedoc/interfaces/libraryprovider.html | 4 +-
docs/searchindex.js | 2 +-
.../vta/tutorials/autotvm/sg_execution_times.html | 4 +-
.../tutorials/frontend/deploy_classification.html | 2 +-
.../vta/tutorials/frontend/deploy_detection.html | 2 +-
.../vta/tutorials/frontend/sg_execution_times.html | 6 +-
.../vta/tutorials/optimize/sg_execution_times.html | 4 +-
docs/topic/vta/tutorials/sg_execution_times.html | 6 +-
docs/tutorial/auto_scheduler_matmul_x86.html | 7 +-
docs/tutorial/autotvm_matmul_x86.html | 20 +-
docs/tutorial/autotvm_relay_x86.html | 258 +-
docs/tutorial/cross_compilation_and_rpc.html | 2 +-
docs/tutorial/intro_topi.html | 2 +-
docs/tutorial/sg_execution_times.html | 26 +-
docs/tutorial/tensor_expr_get_started.html | 44 +-
121 files changed, 6428 insertions(+), 1220 deletions(-)
diff --git a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
index f327c91ce..2ecd0219d 100644
--- a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
@@ -114,7 +114,7 @@ In this section, we download a pretrained imagenet model and classify an image.
.. code-block:: none
- Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip812ab175-3adc-45fc-a776-1bd65330f280 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+ Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip2cde5cc7-4677-4c09-92ea-a4a047eb10a2 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
x (1, 3, 224, 224)
diff --git a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
index dc3846153..d09c0e5df 100644
--- a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
@@ -112,7 +112,7 @@ Load a pretrained OneFlow model and save model
.. code-block:: none
Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
-
0%| | 0.00/41.5M [00:00<?, ?B/s]
0%| | 16.0k/41.5M [00:00<07:57, 91.1kB/s]
0%| | 40.0k/41.5M [00:00<06:09, 117kB/s]
0%| | 72.0k/41.5M [00:00<04:56, 147kB/s]
0%| | 96.0k/41.5M [00:00<05:05, 142kB/s]
0%| | 128k/41.5M [00:00<04:37, 156kB/s]
0%| | 160k/41.5M [00:01<04:23, 165kB/s]
0%| | 192k/41.5M [00:01<04:14, 170kB/s]
1%| | 232k/41.5M [00:01<03:50, 188kB/s]
1%| | 264k/41.5M [00:01<03:52, 186kB/s]
1%| | 304k/41.5M [00:01<03:37, 198kB/s]
1%| | 352k/41.5M [00:01<03:15, 221kB/s]
1%| | 392k/41.5M [00:02<03:13, 222kB/s]
1%|1 | 440k/41.5M [00:02<03:01, 237kB/s]
1%|1 | 488k/41.5M [00:02<02:53, 248kB/s]
1%|1 | 536k/41.5M [00:02<02:48, 255kB/s]
1%|1 | 584k/41.5M [00:02<02:45, 260kB/s]
2%|1 | 640k/41.5M [00:03<02:34, 277kB/s]
2%|1 | 696k/41.5M [00:03<02:28, 289kB/s]
2%|1 | 760k/41.5M [00:03<02:17, 311kB/s]
2%|1 | 824k/41.5M [00:03<02:10, 326kB/s]
2%|2 | 888k/41.5M [00:03<02:06, 337kB/s]
2%|2 | 960k/41.5M [00:03<01:58, 358kB/s]
2%|2 | 1.01M/41.5M [00:04<01:53, 373kB/s]
3%|2 | 1.09M/41.5M [00:04<01:46, 397kB/s]
3%|2 | 1.16M/41.5M [00:04<01:42, 413kB/s]
3%|2 | 1.24M/41.5M [00:04<01:39, 425kB/s]
3%|3 | 1.33M/41.5M [00:04<01:34, 447kB/s]
3%|3 | 1.42M/41.5M [00:05<01:28, 476kB/s]
4%|3 | 1.52M/41.5M [00:05<01:24, 496kB/s]
4%|3 | 1.62M/41.5M [00:05<01:19, 524kB/s]
4%|4 | 1.73M/41.5M [00:05<01:14, 557kB/s]
4%|4 | 1.84M/41.5M [00:05<01:11, 580kB/s]
5%|4 | 1.95M/41.5M [00:05<01:08, 610kB/s]
5%|5 | 2.09M/41.5M [00:06<01:02, 658kB/s]
5%|5 | 2.23M/41.5M [00:06<00:58, 705kB/s]
6%|5 | 2.38M/41.5M [00:06<00:54, 751kB/s]
6%|6 | 2.54M/41.5M [00:06<00:50, 811kB/s]
7%|6 | 2.71M/41.5M [00:06<00:46, 866kB/s]
7%|6 | 2.90M/41.5M [00:07<00:43, 932kB/s]
7%|7 | 3.09M/41.5M [00:07<00:40, 992kB/s]
8%|7 | 3.31M/41.5M [00:07<00:37, 1.07MB/s]
9%|8 | 3.54M/41.5M [00:07<00:34, 1.15MB/s]
9%|9 | 3.78M/41.5M [00:07<00:32, 1.22MB/s]
10%|9 | 4.04M/41.5M [00:07<00:30, 1.30MB/s]
10%|# | 4.31M/41.5M [00:08<00:28, 1.39MB/s]
11%|#1 | 4.61M/41.5M [00:08<00:25, 1.49MB/s]
12%|#1 | 4.91M/41.5M [00:08<00:22, 1.74MB/s]
13%|#2 | 5.24M/41.5M [00:08<00:19, 1.93MB/s]
13%|#3 | 5.44M/41.5M [00:08<00:19, 1.95MB/s]
14%|#3 | 5.63M/41.5M [00:08<00:22, 1.66MB/s]
14%|#4 | 5.98M/41.5M [00:08<00:18, 1.99MB/s]
15%|#5 | 6.37M/41.5M [00:09<00:16, 2.27MB/s]
16%|#5 | 6.59M/41.5M
[00:09<00:16, 2.29MB/s]
16%|#6 | 6.82M/41.5M [00:09<00:18, 1.93MB/s]
17%|#7 | 7.23M/41.5M [00:09<00:15, 2.36MB/s]
19%|#8 | 7.69M/41.5M [00:09<00:12, 2.91MB/s]
19%|#9 | 7.99M/41.5M [00:09<00:12, 2.71MB/s]
20%|#9 | 8.27M/41.5M [00:09<00:15, 2.32MB/s]
21%|##1 | 8.72M/41.5M [00:10<00:12, 2.72MB/s]
22%|##2 | 9.26M/41.5M [00:10<00:09, 3.40MB/s]
23%|##3 | 9.62M/41.5M [00:10<00:10, 3.16MB/s]
24%|##3 | 9.95M/41.5M [00:10<00:12, 2.71MB/s]
25%|##5 | 10.5M/41.5M [00:10<00:11, 2.92MB/s]
27%|##6 | 11.1M/41.5M [00:10<00:08, 3.61MB/s]
28%|##8 | 11.8M/41.5M [00:10<00:07, 4.27MB/s]
29%|##9 | 12.2M/41.5M [00:11<00:08, 3.82MB/s]
30%|### | 12.6M/41.5M [00:11<00:09, 3.28MB/s]
32%|###1 | 13.2M/41.5M [00:11<00:08, 3.58MB/s]
34%|###3 | 14.0M/41.5M [00:11<00:06, 4.40MB/s]
36%|###5 | 14.8M/41.5M [00:11<00:05, 5.20MB/s]
37%|###7 | 15.4M/41.5M [00:11<00:05, 4.65MB/s]
38%|###8 | 15.8M/41.5M [00:11<00:06, 4.01MB/s]
40%|###9 | 16.6M/41.5M [00:12<00:05, 4.86MB/s]
42%|####2 | 17.5M/41.5M [00:12<00:04, 5.71MB/s]
44%|####3 | 18.1M/41.5M [00:12<00:04, 5.20MB/s]
45%|####4 | 18.6M/41.5M [00:12<00:05, 4.49MB/s]
47%|####7 | 19.5M/41.5M [00:12<00:04, 5.68MB/s]
50%|####9 | 20.6M/41.5M [00:12<00:03, 6.64MB/s]
51%|#####1 | 21.2M/41.5M [00:12<00:03, 6.02MB/s]
53%|#####2 | 21.9M/41.5M [00:13<00:03, 5.19MB/s]
55%|#####5 | 23.0M/41.5M [00:13<00:02, 6.58MB/s]
58%|#####8 | 24.1M/41.5M [00:13<00:02, 7.67MB/s]
60%|###### | 24.9M/41.5M [00:13<00:02, 6.96MB/s]
62%|######1 | 25.6M/41.5M [00:13<00:02, 6.02MB/s]
65%|######4 | 26.9M/41.5M [00:13<00:02, 7.55MB/s]
68%|######8 | 28.2M/41.5M [00:13<00:01, 8.86MB/s]
70%|####### | 29.2M/41.5M [00:13<00:01, 8.01MB/s]
72%|#######2 | 30.0M/41.5M
[00:14<00:01, 6.94MB/s]
75%|#######5 | 31.3M/41.5M [00:14<00:01, 8.35MB/s]
79%|#######8 | 32.6M/41.5M [00:14<00:00, 9.59MB/s]
81%|########1 | 33.6M/41.5M [00:14<00:00, 8.55MB/s]
83%|########3 | 34.5M/41.5M [00:14<00:00, 7.40MB/s]
86%|########6 | 35.7M/41.5M [00:14<00:00, 8.45MB/s]
89%|########9 | 37.1M/41.5M [00:14<00:00, 9.58MB/s]
92%|#########1| 38.0M/41.5M [00:15<00:00, 8.60MB/s]
94%|#########3| 38.9M/41.5M [00:15<00:00, 7.43MB/s]
97%|#########6| 40.1M/41.5M [00:15<00:00, 8.47MB/s]
100%|#########9| 41.5M/41.5M [00:15<00:00, 9.60MB/s]
100%|##########| 41.5M/41.5M [00:15<00:00, 2.82MB/s]
+
0%| | 0.00/41.5M [00:00<?, ?B/s]
0%| | 16.0k/41.5M [00:00<08:17, 87.5kB/s]
0%| | 32.0k/41.5M [00:00<08:18, 87.2kB/s]
0%| | 48.0k/41.5M [00:00<08:18, 87.1kB/s]
0%| | 64.0k/41.5M [00:00<08:18, 87.1kB/s]
0%| | 80.0k/41.5M [00:00<08:19, 87.0kB/s]
0%| | 96.0k/41.5M [00:01<08:18, 87.0kB/s]
0%| | 112k/41.5M [00:01<08:18, 87.0kB/s]
0%| | 128k/41.5M [00:01<08:18, 87.0kB/s]
0%| | 144k/41.5M [00:01<08:18, 87.0kB/s]
0%| | 168k/41.5M [00:01<07:11, 100kB/s]
0%| | 184k/41.5M [00:02<07:29, 96.3kB/s]
0%| | 208k/41.5M [00:02<06:45, 107kB/s]
1%| | 232k/41.5M [00:02<06:19, 114kB/s]
1%| | 256k/41.5M [00:02<06:03, 119kB/s]
1%| | 280k/41.5M [00:02<05:53, 122kB/s]
1%| | 304k/41.5M [00:03<05:45, 125kB/s]
1%| | 336k/41.5M [00:03<05:09, 140
kB/s]
1%| | 368k/41.5M [00:03<04:47, 150kB/s]
1%| | 400k/41.5M [00:03<04:34, 157kB/s]
1%|1 | 440k/41.5M [00:03<04:05, 175kB/s]
1%|1 | 480k/41.5M [00:03<03:48, 188kB/s]
1%|1 | 528k/41.5M [00:04<03:24, 210kB/s]
1%|1 | 584k/41.5M [00:04<03:00, 238kB/s]
2%|1 | 640k/41.5M [00:04<02:45, 258kB/s]
2%|1 | 696k/41.5M [00:04<02:37, 272kB/s]
2%|1 | 768k/41.5M [00:04<02:18, 308kB/s]
2%|1 | 848k/41.5M [00:05<02:03, 346kB/s]
2%|2 | 928k/41.5M [00:05<01:54, 373kB/s]
2%|2 | 1.00M/41.5M [00:05<01:41, 418kB/s]
3%|2 | 1.10M/41.5M [00:05<01:31, 461kB/s]
3%|2 | 1.15M/41.5M [00:06<02:48, 250kB/s]
3%|3 | 1.43M/41.5M [00:06<01:19, 529kB/s]
4%|3 | 1.52M/41.5M [00:06<01:21, 516kB/s]
4%|3 | 1.60M/41.5M [00:06<01:22, 507kB/s]
4%|4 | 1.69M/41.5M [00:06<01:23, 500
kB/s]
4%|4 | 1.79M/41.5M [00:07<01:20, 517kB/s]
5%|4 | 1.88M/41.5M [00:07<01:19, 519kB/s]
5%|4 | 1.98M/41.5M [00:07<01:17, 533kB/s]
5%|5 | 2.09M/41.5M [00:07<01:16, 542kB/s]
5%|5 | 2.20M/41.5M [00:07<01:13, 562kB/s]
6%|5 | 2.30M/41.5M [00:08<01:13, 563kB/s]
6%|5 | 2.41M/41.5M [00:08<01:11, 577kB/s]
6%|6 | 2.52M/41.5M [00:08<01:09, 586kB/s]
6%|6 | 2.63M/41.5M [00:08<01:07, 606kB/s]
7%|6 | 2.74M/41.5M [00:08<01:06, 607kB/s]
7%|6 | 2.85M/41.5M [00:09<01:06, 608kB/s]
7%|7 | 2.97M/41.5M [00:09<01:05, 621kB/s]
7%|7 | 3.09M/41.5M [00:09<01:03, 631kB/s]
8%|7 | 3.20M/41.5M [00:09<01:04, 624kB/s]
8%|7 | 3.31M/41.5M [00:09<01:03, 633kB/s]
8%|8 | 3.43M/41.5M [00:09<01:02, 639kB/s]
9%|8 | 3.54M/41.5M [00:10<01:03, 630kB/s]
9%|8 | 3.66M/41.5M [00:10
<01:02, 637kB/s]
9%|9 | 3.77M/41.5M [00:10<01:02, 628kB/s]
9%|9 | 3.88M/41.5M [00:10<01:02, 636kB/s]
10%|9 | 3.99M/41.5M [00:10<01:02, 628kB/s]
10%|9 | 4.11M/41.5M [00:11<01:01, 635kB/s]
10%|# | 4.23M/41.5M [00:11<01:01, 641kB/s]
10%|# | 4.34M/41.5M [00:11<01:01, 631kB/s]
11%|# | 4.45M/41.5M [00:11<01:00, 638kB/s]
11%|#1 | 4.57M/41.5M [00:11<01:00, 642kB/s]
11%|#1 | 4.69M/41.5M [00:12<00:59, 645kB/s]
12%|#1 | 4.80M/41.5M [00:12<00:59, 648kB/s]
12%|#1 | 4.92M/41.5M [00:12<00:59, 649kB/s]
12%|#2 | 5.04M/41.5M [00:12<00:58, 650kB/s]
12%|#2 | 5.16M/41.5M [00:12<00:58, 651kB/s]
13%|#2 | 5.28M/41.5M [00:12<00:57, 665kB/s]
13%|#3 | 5.41M/41.5M [00:13<00:56, 674kB/s]
13%|#3 | 5.53M/41.5M [00:13<00:55, 681kB/s]
14%|#3 | 5.66M/41.5M [00:13<00:53, 698kB/s]
14%|#3 | 5.80M/4
1.5M [00:13<00:52, 711kB/s]
14%|#4 | 5.94M/41.5M [00:13<00:50, 732kB/s]
15%|#4 | 6.08M/41.5M [00:14<00:49, 748kB/s]
15%|#4 | 6.22M/41.5M [00:14<00:48, 758kB/s]
15%|#5 | 6.37M/41.5M [00:14<00:47, 779kB/s]
16%|#5 | 6.52M/41.5M [00:14<00:45, 806kB/s]
16%|#6 | 6.69M/41.5M [00:14<00:43, 839kB/s]
17%|#6 | 6.85M/41.5M [00:15<00:42, 861kB/s]
17%|#6 | 7.03M/41.5M [00:15<00:40, 903kB/s]
17%|#7 | 7.21M/41.5M [00:15<00:38, 932kB/s]
18%|#7 | 7.41M/41.5M [00:15<00:36, 979kB/s]
18%|#8 | 7.60M/41.5M [00:15<00:35, 1.01MB/s]
19%|#8 | 7.81M/41.5M [00:16<00:33, 1.06MB/s]
19%|#9 | 8.02M/41.5M [00:16<00:32, 1.10MB/s]
20%|#9 | 8.26M/41.5M [00:16<00:30, 1.16MB/s]
20%|## | 8.49M/41.5M [00:16<00:28, 1.20MB/s]
21%|##1 | 8.74M/41.5M [00:16<00:27, 1.26MB/s]
22%|##1 | 9.01M/41.5M [00:16<00:25, 1.33MB/s]
22%|#
#2 | 9.28M/41.5M [00:17<00:24, 1.38MB/s]
23%|##3 | 9.57M/41.5M [00:17<00:23, 1.45MB/s]
24%|##3 | 9.87M/41.5M [00:17<00:21, 1.51MB/s]
25%|##4 | 10.2M/41.5M [00:17<00:20, 1.58MB/s]
25%|##5 | 10.5M/41.5M [00:17<00:19, 1.67MB/s]
26%|##6 | 10.9M/41.5M [00:18<00:18, 1.77MB/s]
27%|##7 | 11.2M/41.5M [00:18<00:17, 1.85MB/s]
28%|##8 | 11.6M/41.5M [00:18<00:16, 1.95MB/s]
29%|##9 | 12.0M/41.5M [00:18<00:15, 2.06MB/s]
30%|### | 12.5M/41.5M [00:18<00:14, 2.17MB/s]
31%|###1 | 12.9M/41.5M [00:19<00:13, 2.28MB/s]
32%|###2 | 13.4M/41.5M [00:19<00:12, 2.39MB/s]
34%|###3 | 13.9M/41.5M [00:19<00:11, 2.51MB/s]
35%|###4 | 14.4M/41.5M [00:19<00:10, 2.62MB/s]
36%|###6 | 15.0M/41.5M [00:19<00:10, 2.76MB/s]
38%|###7 | 15.6M/41.5M [00:19<00:09, 2.90MB/s]
39%|###8 | 16.2M/41.5M [00:20<00:08, 3.06MB/s]
41%|#### | 16.8M/41.5M [00:2
0<00:08, 3.21MB/s]
42%|####2 | 17.5M/41.5M [00:20<00:07, 3.37MB/s]
44%|####3 | 18.2M/41.5M [00:20<00:06, 3.53MB/s]
46%|####5 | 18.9M/41.5M [00:20<00:06, 3.70MB/s]
47%|####7 | 19.7M/41.5M [00:21<00:05, 3.87MB/s]
49%|####9 | 20.5M/41.5M [00:21<00:04, 4.42MB/s]
51%|#####1 | 21.2M/41.5M [00:21<00:04, 5.06MB/s]
52%|#####2 | 21.7M/41.5M [00:21<00:04, 4.66MB/s]
54%|#####3 | 22.2M/41.5M [00:21<00:05, 3.98MB/s]
56%|#####5 | 23.0M/41.5M [00:21<00:04, 4.70MB/s]
58%|#####7 | 23.9M/41.5M [00:21<00:03, 5.57MB/s]
59%|#####8 | 24.5M/41.5M [00:22<00:03, 5.12MB/s]
60%|###### | 25.0M/41.5M [00:22<00:03, 4.91MB/s]
62%|######2 | 25.9M/41.5M [00:22<00:02, 6.01MB/s]
64%|######3 | 26.5M/41.5M [00:22<00:02, 5.46MB/s]
65%|######5 | 27.1M/41.5M [00:22<00:02, 5.30MB/s]
68%|######7 | 28.1M/41.5M [00:22<00:02, 6.58MB/s]
69%|######9 | 28.8M/41.5M [00:22<00:02, 5.94MB/s]
71%|
####### | 29.4M/41.5M [00:22<00:02, 5.76MB/s]
74%|#######3 | 30.5M/41.5M [00:23<00:01, 7.16MB/s]
75%|#######5 | 31.3M/41.5M [00:23<00:01, 6.45MB/s]
77%|#######7 | 32.0M/41.5M [00:23<00:01, 6.23MB/s]
80%|#######9 | 33.1M/41.5M [00:23<00:01, 7.76MB/s]
82%|########1 | 33.9M/41.5M [00:23<00:01, 6.96MB/s]
84%|########3 | 34.7M/41.5M [00:23<00:01, 6.73MB/s]
87%|########6 | 35.9M/41.5M [00:23<00:00, 8.30MB/s]
89%|########8 | 36.8M/41.5M [00:23<00:00, 7.44MB/s]
90%|######### | 37.5M/41.5M [00:24<00:00, 7.20MB/s]
94%|#########3| 38.9M/41.5M [00:24<00:00, 8.84MB/s]
96%|#########5| 39.8M/41.5M [00:24<00:00, 7.91MB/s]
98%|#########7| 40.6M/41.5M [00:24<00:00, 6.69MB/s]
100%|##########| 41.5M/41.5M [00:24<00:00, 1.78MB/s]
diff --git a/docs/_sources/how_to/compile_models/from_paddle.rst.txt b/docs/_sources/how_to/compile_models/from_paddle.rst.txt
index b8d915831..c3780c103 100644
--- a/docs/_sources/how_to/compile_models/from_paddle.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_paddle.rst.txt
@@ -235,7 +235,7 @@ Look up prediction top 1 index in 1000 class synset.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 7.006 seconds)
+ **Total running time of the script:** ( 1 minutes 7.747 seconds)
.. _sphx_glr_download_how_to_compile_models_from_paddle.py:
diff --git a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
index 2e3b7c9d8..23edca199 100644
--- a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
@@ -93,7 +93,7 @@ Load a pretrained PyTorch model
.. code-block:: none
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
-
0%| | 0.00/44.7M [00:00<?, ?B/s]
6%|5 | 2.66M/44.7M [00:00<00:01, 27.8MB/s]
12%|#2 | 5.52M/44.7M [00:00<00:01, 28.9MB/s]
23%|##2 | 10.3M/44.7M [00:00<00:00, 38.3MB/s]
35%|###4 | 15.5M/44.7M [00:00<00:00, 44.5MB/s]
45%|####5 | 20.2M/44.7M [00:00<00:00, 46.4MB/s]
58%|#####7 | 25.8M/44.7M [00:00<00:00, 49.5MB/s]
70%|######9 | 31.1M/44.7M [00:00<00:00, 51.7MB/s]
82%|########1 | 36.5M/44.7M [00:00<00:00, 53.2MB/s]
94%|#########3| 41.9M/44.7M [00:00<00:00, 54.1MB/s]
100%|##########| 44.7M/44.7M [00:00<00:00, 49.1MB/s]
+
0%| | 0.00/44.7M [00:00<?, ?B/s]
7%|7 | 3.23M/44.7M [00:00<00:01, 33.9MB/s]
14%|#4 | 6.47M/44.7M [00:00<00:01, 33.3MB/s]
22%|##1 | 9.75M/44.7M [00:00<00:01, 33.7MB/s]
29%|##9 | 13.0M/44.7M [00:00<00:01, 31.9MB/s]
38%|###8 | 17.1M/44.7M [00:00<00:00, 35.3MB/s]
46%|####5 | 20.5M/44.7M [00:00<00:00, 34.8MB/s]
53%|#####3 | 23.8M/44.7M [00:00<00:01, 21.6MB/s]
59%|#####9 | 26.5M/44.7M [00:01<00:00, 21.8MB/s]
67%|######6 | 29.8M/44.7M [00:01<00:00, 24.9MB/s]
75%|#######4 | 33.3M/44.7M [00:01<00:00, 27.8MB/s]
83%|########2 | 37.0M/44.7M [00:01<00:00, 29.9MB/s]
90%|########9 | 40.1M/44.7M [00:01<00:00, 28.3MB/s]
96%|#########6| 43.0M/44.7M [00:01<00:00, 28.8MB/s]
100%|##########| 44.7M/44.7M [00:01<00:00, 28.6MB/s]
diff --git a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
index 53f770229..90458dd40 100644
--- a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
@@ -422,7 +422,7 @@ Run the corresponding model on tensorflow
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 4.119 seconds)
+ **Total running time of the script:** ( 1 minutes 1.170 seconds)
.. _sphx_glr_download_how_to_compile_models_from_tensorflow.py:
diff --git a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
index 62d09c4c6..397dde91d 100644
--- a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
@@ -5,26 +5,26 @@
Computation times
=================
-**05:47.852** total execution time for **how_to_compile_models** files:
+**05:55.609** total execution time for **how_to_compile_models** files:
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``) | 01:07.006 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``) | 01:07.747 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:04.119 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:01.170 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``) | 00:56.601 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``) | 00:57.992 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``) | 00:40.653 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``) | 00:50.009 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``) | 00:38.824 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``) | 00:36.725 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``) | 00:22.452 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``) | 00:22.695 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``) | 00:21.643 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``) | 00:21.516 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``) | 00:19.948 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``) | 00:20.885 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``) | 00:14.254 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``) | 00:14.522 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``) | 00:02.351 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``) | 00:02.349 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
index e2de49339..5f20da138 100644
--- a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
@@ -440,7 +440,7 @@ Execute on TVM
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 16.0557 15.8693 16.5941 15.7658 0.3119
+ 15.9465 15.9620 16.0734 15.7781 0.0909
diff --git a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
index d47ec9c7b..0e908041c 100644
--- a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
@@ -122,7 +122,7 @@ Load pre-trained maskrcnn from torchvision and do tracing
.. code-block:: none
Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
-
0%| | 0.00/170M [00:00<?, ?B/s]
6%|5 | 9.44M/170M [00:00<00:01, 98.6MB/s]
15%|#5 | 25.9M/170M [00:00<00:01, 142MB/s]
25%|##5 | 42.5M/170M [00:00<00:00, 156MB/s]
35%|###4 | 58.8M/170M [00:00<00:00, 162MB/s]
44%|####4 | 75.3M/170M [00:00<00:00, 166MB/s]
54%|#####4 | 91.9M/170M [00:00<00:00, 169MB/s]
64%|######3 | 108M/170M [00:00<00:00, 170MB/s]
74%|#######3 | 125M/170M [00:00<00:00, 171MB/s]
83%|########3 | 141M/170M [00:00<00:00, 171MB/s]
93%|#########2| 158M/170M [00:01<00:00, 172MB/s]
100%|##########| 170M/170M [00:01<00:00, 166MB/s]
+
0%| | 0.00/170M [00:00<?, ?B/s]
8%|8 | 14.2M/170M [00:00<00:01, 148MB/s]
21%|## | 35.6M/170M [00:00<00:00, 193MB/s]
34%|###3 | 56.9M/170M [00:00<00:00, 207MB/s]
46%|####6 | 78.3M/170M [00:00<00:00, 213MB/s]
59%|#####8 | 99.7M/170M [00:00<00:00, 217MB/s]
71%|#######1 | 121M/170M [00:00<00:00, 218MB/s]
84%|########3 | 142M/170M [00:00<00:00, 219MB/s]
96%|#########6| 163M/170M [00:00<00:00, 221MB/s]
100%|##########| 170M/170M [00:00<00:00, 214MB/s]
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:3878: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
for i in range(dim)
/usr/local/lib/python3.7/dist-packages/torchvision/models/detection/anchor_utils.py:127: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
@@ -291,7 +291,7 @@ Get boxes with score larger than 0.9
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 2 minutes 51.510 seconds)
+ **Total running time of the script:** ( 2 minutes 55.836 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_object_detection_pytorch.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
index f994d18a5..f5006051c 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
@@ -219,7 +219,7 @@ training. Other models require a full post training calibration.
.. code-block:: none
Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
-
0%| | 0.00/13.6M [00:00<?, ?B/s]
100%|##########| 13.6M/13.6M [00:00<00:00, 153MB/s]
+
0%| | 0.00/13.6M [00:00<?, ?B/s]
68%|######7 | 9.20M/13.6M [00:00<00:00, 95.6MB/s]
100%|##########| 13.6M/13.6M [00:00<00:00, 110MB/s]
@@ -399,7 +399,7 @@ Here we give an example of how to measure performance of TVM compiled models.
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 90.5119 90.4735 93.1524 90.1430 0.3397
+ 90.3645 90.2495 96.8307 90.1843 0.6696
@@ -448,7 +448,7 @@ TODO
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 5.729 seconds)
+ **Total running time of the script:** ( 1 minutes 7.076 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_prequantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
index bf790ef4f..c615a7b62 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
@@ -426,7 +426,7 @@ Here we give an example of how to measure performance of TVM compiled models.
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 118.8315 118.7852 125.2959 117.9721 0.7412
+ 119.8498 119.8288 121.2662 119.0162 0.3249
@@ -463,7 +463,7 @@ Here we give an example of how to measure performance of TVM compiled models.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 2 minutes 9.625 seconds)
+ **Total running time of the script:** ( 2 minutes 3.701 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_prequantized_tflite.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
index e05d5fd7d..79d6eb426 100644
--- a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
@@ -254,7 +254,7 @@ We create a Relay VM to build and execute the model.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 14.689 seconds)
+ **Total running time of the script:** ( 1 minutes 32.299 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_quantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
index 1687fdf29..de82f8975 100644
--- a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
@@ -157,7 +157,7 @@ Convert and compile model for CPU.
data: None
input_sym_arg_type = in_param.infer_type()[0]
Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
-
0%| | 0/132723 [00:00<?, ?KB/s]
2%|1 | 2047/132723 [00:00<00:06, 20406.67KB/s]
5%|4 | 6578/132723 [00:00<00:03, 35035.25KB/s]
11%|# | 14180/132723 [00:00<00:02, 53734.70KB/s]
17%|#6 | 22154/132723 [00:00<00:01, 63994.50KB/s]
23%|##2 | 30073/132723 [00:00<00:01, 69470.33KB/s]
29%|##8 | 38102/132723 [00:00<00:01, 73147.09KB/s]
35%|###4 | 46097/132723 [00:00<00:01, 75365.68KB/s]
41%|#### | 54077/132723 [00:00<00:01, 76774.94KB/s]
47%|####6 | 62103/132723 [00:00<00:00, 77862.69KB/s]
53%|#####2 | 70157/132723 [00:01<00:00, 78685.87KB/s]
59%|#####8 | 78228/132723 [00:01<00:00, 79301.38KB/s]
65%|######5 | 86317/132723 [00:01<00:00, 79781.27KB/s]
71%|#######1 | 94447/132723 [00:01<00:00, 80239.53KB/s]
77%|#######7 | 102596/132723 [00:01<00:00, 80616.06KB/s]
83%|########3 | 110731/132723 [00:01<00:00, 80836.08KB/s]
90%|########9 | 118922/132723 [00:01<00:00, 81158.05KB/s]
96%|#########5| 127088/132723 [00:01<00:00, 81306.03KB/s]
100%|##########| 132723/132723 [00:01<00:00, 74969.03KB/s]
+
0%| | 0/132723 [00:00<?, ?KB/s]
2%|2 | 3124/132723 [00:00<00:04, 31238.61KB/s]
8%|7 | 10039/132723 [00:00<00:02, 53537.03KB/s]
14%|#3 | 18569/132723 [00:00<00:01, 68032.11KB/s]
21%|## | 27284/132723 [00:00<00:01, 75576.21KB/s]
27%|##7 | 36034/132723 [00:00<00:01, 79872.26KB/s]
34%|###3 | 44669/132723 [00:00<00:01, 82069.42KB/s]
40%|#### | 53394/132723 [00:00<00:00, 83759.28KB/s]
47%|####6 | 62119/132723 [00:00<00:00, 84868.29KB/s]
53%|#####3 | 70833/132723 [00:00<00:00, 85572.69KB/s]
60%|#####9 | 79608/132723 [00:01<00:00, 86241.46KB/s]
67%|######6 | 88369/132723 [00:01<00:00, 86655.55KB/s]
73%|#######3 | 97076/132723 [00:01<00:00, 86778.49KB/s]
80%|#######9 | 105828/132723 [00:01<00:00, 87001.18KB/s]
86%|########6 | 114574/132723 [00:01<00:00, 87137.46KB/s]
93%|#########2| 123289/132723 [00:01<00:00, 87138.26KB/s]
99%|#########9| 132003/132723 [00:01<00:00, 87121.86KB/s]
100%|##########| 132723/132723 [00:01<00:00, 82401.45KB/s]
@@ -240,7 +240,7 @@ Display result
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 2 minutes 15.415 seconds)
+ **Total running time of the script:** ( 2 minutes 17.409 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_ssd_gluoncv.py:
diff --git a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
index 4b4d5d094..92132cbf8 100644
--- a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
@@ -5,22 +5,22 @@
Computation times
=================
-**10:27.491** total execution time for **how_to_deploy_models** files:
+**10:47.671** total execution time for **how_to_deploy_models** files:
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 02:51.510 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 02:55.836 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``) | 02:15.415 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``) | 02:17.409 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``) | 02:09.625 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``) | 02:03.701 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``) | 01:14.689 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``) | 01:32.299 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``) | 01:05.729 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``) | 01:07.076 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``) | 00:28.241 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``) | 00:28.877 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``) | 00:22.277 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``) | 00:22.467 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_deploy_models_deploy_sparse.py` (``deploy_sparse.py``) | 00:00.006 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
index d89fd6416..a7e17f704 100644
--- a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
@@ -463,7 +463,7 @@ First let us define two helper functions to get the mobilenet model and a cat im
.. code-block:: none
- Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip28234e9f-61c7-4f97-854a-564afd9983fc from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+ Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip8ca90524-9141-4870-9ca3-1f55665bdfc5 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
diff --git a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
index c43f5ebc4..d3fe3e141 100644
--- a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
Computation times
=================
-**00:41.355** total execution time for **how_to_extend_tvm** files:
+**00:40.008** total execution time for **how_to_extend_tvm** files:
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:38.244 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:36.865 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``) | 00:02.187 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``) | 00:02.214 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``) | 00:00.917 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``) | 00:00.923 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``) | 00:00.006 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
index 46b69bc15..c80aaa6c6 100644
--- a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
@@ -215,10 +215,10 @@ profile the execution time of each passes.
.. code-block:: none
Printing results of timing profile...
- InferType: 6848us [6848us] (45.94%; 45.94%)
- FoldScaleAxis: 8058us [6us] (54.06%; 54.06%)
- FoldConstant: 8051us [1612us] (54.02%; 99.92%)
- InferType: 6439us [6439us] (43.20%; 79.98%)
+ InferType: 6849us [6849us] (46.58%; 46.58%)
+ FoldScaleAxis: 7855us [6us] (53.42%; 53.42%)
+ FoldConstant: 7849us [1571us] (53.38%; 99.92%)
+ InferType: 6278us [6278us] (42.70%; 79.99%)
@@ -257,10 +257,10 @@ Refer to following sections and :py:func:`tvm.instrument.pass_instrument` for th
.. code-block:: none
Printing results of timing profile...
- InferType: 6432us [6432us] (44.63%; 44.63%)
- FoldScaleAxis: 7981us [6us] (55.37%; 55.37%)
- FoldConstant: 7975us [1667us] (55.33%; 99.92%)
- InferType: 6308us [6308us] (43.77%; 79.10%)
+ InferType: 6312us [6312us] (44.55%; 44.55%)
+ FoldScaleAxis: 7857us [5us] (55.45%; 55.45%)
+ FoldConstant: 7852us [1593us] (55.41%; 99.93%)
+ InferType: 6259us [6259us] (44.17%; 79.72%)
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
index 03703cbb6..299baf95f 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
@@ -327,7 +327,7 @@ latency of convolution.
.. code-block:: none
- Convolution: 54.167125 ms
+ Convolution: 43.185978 ms
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
index 373942f4c..655342216 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
@@ -658,7 +658,7 @@ be able to run on our build server
.. code-block:: none
- conv2d with tensor core: 6.873242 ms
+ conv2d with tensor core: 11.878459 ms
diff --git a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
index 91eb779e9..75f5f141c 100644
--- a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
@@ -130,8 +130,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
.. code-block:: none
- Numpy running time: 0.018172
- Baseline: 3.395604
+ Numpy running time: 0.018950
+ Baseline: 3.338291
@@ -226,7 +226,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
.. code-block:: none
- Opt1: 0.300426
+ Opt1: 0.308090
@@ -329,7 +329,7 @@ In this tutorial, we chose to vectorize the inner loop row data since it is cach
.. code-block:: none
- Opt2: 0.332632
+ Opt2: 0.331753
@@ -425,7 +425,7 @@ the access pattern for A matrix is more cache friendly.
.. code-block:: none
- Opt3: 0.117883
+ Opt3: 0.121683
@@ -550,7 +550,7 @@ flattening.
.. code-block:: none
- Opt4: 0.110434
+ Opt4: 0.111281
@@ -672,7 +672,7 @@ write to C when all the block results are ready.
.. code-block:: none
- Opt5: 0.111427
+ Opt5: 0.111284
@@ -797,7 +797,7 @@ Futhermore, we can also utilize multi-core processors to do the thread-level par
.. code-block:: none
- Opt6: 0.145331
+ Opt6: 0.145534
diff --git a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
index 6800c8e63..6f039c30b 100644
--- a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
Computation times
=================
-**00:34.322** total execution time for **how_to_optimize_operators** files:
+**00:34.419** total execution time for **how_to_optimize_operators** files:
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``) | 00:32.069 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``) | 00:32.140 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.234 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.283 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``) | 00:01.020 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``) | 00:00.996 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
index 3859aea9e..2ca94f4b3 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
@@ -5,18 +5,18 @@
Computation times
=================
-**05:12.442** total execution time for **how_to_tune_with_autoscheduler** files:
+**05:12.594** total execution time for **how_to_tune_with_autoscheduler** files:
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 02:35.530 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 02:34.321 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``) | 01:19.801 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``) | 01:20.685 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``) | 00:42.629 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``) | 00:43.097 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``) | 00:17.693 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``) | 00:17.412 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``) | 00:08.451 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``) | 00:08.696 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``) | 00:08.338 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``) | 00:08.384 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
index 2f5cf921b..c69713124 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
@@ -239,72 +239,128 @@ cooperative fetching, unrolling and operator fusion.
compute: Buffer(compute_2: Pointer(float32), float32, [25088], [])}
buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute}
preflattened_buffer_map = {data_1: data_3: Buffer(data_2, float32, [1, 512, 7, 7], []), kernel_1: kernel_3: Buffer(kernel_2, float32, [512, 512, 3, 3], []), bias_1: bias_3: Buffer(bias_2, float32, [1, 512, 1, 1], []), compute_1: compute_3: Buffer(compute_2, float32, [1, 512, 7, 7], [])} {
- attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 64;
- allocate(conv2d_nchw: Pointer(local float32), float32, [2]), storage_scope = local;
- allocate(pad_temp.shared: Pointer(shared float32), float32, [2016]), storage_scope = shared;
- allocate(kernel.shared: Pointer(shared float32), float32, [768]), storage_scope = shared;
- attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196 {
- conv2d_nchw_1: Buffer(conv2d_nchw, float32, [1], [], scope="local", align=4)[0] = 0f32
+ attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 112;
+ allocate(conv2d_nchw: Pointer(local float32), float32, [7]), storage_scope = local;
+ allocate(pad_temp.shared: Pointer(shared float32), float32, [144]), storage_scope = shared;
+ allocate(kernel.shared: Pointer(shared float32), float32, [1536]), storage_scope = shared;
+ attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32 {
+ conv2d_nchw_1: Buffer(conv2d_nchw, float32, [7], [], scope="local", align=16)[0] = 0f32
conv2d_nchw_1[1] = 0f32
- for (rc.outer.outer: int32, 0, 16) {
+ conv2d_nchw_1[2] = 0f32
+ conv2d_nchw_1[3] = 0f32
+ conv2d_nchw_1[4] = 0f32
+ conv2d_nchw_1[5] = 0f32
+ conv2d_nchw_1[6] = 0f32
+ for (rc.outer.outer: int32, 0, 32) {
for (ry.outer.outer: int32, 0, 3) {
- let cse_var_2: int32 = (rc.outer.outer*1568)
- let cse_var_1: int32 = (ry.outer.outer*7)
+ let cse_var_4: int32 = (rc.outer.outer*784)
+ let cse_var_3: int32 = (ry.outer.outer*7)
+ let cse_var_2: int32 = (rc.outer.outer*144)
+ let cse_var_1: int32 = (ry.outer.outer*3)
{
- attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1: Buffer(pad_temp.shared, float32, [2016], [], scope="shared")[threadIdx.x_1] = @tir.if_then_else(((((1 <= (floordiv(floormod(threadIdx.x_1, 63), 9) + ry.outer.outer)) && ((floordiv(floormod(threadIdx.x_1, 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 9)*7)) + cse_var_1) + floormod(threadIdx.x_1, 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1[(threadIdx.x_1 + 196)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 196), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 196), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 7), 9))) && (floormod((threadIdx.x_1 + 7), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 196), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 7), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1[(threadIdx.x_1 + 392)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 392), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 392), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 5), 9))) && (floormod((threadIdx.x_1 + 5), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 392), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 5), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1[(threadIdx.x_1 + 588)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 588), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 588), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 3), 9))) && (floormod((threadIdx.x_1 + 3), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 588), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 3), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1[(threadIdx.x_1 + 784)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 784), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 784), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 1), 9))) && (floormod((threadIdx.x_1 + 1), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 784), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 1), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1[(threadIdx.x_1 + 980)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 980), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 980), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 8), 9))) && (floormod((threadIdx.x_1 + 8), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 980), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 8), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1[(threadIdx.x_1 + 1176)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 1176), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 1176), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 6), 9))) && (floormod((threadIdx.x_1 + 6), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1176), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 6), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1[(threadIdx.x_1 + 1372)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 1372), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 1372), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 4), 9))) && (floormod((threadIdx.x_1 + 4), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1372), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 4), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1[(threadIdx.x_1 + 1568)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 1568), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 1568), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 2), 9))) && (floormod((threadIdx.x_1 + 2), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1568), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 2), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1[(threadIdx.x_1 + 1764)] = @tir.if_then_else(((((1 <= (floordiv(floormod(threadIdx.x_1, 63), 9) + ry.outer.outer)) && ((floordiv(floormod(threadIdx.x_1, 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 9)*7)) + cse_var_1) + floormod(threadIdx.x_1, 9)) + 1364)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- if @tir.likely((threadIdx.x_1 < 56), dtype=bool) {
- pad_temp.shared_1[(threadIdx.x_1 + 1960)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 1960), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 1960), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 7), 9))) && (floormod((threadIdx.x_1 + 7), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1960), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 7), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ pad_temp.shared_1: Buffer(pad_temp.shared, float32, [144], [], scope="shared")[threadIdx.x_1] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data[(((((cse_var_4 + (floordiv(threadIdx.x_1, 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod(threadIdx.x_1, 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ pad_temp.shared_1[(threadIdx.x_1 + 32)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 5), 9))) && (floormod((threadIdx.x_1 + 5), 9) < 8)), data[(((((cse_var_4 + (floordiv((threadIdx.x_1 + 32), 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1 + 5), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ pad_temp.shared_1[(threadIdx.x_1 + 64)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 1), 9))) && (floormod((threadIdx.x_1 + 1), 9) < 8)), data[(((((cse_var_4 + (floordiv((threadIdx.x_1 + 64), 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1 + 1), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ pad_temp.shared_1[(threadIdx.x_1 + 96)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 6), 9))) && (floormod((threadIdx.x_1 + 6), 9) < 8)), data[(((((cse_var_4 + (floordiv((threadIdx.x_1 + 96), 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1 + 6), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ if @tir.likely((threadIdx.x_1 < 16), dtype=bool) {
+ pad_temp.shared_1[(threadIdx.x_1 + 128)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 2), 9))) && (floormod((threadIdx.x_1 + 2), 9) < 8)), data[(((((cse_var_4 + (floordiv((threadIdx.x_1 + 128), 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1 + 2), 9)) - 8)], 0f32, dtype=float32)
}
- attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196 {
- kernel.shared_1: Buffer(kernel.shared, float32, [768], [], scope="shared")[(threadIdx.x_2*3)] = kernel[(((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 32)*4608)) + (rc.outer.outer*288)) + (floormod(threadIdx.x_2, 32)*9)) + (ry.outer.outer*3))]
- kernel.shared_1[((threadIdx.x_2*3) + 1)] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 32)*4608)) + (rc.outer.outer*288)) + (floormod(threadIdx.x_2, 32)*9)) + (ry.outer.outer*3)) + 1)]
- kernel.shared_1[((threadIdx.x_2*3) + 2)] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 32)*4608)) + (rc.outer.outer*288)) + (floormod(threadIdx.x_2, 32)*9)) + (ry.outer.outer*3)) + 2)]
- }
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- if @tir.likely((threadIdx.x_2 < 60), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*3) + 588)] = kernel[(((((blockIdx.x*36864) + (floordiv((floordiv(threadIdx.x_2, 4) + 49), 8)*4608)) + (rc.outer.outer*288)) + (floormod((threadIdx.x_2 + 4), 32)*9)) + (ry.outer.outer*3))]
- kernel.shared_1[((threadIdx.x_2*3) + 589)] = kernel[((((((blockIdx.x*36864) + (floordiv((floordiv(threadIdx.x_2, 4) + 49), 8)*4608)) + (rc.outer.outer*288)) + (floormod((threadIdx.x_2 + 4), 32)*9)) + (ry.outer.outer*3)) + 1)]
- kernel.shared_1[((threadIdx.x_2*3) + 590)] = kernel[((((((blockIdx.x*36864) + (floordiv((floordiv(threadIdx.x_2, 4) + 49), 8)*4608)) + (rc.outer.outer*288)) + (floormod((threadIdx.x_2 + 4), 32)*9)) + (ry.outer.outer*3)) + 2)]
- }
- for (rc.outer.inner: int32, 0, 16) {
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7))]*kernel.shared_1[((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6))]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7))]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 384)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 1)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 1)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 1)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 385)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 2)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 386)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 3)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 387)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 64)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 4)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 64)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 388)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 65)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 5)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 65)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 389)]))
+ attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1: Buffer(kernel.shared, float32, [1536], [], scope="shared")[ramp((threadIdx.x_2*4), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(threadIdx.x_2, 12)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp((threadIdx.x_2*4), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp(threadIdx.x_2, 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 128), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 128), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 128), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 32), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 256), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 256), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 256), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 64), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 384), 1, 4)] = kernel[(((broadcast(((((floordiv(blockIdx.x, 7)*147456) + (floordiv(threadIdx.x_2, 12)*4608)) + cse_var_2) + 36864), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 384), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 96), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 512), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 512), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 512), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 128), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 640), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 640), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 640), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 160), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 768), 1, 4)] = kernel[(((broadcast(((((floordiv(blockIdx.x, 7)*147456) + (floordiv(threadIdx.x_2, 12)*4608)) + cse_var_2) + 73728), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 768), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 192), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 896), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 896), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 896), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 224), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 1024), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 1024), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1024), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 256), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 1152), 1, 4)] = kernel[(((broadcast(((((floordiv(blockIdx.x, 7)*147456) + (floordiv(threadIdx.x_2, 12)*4608)) + cse_var_2) + 110592), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1152), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 288), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 1280), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 1280), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1280), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 320), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 1408), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 1408), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1408), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 352), 1, 4), broadcast(3, 4)))]
+ for (rc.outer.inner: int32, 0, 8) {
+ let cse_var_19: int32 = (rc.outer.inner*18)
+ let cse_var_18: int32 = (cse_var_19 + 7)
+ let cse_var_17: int32 = (cse_var_19 + 6)
+ let cse_var_16: int32 = (cse_var_19 + 5)
+ let cse_var_15: int32 = (cse_var_19 + 4)
+ let cse_var_14: int32 = (cse_var_19 + 3)
+ let cse_var_13: int32 = (cse_var_19 + 2)
+ let cse_var_12: int32 = (cse_var_19 + 16)
+ let cse_var_11: int32 = (cse_var_19 + 15)
+ let cse_var_10: int32 = (cse_var_19 + 14)
+ let cse_var_9: int32 = (cse_var_19 + 13)
+ let cse_var_8: int32 = (cse_var_19 + 12)
+ let cse_var_7: int32 = (cse_var_19 + 11)
+ let cse_var_6: int32 = (cse_var_19 + 10)
+ let cse_var_5: int32 = (cse_var_19 + 1)
+ {
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_19]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_5]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_13]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_14]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_15]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_16]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_17]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_5]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_13]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_14]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_15]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_16]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_17]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_18]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_13]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_14]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_15]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_16]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_17]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_18]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(cse_var_19 + 8)]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(cse_var_19 + 9)]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_6]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_7]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_8]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_9]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_10]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_11]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_6]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_7]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_8]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_9]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_10]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_11]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_12]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_7]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_8]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_9]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_10]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_11]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_12]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(cse_var_19 + 17)]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+ }
}
}
}
}
- compute[((blockIdx.x*392) + threadIdx.x)] = max((conv2d_nchw_1[0] + bias[((blockIdx.x*8) + floordiv(threadIdx.x, 49))]), 0f32)
- compute[(((blockIdx.x*392) + threadIdx.x) + 196)] = max((conv2d_nchw_1[1] + bias[(((blockIdx.x*8) + floordiv(threadIdx.x, 49)) + 4)]), 0f32)
+ for (i3.inner: int32, 0, 7) {
+ compute[((((floordiv(blockIdx.x, 7)*1568) + (threadIdx.x*49)) + (floormod(blockIdx.x, 7)*7)) + i3.inner)] = max((conv2d_nchw_1[i3.inner] + bias[((floordiv(blockIdx.x, 7)*32) + threadIdx.x)]), 0f32)
+ }
}
}
@@ -358,7 +414,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 0.321 ms
+ Execution time of this operator: 0.322 ms
@@ -408,18 +464,18 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
- conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=4)
- conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=2)
+ conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=32)
+ conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
- conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7)
+ conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
- conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
+ conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=7)
conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
- conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
+ conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
- conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=16)
+ conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=8)
conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=3)
@@ -429,13 +485,13 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=1)
- compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=4)
- compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=2)
+ compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=32)
+ compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
- compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7)
+ compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
- compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
- compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
+ compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
+ compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -453,14 +509,14 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused = s[compute].fuse(compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i)
s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread_axis("threadIdx.x"))
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
- kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=3)
+ kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
- kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=196)
+ kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=32)
s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
- pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=196)
+ pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=32)
s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 64)
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
@@ -480,55 +536,2561 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
#define int64_t long long
#define uint64_t unsigned long long
#endif
- extern "C" __global__ void __launch_bounds__(196) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
- float conv2d_nchw[2];
- __shared__ float pad_temp_shared[2016];
- __shared__ float kernel_shared[768];
+ extern "C" __global__ void __launch_bounds__(32) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+ float conv2d_nchw[7];
+ __shared__ float pad_temp_shared[144];
+ __shared__ float kernel_shared[1536];
conv2d_nchw[0] = 0.000000e+00f;
conv2d_nchw[1] = 0.000000e+00f;
- for (int rc_outer_outer = 0; rc_outer_outer < 16; ++rc_outer_outer) {
+ conv2d_nchw[2] = 0.000000e+00f;
+ conv2d_nchw[3] = 0.000000e+00f;
+ conv2d_nchw[4] = 0.000000e+00f;
+ conv2d_nchw[5] = 0.000000e+00f;
+ conv2d_nchw[6] = 0.000000e+00f;
+ for (int rc_outer_outer = 0; rc_outer_outer < 32; ++rc_outer_outer) {
for (int ry_outer_outer = 0; ry_outer_outer < 3; ++ry_outer_outer) {
__syncthreads();
- pad_temp_shared[((int)threadIdx.x)] = (((((1 <= (((((int)threadIdx.x) % 63) / 9) + ry_outer_outer)) && ((((((int)threadIdx.x) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + ((((int)threadIdx.x) / 9) * 7)) + (ry_outer_outer * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 196)] = (((((1 <= ((((((int)threadIdx.x) + 7) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 7) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 196) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 392)] = (((((1 <= ((((((int)threadIdx.x) + 14) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 14) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 5) % 9))) && (((((int)threadIdx.x) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 392) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 588)] = (((((1 <= ((((((int)threadIdx.x) + 21) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 21) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 3) % 9))) && (((((int)threadIdx.x) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 588) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 3) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 784)] = (((((1 <= ((((((int)threadIdx.x) + 28) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 28) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 1) % 9))) && (((((int)threadIdx.x) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 784) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 980)] = (((((1 <= ((((((int)threadIdx.x) + 35) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 35) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 980) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 1176)] = (((((1 <= ((((((int)threadIdx.x) + 42) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 42) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1176) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 1372)] = (((((1 <= ((((((int)threadIdx.x) + 49) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 49) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1372) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 1568)] = (((((1 <= ((((((int)threadIdx.x) + 56) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 56) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 2) % 9))) && (((((int)threadIdx.x) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1568) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 1764)] = (((((1 <= (((((int)threadIdx.x) % 63) / 9) + ry_outer_outer)) && ((((((int)threadIdx.x) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + ((((int)threadIdx.x) / 9) * 7)) + (ry_outer_outer * 7)) + (((int)threadIdx.x) % 9)) + 1364)] : 0.000000e+00f);
- if (((int)threadIdx.x) < 56) {
- pad_temp_shared[(((int)threadIdx.x) + 1960)] = (((((1 <= ((((((int)threadIdx.x) + 7) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 7) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1960) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
- }
- kernel_shared[(((int)threadIdx.x) * 3)] = kernel[(((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) >> 5) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) & 31) * 9)) + (ry_outer_outer * 3))];
- kernel_shared[((((int)threadIdx.x) * 3) + 1)] = kernel[((((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) >> 5) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) & 31) * 9)) + (ry_outer_outer * 3)) + 1)];
- kernel_shared[((((int)threadIdx.x) * 3) + 2)] = kernel[((((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) >> 5) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) & 31) * 9)) + (ry_outer_outer * 3)) + 2)];
- if (((int)threadIdx.x) < 60) {
- kernel_shared[((((int)threadIdx.x) * 3) + 588)] = kernel[(((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 4) & 31) * 9)) + (ry_outer_outer * 3))];
- kernel_shared[((((int)threadIdx.x) * 3) + 589)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 4) & 31) * 9)) + (ry_outer_outer * 3)) + 1)];
- kernel_shared[((((int)threadIdx.x) * 3) + 590)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 4) & 31) * 9)) + (ry_outer_outer * 3)) + 2)];
+ pad_temp_shared[((int)threadIdx.x)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + ((((int)threadIdx.x) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 32)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 5) % 9))) && (((((int)threadIdx.x) + 5) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 32) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 64)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 1) % 9))) && (((((int)threadIdx.x) + 1) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 64) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 96)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 96) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
+ if (((int)threadIdx.x) < 16) {
+ pad_temp_shared[(((int)threadIdx.x) + 128)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 2) % 9))) && (((((int)threadIdx.x) + 2) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 128) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
}
+ int4 _1;
+ int4 _2;
+ int4 _3;
+ int4 _4 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)));
+ int4 _5;
+ int4 _6;
+ int4 _7;
+ int4 _8 = make_int4(((((int)threadIdx.x) * 4))+(1*0), ((((int)threadIdx.x) * 4))+(1*1), ((((int)threadIdx.x) * 4))+(1*2), ((((int)threadIdx.x) * 4))+(1*3));
+ int4 _9 = make_int4(3, 3, 3, 3);
+ _7.x = (_8.x%_9.x);
+ _7.y = (_8.y%_9.y);
+ _7.z = (_8.z%_9.z);
+ _7.w = (_8.w%_9.w);
+ int4 _10;
+ int4 _11 = make_int4(((((int)threadIdx.x) * 4))+(1*0), ((((int)threadIdx.x) * 4))+(1*1), ((((int)threadIdx.x) * 4))+(1*2), ((((int)threadIdx.x) * 4))+(1*3));
+ int4 _12 = make_int4(3, 3, 3, 3);
+ _10.x = (_11.x/_12.x);
+ _10.y = (_11.y/_12.y);
+ _10.z = (_11.z/_12.z);
+ _10.w = (_11.w/_12.w);
+ int4 _13;
+ ushort4 _14;
+ ushort4 _15;
+ ushort4 _16;
+ int4 _17 = make_int4(3, 3, 3, 3);
+ int4 _18 = make_int4(0, 0, 0, 0);
+ _16.x = (_17.x>=_18.x);
+ _16.y = (_17.y>=_18.y);
+ _16.z = (_17.z>=_18.z);
+ _16.w = (_17.w>=_18.w);
+ ushort4 _19;
+ int4 _20 = make_int4(0, 0, 0, 0);
+ _19.x = (_7.x>=_20.x);
+ _19.y = (_7.y>=_20.y);
+ _19.z = (_7.z>=_20.z);
+ _19.w = (_7.w>=_20.w);
+ _15.x = (_16.x&&_19.x);
+ _15.y = (_16.y&&_19.y);
+ _15.z = (_16.z&&_19.z);
+ _15.w = (_16.w&&_19.w);
+ ushort4 _21;
+ ushort4 _22;
+ int4 _23 = make_int4(3, 3, 3, 3);
+ int4 _24 = make_int4(0, 0, 0, 0);
+ _22.x = (_23.x<_24.x);
+ _22.y = (_23.y<_24.y);
+ _22.z = (_23.z<_24.z);
+ _22.w = (_23.w<_24.w);
+ ushort4 _25;
+ int4 _26 = make_int4(0, 0, 0, 0);
+ _25.x = (_7.x<=_26.x);
+ _25.y = (_7.y<=_26.y);
+ _25.z = (_7.z<=_26.z);
+ _25.w = (_7.w<=_26.w);
+ _21.x = (_22.x&&_25.x);
+ _21.y = (_22.y&&_25.y);
+ _21.z = (_22.z&&_25.z);
+ _21.w = (_22.w&&_25.w);
+ _14.x = (_15.x||_21.x);
+ _14.y = (_15.y||_21.y);
+ _14.z = (_15.z||_21.z);
+ _14.w = (_15.w||_21.w);
+ int4 _27;
+ int4 _28 = make_int4(1, 1, 1, 1);
+ _27.x = (_10.x-_28.x);
+ _27.y = (_10.y-_28.y);
+ _27.z = (_10.z-_28.z);
+ _27.w = (_10.w-_28.w);
+ _13.x = (bool(_14.x)?_10.x:_27.x);
+ _13.y = (bool(_14.y)?_10.y:_27.y);
+ _13.z = (bool(_14.z)?_10.z:_27.z);
+ _13.w = (bool(_14.w)?_10.w:_27.w);
+ int4 _29 = make_int4(16, 16, 16, 16);
+ _6.x = (_13.x%_29.x);
+ _6.y = (_13.y%_29.y);
+ _6.z = (_13.z%_29.z);
+ _6.w = (_13.w%_29.w);
+ int4 _30;
+ ushort4 _31;
+ ushort4 _32;
+ ushort4 _33;
+ int4 _34 = make_int4(16, 16, 16, 16);
+ int4 _35 = make_int4(0, 0, 0, 0);
+ _33.x = (_34.x>=_35.x);
+ _33.y = (_34.y>=_35.y);
+ _33.z = (_34.z>=_35.z);
+ _33.w = (_34.w>=_35.w);
+ ushort4 _36;
+ int4 _37 = make_int4(0, 0, 0, 0);
+ _36.x = (_6.x>=_37.x);
+ _36.y = (_6.y>=_37.y);
+ _36.z = (_6.z>=_37.z);
+ _36.w = (_6.w>=_37.w);
+ _32.x = (_33.x&&_36.x);
+ _32.y = (_33.y&&_36.y);
+ _32.z = (_33.z&&_36.z);
+ _32.w = (_33.w&&_36.w);
+ ushort4 _38;
+ ushort4 _39;
+ int4 _40 = make_int4(16, 16, 16, 16);
+ int4 _41 = make_int4(0, 0, 0, 0);
+ _39.x = (_40.x<_41.x);
+ _39.y = (_40.y<_41.y);
+ _39.z = (_40.z<_41.z);
+ _39.w = (_40.w<_41.w);
+ ushort4 _42;
+ int4 _43 = make_int4(0, 0, 0, 0);
+ _42.x = (_6.x<=_43.x);
+ _42.y = (_6.y<=_43.y);
+ _42.z = (_6.z<=_43.z);
+ _42.w = (_6.w<=_43.w);
+ _38.x = (_39.x&&_42.x);
+ _38.y = (_39.y&&_42.y);
+ _38.z = (_39.z&&_42.z);
+ _38.w = (_39.w&&_42.w);
+ _31.x = (_32.x||_38.x);
+ _31.y = (_32.y||_38.y);
+ _31.z = (_32.z||_38.z);
+ _31.w = (_32.w||_38.w);
+ int4 _44;
+ int4 _45 = make_int4(16, 16, 16, 16);
+ _44.x = (_6.x+_45.x);
+ _44.y = (_6.y+_45.y);
+ _44.z = (_6.z+_45.z);
+ _44.w = (_6.w+_45.w);
+ _30.x = (bool(_31.x)?_6.x:_44.x);
+ _30.y = (bool(_31.y)?_6.y:_44.y);
+ _30.z = (bool(_31.z)?_6.z:_44.z);
+ _30.w = (bool(_31.w)?_6.w:_44.w);
+ int4 _46 = make_int4(9, 9, 9, 9);
+ _5.x = (_30.x*_46.x);
+ _5.y = (_30.y*_46.y);
+ _5.z = (_30.z*_46.z);
+ _5.w = (_30.w*_46.w);
+ _3.x = (_4.x+_5.x);
+ _3.y = (_4.y+_5.y);
+ _3.z = (_4.z+_5.z);
+ _3.w = (_4.w+_5.w);
+ int4 _47 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _2.x = (_3.x+_47.x);
+ _2.y = (_3.y+_47.y);
+ _2.z = (_3.z+_47.z);
+ _2.w = (_3.w+_47.w);
+ int4 _48;
+ int4 _49 = make_int4((((int)threadIdx.x))+(1*0), (((int)threadIdx.x))+(1*1), (((int)threadIdx.x))+(1*2), (((int)threadIdx.x))+(1*3));
+ int4 _50 = make_int4(3, 3, 3, 3);
+ _48.x = (_49.x%_50.x);
+ _48.y = (_49.y%_50.y);
+ _48.z = (_49.z%_50.z);
+ _48.w = (_49.w%_50.w);
+ int4 _51;
+ ushort4 _52;
+ ushort4 _53;
+ ushort4 _54;
+ int4 _55 = make_int4(3, 3, 3, 3);
+ int4 _56 = make_int4(0, 0, 0, 0);
+ _54.x = (_55.x>=_56.x);
+ _54.y = (_55.y>=_56.y);
+ _54.z = (_55.z>=_56.z);
+ _54.w = (_55.w>=_56.w);
+ ushort4 _57;
+ int4 _58 = make_int4(0, 0, 0, 0);
+ _57.x = (_48.x>=_58.x);
+ _57.y = (_48.y>=_58.y);
+ _57.z = (_48.z>=_58.z);
+ _57.w = (_48.w>=_58.w);
+ _53.x = (_54.x&&_57.x);
+ _53.y = (_54.y&&_57.y);
+ _53.z = (_54.z&&_57.z);
+ _53.w = (_54.w&&_57.w);
+ ushort4 _59;
+ ushort4 _60;
+ int4 _61 = make_int4(3, 3, 3, 3);
+ int4 _62 = make_int4(0, 0, 0, 0);
+ _60.x = (_61.x<_62.x);
+ _60.y = (_61.y<_62.y);
+ _60.z = (_61.z<_62.z);
+ _60.w = (_61.w<_62.w);
+ ushort4 _63;
+ int4 _64 = make_int4(0, 0, 0, 0);
+ _63.x = (_48.x<=_64.x);
+ _63.y = (_48.y<=_64.y);
+ _63.z = (_48.z<=_64.z);
+ _63.w = (_48.w<=_64.w);
+ _59.x = (_60.x&&_63.x);
+ _59.y = (_60.y&&_63.y);
+ _59.z = (_60.z&&_63.z);
+ _59.w = (_60.w&&_63.w);
+ _52.x = (_53.x||_59.x);
+ _52.y = (_53.y||_59.y);
+ _52.z = (_53.z||_59.z);
+ _52.w = (_53.w||_59.w);
+ int4 _65;
+ int4 _66 = make_int4(3, 3, 3, 3);
+ _65.x = (_48.x+_66.x);
+ _65.y = (_48.y+_66.y);
+ _65.z = (_48.z+_66.z);
+ _65.w = (_48.w+_66.w);
+ _51.x = (bool(_52.x)?_48.x:_65.x);
+ _51.y = (bool(_52.y)?_48.y:_65.y);
+ _51.z = (bool(_52.z)?_48.z:_65.z);
+ _51.w = (bool(_52.w)?_48.w:_65.w);
+ _1.x = (_2.x+_51.x);
+ _1.y = (_2.y+_51.y);
+ _1.z = (_2.z+_51.z);
+ _1.w = (_2.w+_51.w);
+ *(float4*)(kernel_shared + (((int)threadIdx.x) * 4)) = make_float4(kernel[_1.x],kernel[_1.y],kernel[_1.z],kernel[_1.w]);
+ int4 _67;
+ int4 _68;
+ int4 _69;
+ int4 _70 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 128) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 128) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 128) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 128) / 48) * 4608)) + (rc_outer_outer [...]
+ int4 _71;
+ int4 _72;
+ int4 _73;
+ int4 _74 = make_int4((((((int)threadIdx.x) * 4) + 128))+(1*0), (((((int)threadIdx.x) * 4) + 128))+(1*1), (((((int)threadIdx.x) * 4) + 128))+(1*2), (((((int)threadIdx.x) * 4) + 128))+(1*3));
+ int4 _75 = make_int4(3, 3, 3, 3);
+ _73.x = (_74.x%_75.x);
+ _73.y = (_74.y%_75.y);
+ _73.z = (_74.z%_75.z);
+ _73.w = (_74.w%_75.w);
+ int4 _76;
+ int4 _77 = make_int4((((((int)threadIdx.x) * 4) + 128))+(1*0), (((((int)threadIdx.x) * 4) + 128))+(1*1), (((((int)threadIdx.x) * 4) + 128))+(1*2), (((((int)threadIdx.x) * 4) + 128))+(1*3));
+ int4 _78 = make_int4(3, 3, 3, 3);
+ _76.x = (_77.x/_78.x);
+ _76.y = (_77.y/_78.y);
+ _76.z = (_77.z/_78.z);
+ _76.w = (_77.w/_78.w);
+ int4 _79;
+ ushort4 _80;
+ ushort4 _81;
+ ushort4 _82;
+ int4 _83 = make_int4(3, 3, 3, 3);
+ int4 _84 = make_int4(0, 0, 0, 0);
+ _82.x = (_83.x>=_84.x);
+ _82.y = (_83.y>=_84.y);
+ _82.z = (_83.z>=_84.z);
+ _82.w = (_83.w>=_84.w);
+ ushort4 _85;
+ int4 _86 = make_int4(0, 0, 0, 0);
+ _85.x = (_73.x>=_86.x);
+ _85.y = (_73.y>=_86.y);
+ _85.z = (_73.z>=_86.z);
+ _85.w = (_73.w>=_86.w);
+ _81.x = (_82.x&&_85.x);
+ _81.y = (_82.y&&_85.y);
+ _81.z = (_82.z&&_85.z);
+ _81.w = (_82.w&&_85.w);
+ ushort4 _87;
+ ushort4 _88;
+ int4 _89 = make_int4(3, 3, 3, 3);
+ int4 _90 = make_int4(0, 0, 0, 0);
+ _88.x = (_89.x<_90.x);
+ _88.y = (_89.y<_90.y);
+ _88.z = (_89.z<_90.z);
+ _88.w = (_89.w<_90.w);
+ ushort4 _91;
+ int4 _92 = make_int4(0, 0, 0, 0);
+ _91.x = (_73.x<=_92.x);
+ _91.y = (_73.y<=_92.y);
+ _91.z = (_73.z<=_92.z);
+ _91.w = (_73.w<=_92.w);
+ _87.x = (_88.x&&_91.x);
+ _87.y = (_88.y&&_91.y);
+ _87.z = (_88.z&&_91.z);
+ _87.w = (_88.w&&_91.w);
+ _80.x = (_81.x||_87.x);
+ _80.y = (_81.y||_87.y);
+ _80.z = (_81.z||_87.z);
+ _80.w = (_81.w||_87.w);
+ int4 _93;
+ int4 _94 = make_int4(1, 1, 1, 1);
+ _93.x = (_76.x-_94.x);
+ _93.y = (_76.y-_94.y);
+ _93.z = (_76.z-_94.z);
+ _93.w = (_76.w-_94.w);
+ _79.x = (bool(_80.x)?_76.x:_93.x);
+ _79.y = (bool(_80.y)?_76.y:_93.y);
+ _79.z = (bool(_80.z)?_76.z:_93.z);
+ _79.w = (bool(_80.w)?_76.w:_93.w);
+ int4 _95 = make_int4(16, 16, 16, 16);
+ _72.x = (_79.x%_95.x);
+ _72.y = (_79.y%_95.y);
+ _72.z = (_79.z%_95.z);
+ _72.w = (_79.w%_95.w);
+ int4 _96;
+ ushort4 _97;
+ ushort4 _98;
+ ushort4 _99;
+ int4 _100 = make_int4(16, 16, 16, 16);
+ int4 _101 = make_int4(0, 0, 0, 0);
+ _99.x = (_100.x>=_101.x);
+ _99.y = (_100.y>=_101.y);
+ _99.z = (_100.z>=_101.z);
+ _99.w = (_100.w>=_101.w);
+ ushort4 _102;
+ int4 _103 = make_int4(0, 0, 0, 0);
+ _102.x = (_72.x>=_103.x);
+ _102.y = (_72.y>=_103.y);
+ _102.z = (_72.z>=_103.z);
+ _102.w = (_72.w>=_103.w);
+ _98.x = (_99.x&&_102.x);
+ _98.y = (_99.y&&_102.y);
+ _98.z = (_99.z&&_102.z);
+ _98.w = (_99.w&&_102.w);
+ ushort4 _104;
+ ushort4 _105;
+ int4 _106 = make_int4(16, 16, 16, 16);
+ int4 _107 = make_int4(0, 0, 0, 0);
+ _105.x = (_106.x<_107.x);
+ _105.y = (_106.y<_107.y);
+ _105.z = (_106.z<_107.z);
+ _105.w = (_106.w<_107.w);
+ ushort4 _108;
+ int4 _109 = make_int4(0, 0, 0, 0);
+ _108.x = (_72.x<=_109.x);
+ _108.y = (_72.y<=_109.y);
+ _108.z = (_72.z<=_109.z);
+ _108.w = (_72.w<=_109.w);
+ _104.x = (_105.x&&_108.x);
+ _104.y = (_105.y&&_108.y);
+ _104.z = (_105.z&&_108.z);
+ _104.w = (_105.w&&_108.w);
+ _97.x = (_98.x||_104.x);
+ _97.y = (_98.y||_104.y);
+ _97.z = (_98.z||_104.z);
+ _97.w = (_98.w||_104.w);
+ int4 _110;
+ int4 _111 = make_int4(16, 16, 16, 16);
+ _110.x = (_72.x+_111.x);
+ _110.y = (_72.y+_111.y);
+ _110.z = (_72.z+_111.z);
+ _110.w = (_72.w+_111.w);
+ _96.x = (bool(_97.x)?_72.x:_110.x);
+ _96.y = (bool(_97.y)?_72.y:_110.y);
+ _96.z = (bool(_97.z)?_72.z:_110.z);
+ _96.w = (bool(_97.w)?_72.w:_110.w);
+ int4 _112 = make_int4(9, 9, 9, 9);
+ _71.x = (_96.x*_112.x);
+ _71.y = (_96.y*_112.y);
+ _71.z = (_96.z*_112.z);
+ _71.w = (_96.w*_112.w);
+ _69.x = (_70.x+_71.x);
+ _69.y = (_70.y+_71.y);
+ _69.z = (_70.z+_71.z);
+ _69.w = (_70.w+_71.w);
+ int4 _113 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _68.x = (_69.x+_113.x);
+ _68.y = (_69.y+_113.y);
+ _68.z = (_69.z+_113.z);
+ _68.w = (_69.w+_113.w);
+ int4 _114;
+ int4 _115 = make_int4(((((int)threadIdx.x) + 32))+(1*0), ((((int)threadIdx.x) + 32))+(1*1), ((((int)threadIdx.x) + 32))+(1*2), ((((int)threadIdx.x) + 32))+(1*3));
+ int4 _116 = make_int4(3, 3, 3, 3);
+ _114.x = (_115.x%_116.x);
+ _114.y = (_115.y%_116.y);
+ _114.z = (_115.z%_116.z);
+ _114.w = (_115.w%_116.w);
+ int4 _117;
+ ushort4 _118;
+ ushort4 _119;
+ ushort4 _120;
+ int4 _121 = make_int4(3, 3, 3, 3);
+ int4 _122 = make_int4(0, 0, 0, 0);
+ _120.x = (_121.x>=_122.x);
+ _120.y = (_121.y>=_122.y);
+ _120.z = (_121.z>=_122.z);
+ _120.w = (_121.w>=_122.w);
+ ushort4 _123;
+ int4 _124 = make_int4(0, 0, 0, 0);
+ _123.x = (_114.x>=_124.x);
+ _123.y = (_114.y>=_124.y);
+ _123.z = (_114.z>=_124.z);
+ _123.w = (_114.w>=_124.w);
+ _119.x = (_120.x&&_123.x);
+ _119.y = (_120.y&&_123.y);
+ _119.z = (_120.z&&_123.z);
+ _119.w = (_120.w&&_123.w);
+ ushort4 _125;
+ ushort4 _126;
+ int4 _127 = make_int4(3, 3, 3, 3);
+ int4 _128 = make_int4(0, 0, 0, 0);
+ _126.x = (_127.x<_128.x);
+ _126.y = (_127.y<_128.y);
+ _126.z = (_127.z<_128.z);
+ _126.w = (_127.w<_128.w);
+ ushort4 _129;
+ int4 _130 = make_int4(0, 0, 0, 0);
+ _129.x = (_114.x<=_130.x);
+ _129.y = (_114.y<=_130.y);
+ _129.z = (_114.z<=_130.z);
+ _129.w = (_114.w<=_130.w);
+ _125.x = (_126.x&&_129.x);
+ _125.y = (_126.y&&_129.y);
+ _125.z = (_126.z&&_129.z);
+ _125.w = (_126.w&&_129.w);
+ _118.x = (_119.x||_125.x);
+ _118.y = (_119.y||_125.y);
+ _118.z = (_119.z||_125.z);
+ _118.w = (_119.w||_125.w);
+ int4 _131;
+ int4 _132 = make_int4(3, 3, 3, 3);
+ _131.x = (_114.x+_132.x);
+ _131.y = (_114.y+_132.y);
+ _131.z = (_114.z+_132.z);
+ _131.w = (_114.w+_132.w);
+ _117.x = (bool(_118.x)?_114.x:_131.x);
+ _117.y = (bool(_118.y)?_114.y:_131.y);
+ _117.z = (bool(_118.z)?_114.z:_131.z);
+ _117.w = (bool(_118.w)?_114.w:_131.w);
+ _67.x = (_68.x+_117.x);
+ _67.y = (_68.y+_117.y);
+ _67.z = (_68.z+_117.z);
+ _67.w = (_68.w+_117.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 128)) = make_float4(kernel[_67.x],kernel[_67.y],kernel[_67.z],kernel[_67.w]);
+ int4 _133;
+ int4 _134;
+ int4 _135;
+ int4 _136 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 256) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 256) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 256) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 256) / 48) * 4608)) + (rc_outer_outer [...]
+ int4 _137;
+ int4 _138;
+ int4 _139;
+ int4 _140 = make_int4((((((int)threadIdx.x) * 4) + 256))+(1*0), (((((int)threadIdx.x) * 4) + 256))+(1*1), (((((int)threadIdx.x) * 4) + 256))+(1*2), (((((int)threadIdx.x) * 4) + 256))+(1*3));
+ int4 _141 = make_int4(3, 3, 3, 3);
+ _139.x = (_140.x%_141.x);
+ _139.y = (_140.y%_141.y);
+ _139.z = (_140.z%_141.z);
+ _139.w = (_140.w%_141.w);
+ int4 _142;
+ int4 _143 = make_int4((((((int)threadIdx.x) * 4) + 256))+(1*0), (((((int)threadIdx.x) * 4) + 256))+(1*1), (((((int)threadIdx.x) * 4) + 256))+(1*2), (((((int)threadIdx.x) * 4) + 256))+(1*3));
+ int4 _144 = make_int4(3, 3, 3, 3);
+ _142.x = (_143.x/_144.x);
+ _142.y = (_143.y/_144.y);
+ _142.z = (_143.z/_144.z);
+ _142.w = (_143.w/_144.w);
+ int4 _145;
+ ushort4 _146;
+ ushort4 _147;
+ ushort4 _148;
+ int4 _149 = make_int4(3, 3, 3, 3);
+ int4 _150 = make_int4(0, 0, 0, 0);
+ _148.x = (_149.x>=_150.x);
+ _148.y = (_149.y>=_150.y);
+ _148.z = (_149.z>=_150.z);
+ _148.w = (_149.w>=_150.w);
+ ushort4 _151;
+ int4 _152 = make_int4(0, 0, 0, 0);
+ _151.x = (_139.x>=_152.x);
+ _151.y = (_139.y>=_152.y);
+ _151.z = (_139.z>=_152.z);
+ _151.w = (_139.w>=_152.w);
+ _147.x = (_148.x&&_151.x);
+ _147.y = (_148.y&&_151.y);
+ _147.z = (_148.z&&_151.z);
+ _147.w = (_148.w&&_151.w);
+ ushort4 _153;
+ ushort4 _154;
+ int4 _155 = make_int4(3, 3, 3, 3);
+ int4 _156 = make_int4(0, 0, 0, 0);
+ _154.x = (_155.x<_156.x);
+ _154.y = (_155.y<_156.y);
+ _154.z = (_155.z<_156.z);
+ _154.w = (_155.w<_156.w);
+ ushort4 _157;
+ int4 _158 = make_int4(0, 0, 0, 0);
+ _157.x = (_139.x<=_158.x);
+ _157.y = (_139.y<=_158.y);
+ _157.z = (_139.z<=_158.z);
+ _157.w = (_139.w<=_158.w);
+ _153.x = (_154.x&&_157.x);
+ _153.y = (_154.y&&_157.y);
+ _153.z = (_154.z&&_157.z);
+ _153.w = (_154.w&&_157.w);
+ _146.x = (_147.x||_153.x);
+ _146.y = (_147.y||_153.y);
+ _146.z = (_147.z||_153.z);
+ _146.w = (_147.w||_153.w);
+ int4 _159;
+ int4 _160 = make_int4(1, 1, 1, 1);
+ _159.x = (_142.x-_160.x);
+ _159.y = (_142.y-_160.y);
+ _159.z = (_142.z-_160.z);
+ _159.w = (_142.w-_160.w);
+ _145.x = (bool(_146.x)?_142.x:_159.x);
+ _145.y = (bool(_146.y)?_142.y:_159.y);
+ _145.z = (bool(_146.z)?_142.z:_159.z);
+ _145.w = (bool(_146.w)?_142.w:_159.w);
+ int4 _161 = make_int4(16, 16, 16, 16);
+ _138.x = (_145.x%_161.x);
+ _138.y = (_145.y%_161.y);
+ _138.z = (_145.z%_161.z);
+ _138.w = (_145.w%_161.w);
+ int4 _162;
+ ushort4 _163;
+ ushort4 _164;
+ ushort4 _165;
+ int4 _166 = make_int4(16, 16, 16, 16);
+ int4 _167 = make_int4(0, 0, 0, 0);
+ _165.x = (_166.x>=_167.x);
+ _165.y = (_166.y>=_167.y);
+ _165.z = (_166.z>=_167.z);
+ _165.w = (_166.w>=_167.w);
+ ushort4 _168;
+ int4 _169 = make_int4(0, 0, 0, 0);
+ _168.x = (_138.x>=_169.x);
+ _168.y = (_138.y>=_169.y);
+ _168.z = (_138.z>=_169.z);
+ _168.w = (_138.w>=_169.w);
+ _164.x = (_165.x&&_168.x);
+ _164.y = (_165.y&&_168.y);
+ _164.z = (_165.z&&_168.z);
+ _164.w = (_165.w&&_168.w);
+ ushort4 _170;
+ ushort4 _171;
+ int4 _172 = make_int4(16, 16, 16, 16);
+ int4 _173 = make_int4(0, 0, 0, 0);
+ _171.x = (_172.x<_173.x);
+ _171.y = (_172.y<_173.y);
+ _171.z = (_172.z<_173.z);
+ _171.w = (_172.w<_173.w);
+ ushort4 _174;
+ int4 _175 = make_int4(0, 0, 0, 0);
+ _174.x = (_138.x<=_175.x);
+ _174.y = (_138.y<=_175.y);
+ _174.z = (_138.z<=_175.z);
+ _174.w = (_138.w<=_175.w);
+ _170.x = (_171.x&&_174.x);
+ _170.y = (_171.y&&_174.y);
+ _170.z = (_171.z&&_174.z);
+ _170.w = (_171.w&&_174.w);
+ _163.x = (_164.x||_170.x);
+ _163.y = (_164.y||_170.y);
+ _163.z = (_164.z||_170.z);
+ _163.w = (_164.w||_170.w);
+ int4 _176;
+ int4 _177 = make_int4(16, 16, 16, 16);
+ _176.x = (_138.x+_177.x);
+ _176.y = (_138.y+_177.y);
+ _176.z = (_138.z+_177.z);
+ _176.w = (_138.w+_177.w);
+ _162.x = (bool(_163.x)?_138.x:_176.x);
+ _162.y = (bool(_163.y)?_138.y:_176.y);
+ _162.z = (bool(_163.z)?_138.z:_176.z);
+ _162.w = (bool(_163.w)?_138.w:_176.w);
+ int4 _178 = make_int4(9, 9, 9, 9);
+ _137.x = (_162.x*_178.x);
+ _137.y = (_162.y*_178.y);
+ _137.z = (_162.z*_178.z);
+ _137.w = (_162.w*_178.w);
+ _135.x = (_136.x+_137.x);
+ _135.y = (_136.y+_137.y);
+ _135.z = (_136.z+_137.z);
+ _135.w = (_136.w+_137.w);
+ int4 _179 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _134.x = (_135.x+_179.x);
+ _134.y = (_135.y+_179.y);
+ _134.z = (_135.z+_179.z);
+ _134.w = (_135.w+_179.w);
+ int4 _180;
+ int4 _181 = make_int4(((((int)threadIdx.x) + 64))+(1*0), ((((int)threadIdx.x) + 64))+(1*1), ((((int)threadIdx.x) + 64))+(1*2), ((((int)threadIdx.x) + 64))+(1*3));
+ int4 _182 = make_int4(3, 3, 3, 3);
+ _180.x = (_181.x%_182.x);
+ _180.y = (_181.y%_182.y);
+ _180.z = (_181.z%_182.z);
+ _180.w = (_181.w%_182.w);
+ int4 _183;
+ ushort4 _184;
+ ushort4 _185;
+ ushort4 _186;
+ int4 _187 = make_int4(3, 3, 3, 3);
+ int4 _188 = make_int4(0, 0, 0, 0);
+ _186.x = (_187.x>=_188.x);
+ _186.y = (_187.y>=_188.y);
+ _186.z = (_187.z>=_188.z);
+ _186.w = (_187.w>=_188.w);
+ ushort4 _189;
+ int4 _190 = make_int4(0, 0, 0, 0);
+ _189.x = (_180.x>=_190.x);
+ _189.y = (_180.y>=_190.y);
+ _189.z = (_180.z>=_190.z);
+ _189.w = (_180.w>=_190.w);
+ _185.x = (_186.x&&_189.x);
+ _185.y = (_186.y&&_189.y);
+ _185.z = (_186.z&&_189.z);
+ _185.w = (_186.w&&_189.w);
+ ushort4 _191;
+ ushort4 _192;
+ int4 _193 = make_int4(3, 3, 3, 3);
+ int4 _194 = make_int4(0, 0, 0, 0);
+ _192.x = (_193.x<_194.x);
+ _192.y = (_193.y<_194.y);
+ _192.z = (_193.z<_194.z);
+ _192.w = (_193.w<_194.w);
+ ushort4 _195;
+ int4 _196 = make_int4(0, 0, 0, 0);
+ _195.x = (_180.x<=_196.x);
+ _195.y = (_180.y<=_196.y);
+ _195.z = (_180.z<=_196.z);
+ _195.w = (_180.w<=_196.w);
+ _191.x = (_192.x&&_195.x);
+ _191.y = (_192.y&&_195.y);
+ _191.z = (_192.z&&_195.z);
+ _191.w = (_192.w&&_195.w);
+ _184.x = (_185.x||_191.x);
+ _184.y = (_185.y||_191.y);
+ _184.z = (_185.z||_191.z);
+ _184.w = (_185.w||_191.w);
+ int4 _197;
+ int4 _198 = make_int4(3, 3, 3, 3);
+ _197.x = (_180.x+_198.x);
+ _197.y = (_180.y+_198.y);
+ _197.z = (_180.z+_198.z);
+ _197.w = (_180.w+_198.w);
+ _183.x = (bool(_184.x)?_180.x:_197.x);
+ _183.y = (bool(_184.y)?_180.y:_197.y);
+ _183.z = (bool(_184.z)?_180.z:_197.z);
+ _183.w = (bool(_184.w)?_180.w:_197.w);
+ _133.x = (_134.x+_183.x);
+ _133.y = (_134.y+_183.y);
+ _133.z = (_134.z+_183.z);
+ _133.w = (_134.w+_183.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 256)) = make_float4(kernel[_133.x],kernel[_133.y],kernel[_133.z],kernel[_133.w]);
+ int4 _199;
+ int4 _200;
+ int4 _201;
+ int4 _202 = make_int4((((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 36864), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 36864), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 36864), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 36864));
+ int4 _203;
+ int4 _204;
+ int4 _205;
+ int4 _206 = make_int4((((((int)threadIdx.x) * 4) + 384))+(1*0), (((((int)threadIdx.x) * 4) + 384))+(1*1), (((((int)threadIdx.x) * 4) + 384))+(1*2), (((((int)threadIdx.x) * 4) + 384))+(1*3));
+ int4 _207 = make_int4(3, 3, 3, 3);
+ _205.x = (_206.x%_207.x);
+ _205.y = (_206.y%_207.y);
+ _205.z = (_206.z%_207.z);
+ _205.w = (_206.w%_207.w);
+ int4 _208;
+ int4 _209 = make_int4((((((int)threadIdx.x) * 4) + 384))+(1*0), (((((int)threadIdx.x) * 4) + 384))+(1*1), (((((int)threadIdx.x) * 4) + 384))+(1*2), (((((int)threadIdx.x) * 4) + 384))+(1*3));
+ int4 _210 = make_int4(3, 3, 3, 3);
+ _208.x = (_209.x/_210.x);
+ _208.y = (_209.y/_210.y);
+ _208.z = (_209.z/_210.z);
+ _208.w = (_209.w/_210.w);
+ int4 _211;
+ ushort4 _212;
+ ushort4 _213;
+ ushort4 _214;
+ int4 _215 = make_int4(3, 3, 3, 3);
+ int4 _216 = make_int4(0, 0, 0, 0);
+ _214.x = (_215.x>=_216.x);
+ _214.y = (_215.y>=_216.y);
+ _214.z = (_215.z>=_216.z);
+ _214.w = (_215.w>=_216.w);
+ ushort4 _217;
+ int4 _218 = make_int4(0, 0, 0, 0);
+ _217.x = (_205.x>=_218.x);
+ _217.y = (_205.y>=_218.y);
+ _217.z = (_205.z>=_218.z);
+ _217.w = (_205.w>=_218.w);
+ _213.x = (_214.x&&_217.x);
+ _213.y = (_214.y&&_217.y);
+ _213.z = (_214.z&&_217.z);
+ _213.w = (_214.w&&_217.w);
+ ushort4 _219;
+ ushort4 _220;
+ int4 _221 = make_int4(3, 3, 3, 3);
+ int4 _222 = make_int4(0, 0, 0, 0);
+ _220.x = (_221.x<_222.x);
+ _220.y = (_221.y<_222.y);
+ _220.z = (_221.z<_222.z);
+ _220.w = (_221.w<_222.w);
+ ushort4 _223;
+ int4 _224 = make_int4(0, 0, 0, 0);
+ _223.x = (_205.x<=_224.x);
+ _223.y = (_205.y<=_224.y);
+ _223.z = (_205.z<=_224.z);
+ _223.w = (_205.w<=_224.w);
+ _219.x = (_220.x&&_223.x);
+ _219.y = (_220.y&&_223.y);
+ _219.z = (_220.z&&_223.z);
+ _219.w = (_220.w&&_223.w);
+ _212.x = (_213.x||_219.x);
+ _212.y = (_213.y||_219.y);
+ _212.z = (_213.z||_219.z);
+ _212.w = (_213.w||_219.w);
+ int4 _225;
+ int4 _226 = make_int4(1, 1, 1, 1);
+ _225.x = (_208.x-_226.x);
+ _225.y = (_208.y-_226.y);
+ _225.z = (_208.z-_226.z);
+ _225.w = (_208.w-_226.w);
+ _211.x = (bool(_212.x)?_208.x:_225.x);
+ _211.y = (bool(_212.y)?_208.y:_225.y);
+ _211.z = (bool(_212.z)?_208.z:_225.z);
+ _211.w = (bool(_212.w)?_208.w:_225.w);
+ int4 _227 = make_int4(16, 16, 16, 16);
+ _204.x = (_211.x%_227.x);
+ _204.y = (_211.y%_227.y);
+ _204.z = (_211.z%_227.z);
+ _204.w = (_211.w%_227.w);
+ int4 _228;
+ ushort4 _229;
+ ushort4 _230;
+ ushort4 _231;
+ int4 _232 = make_int4(16, 16, 16, 16);
+ int4 _233 = make_int4(0, 0, 0, 0);
+ _231.x = (_232.x>=_233.x);
+ _231.y = (_232.y>=_233.y);
+ _231.z = (_232.z>=_233.z);
+ _231.w = (_232.w>=_233.w);
+ ushort4 _234;
+ int4 _235 = make_int4(0, 0, 0, 0);
+ _234.x = (_204.x>=_235.x);
+ _234.y = (_204.y>=_235.y);
+ _234.z = (_204.z>=_235.z);
+ _234.w = (_204.w>=_235.w);
+ _230.x = (_231.x&&_234.x);
+ _230.y = (_231.y&&_234.y);
+ _230.z = (_231.z&&_234.z);
+ _230.w = (_231.w&&_234.w);
+ ushort4 _236;
+ ushort4 _237;
+ int4 _238 = make_int4(16, 16, 16, 16);
+ int4 _239 = make_int4(0, 0, 0, 0);
+ _237.x = (_238.x<_239.x);
+ _237.y = (_238.y<_239.y);
+ _237.z = (_238.z<_239.z);
+ _237.w = (_238.w<_239.w);
+ ushort4 _240;
+ int4 _241 = make_int4(0, 0, 0, 0);
+ _240.x = (_204.x<=_241.x);
+ _240.y = (_204.y<=_241.y);
+ _240.z = (_204.z<=_241.z);
+ _240.w = (_204.w<=_241.w);
+ _236.x = (_237.x&&_240.x);
+ _236.y = (_237.y&&_240.y);
+ _236.z = (_237.z&&_240.z);
+ _236.w = (_237.w&&_240.w);
+ _229.x = (_230.x||_236.x);
+ _229.y = (_230.y||_236.y);
+ _229.z = (_230.z||_236.z);
+ _229.w = (_230.w||_236.w);
+ int4 _242;
+ int4 _243 = make_int4(16, 16, 16, 16);
+ _242.x = (_204.x+_243.x);
+ _242.y = (_204.y+_243.y);
+ _242.z = (_204.z+_243.z);
+ _242.w = (_204.w+_243.w);
+ _228.x = (bool(_229.x)?_204.x:_242.x);
+ _228.y = (bool(_229.y)?_204.y:_242.y);
+ _228.z = (bool(_229.z)?_204.z:_242.z);
+ _228.w = (bool(_229.w)?_204.w:_242.w);
+ int4 _244 = make_int4(9, 9, 9, 9);
+ _203.x = (_228.x*_244.x);
+ _203.y = (_228.y*_244.y);
+ _203.z = (_228.z*_244.z);
+ _203.w = (_228.w*_244.w);
+ _201.x = (_202.x+_203.x);
+ _201.y = (_202.y+_203.y);
+ _201.z = (_202.z+_203.z);
+ _201.w = (_202.w+_203.w);
+ int4 _245 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _200.x = (_201.x+_245.x);
+ _200.y = (_201.y+_245.y);
+ _200.z = (_201.z+_245.z);
+ _200.w = (_201.w+_245.w);
+ int4 _246;
+ int4 _247 = make_int4(((((int)threadIdx.x) + 96))+(1*0), ((((int)threadIdx.x) + 96))+(1*1), ((((int)threadIdx.x) + 96))+(1*2), ((((int)threadIdx.x) + 96))+(1*3));
+ int4 _248 = make_int4(3, 3, 3, 3);
+ _246.x = (_247.x%_248.x);
+ _246.y = (_247.y%_248.y);
+ _246.z = (_247.z%_248.z);
+ _246.w = (_247.w%_248.w);
+ int4 _249;
+ ushort4 _250;
+ ushort4 _251;
+ ushort4 _252;
+ int4 _253 = make_int4(3, 3, 3, 3);
+ int4 _254 = make_int4(0, 0, 0, 0);
+ _252.x = (_253.x>=_254.x);
+ _252.y = (_253.y>=_254.y);
+ _252.z = (_253.z>=_254.z);
+ _252.w = (_253.w>=_254.w);
+ ushort4 _255;
+ int4 _256 = make_int4(0, 0, 0, 0);
+ _255.x = (_246.x>=_256.x);
+ _255.y = (_246.y>=_256.y);
+ _255.z = (_246.z>=_256.z);
+ _255.w = (_246.w>=_256.w);
+ _251.x = (_252.x&&_255.x);
+ _251.y = (_252.y&&_255.y);
+ _251.z = (_252.z&&_255.z);
+ _251.w = (_252.w&&_255.w);
+ ushort4 _257;
+ ushort4 _258;
+ int4 _259 = make_int4(3, 3, 3, 3);
+ int4 _260 = make_int4(0, 0, 0, 0);
+ _258.x = (_259.x<_260.x);
+ _258.y = (_259.y<_260.y);
+ _258.z = (_259.z<_260.z);
+ _258.w = (_259.w<_260.w);
+ ushort4 _261;
+ int4 _262 = make_int4(0, 0, 0, 0);
+ _261.x = (_246.x<=_262.x);
+ _261.y = (_246.y<=_262.y);
+ _261.z = (_246.z<=_262.z);
+ _261.w = (_246.w<=_262.w);
+ _257.x = (_258.x&&_261.x);
+ _257.y = (_258.y&&_261.y);
+ _257.z = (_258.z&&_261.z);
+ _257.w = (_258.w&&_261.w);
+ _250.x = (_251.x||_257.x);
+ _250.y = (_251.y||_257.y);
+ _250.z = (_251.z||_257.z);
+ _250.w = (_251.w||_257.w);
+ int4 _263;
+ int4 _264 = make_int4(3, 3, 3, 3);
+ _263.x = (_246.x+_264.x);
+ _263.y = (_246.y+_264.y);
+ _263.z = (_246.z+_264.z);
+ _263.w = (_246.w+_264.w);
+ _249.x = (bool(_250.x)?_246.x:_263.x);
+ _249.y = (bool(_250.y)?_246.y:_263.y);
+ _249.z = (bool(_250.z)?_246.z:_263.z);
+ _249.w = (bool(_250.w)?_246.w:_263.w);
+ _199.x = (_200.x+_249.x);
+ _199.y = (_200.y+_249.y);
+ _199.z = (_200.z+_249.z);
+ _199.w = (_200.w+_249.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 384)) = make_float4(kernel[_199.x],kernel[_199.y],kernel[_199.z],kernel[_199.w]);
+ int4 _265;
+ int4 _266;
+ int4 _267;
+ int4 _268 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 512) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 512) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 512) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 512) / 48) * 4608)) + (rc_outer_outer [...]
+ int4 _269;
+ int4 _270;
+ int4 _271;
+ int4 _272 = make_int4((((((int)threadIdx.x) * 4) + 512))+(1*0), (((((int)threadIdx.x) * 4) + 512))+(1*1), (((((int)threadIdx.x) * 4) + 512))+(1*2), (((((int)threadIdx.x) * 4) + 512))+(1*3));
+ int4 _273 = make_int4(3, 3, 3, 3);
+ _271.x = (_272.x%_273.x);
+ _271.y = (_272.y%_273.y);
+ _271.z = (_272.z%_273.z);
+ _271.w = (_272.w%_273.w);
+ int4 _274;
+ int4 _275 = make_int4((((((int)threadIdx.x) * 4) + 512))+(1*0), (((((int)threadIdx.x) * 4) + 512))+(1*1), (((((int)threadIdx.x) * 4) + 512))+(1*2), (((((int)threadIdx.x) * 4) + 512))+(1*3));
+ int4 _276 = make_int4(3, 3, 3, 3);
+ _274.x = (_275.x/_276.x);
+ _274.y = (_275.y/_276.y);
+ _274.z = (_275.z/_276.z);
+ _274.w = (_275.w/_276.w);
+ int4 _277;
+ ushort4 _278;
+ ushort4 _279;
+ ushort4 _280;
+ int4 _281 = make_int4(3, 3, 3, 3);
+ int4 _282 = make_int4(0, 0, 0, 0);
+ _280.x = (_281.x>=_282.x);
+ _280.y = (_281.y>=_282.y);
+ _280.z = (_281.z>=_282.z);
+ _280.w = (_281.w>=_282.w);
+ ushort4 _283;
+ int4 _284 = make_int4(0, 0, 0, 0);
+ _283.x = (_271.x>=_284.x);
+ _283.y = (_271.y>=_284.y);
+ _283.z = (_271.z>=_284.z);
+ _283.w = (_271.w>=_284.w);
+ _279.x = (_280.x&&_283.x);
+ _279.y = (_280.y&&_283.y);
+ _279.z = (_280.z&&_283.z);
+ _279.w = (_280.w&&_283.w);
+ ushort4 _285;
+ ushort4 _286;
+ int4 _287 = make_int4(3, 3, 3, 3);
+ int4 _288 = make_int4(0, 0, 0, 0);
+ _286.x = (_287.x<_288.x);
+ _286.y = (_287.y<_288.y);
+ _286.z = (_287.z<_288.z);
+ _286.w = (_287.w<_288.w);
+ ushort4 _289;
+ int4 _290 = make_int4(0, 0, 0, 0);
+ _289.x = (_271.x<=_290.x);
+ _289.y = (_271.y<=_290.y);
+ _289.z = (_271.z<=_290.z);
+ _289.w = (_271.w<=_290.w);
+ _285.x = (_286.x&&_289.x);
+ _285.y = (_286.y&&_289.y);
+ _285.z = (_286.z&&_289.z);
+ _285.w = (_286.w&&_289.w);
+ _278.x = (_279.x||_285.x);
+ _278.y = (_279.y||_285.y);
+ _278.z = (_279.z||_285.z);
+ _278.w = (_279.w||_285.w);
+ int4 _291;
+ int4 _292 = make_int4(1, 1, 1, 1);
+ _291.x = (_274.x-_292.x);
+ _291.y = (_274.y-_292.y);
+ _291.z = (_274.z-_292.z);
+ _291.w = (_274.w-_292.w);
+ _277.x = (bool(_278.x)?_274.x:_291.x);
+ _277.y = (bool(_278.y)?_274.y:_291.y);
+ _277.z = (bool(_278.z)?_274.z:_291.z);
+ _277.w = (bool(_278.w)?_274.w:_291.w);
+ int4 _293 = make_int4(16, 16, 16, 16);
+ _270.x = (_277.x%_293.x);
+ _270.y = (_277.y%_293.y);
+ _270.z = (_277.z%_293.z);
+ _270.w = (_277.w%_293.w);
+ int4 _294;
+ ushort4 _295;
+ ushort4 _296;
+ ushort4 _297;
+ int4 _298 = make_int4(16, 16, 16, 16);
+ int4 _299 = make_int4(0, 0, 0, 0);
+ _297.x = (_298.x>=_299.x);
+ _297.y = (_298.y>=_299.y);
+ _297.z = (_298.z>=_299.z);
+ _297.w = (_298.w>=_299.w);
+ ushort4 _300;
+ int4 _301 = make_int4(0, 0, 0, 0);
+ _300.x = (_270.x>=_301.x);
+ _300.y = (_270.y>=_301.y);
+ _300.z = (_270.z>=_301.z);
+ _300.w = (_270.w>=_301.w);
+ _296.x = (_297.x&&_300.x);
+ _296.y = (_297.y&&_300.y);
+ _296.z = (_297.z&&_300.z);
+ _296.w = (_297.w&&_300.w);
+ ushort4 _302;
+ ushort4 _303;
+ int4 _304 = make_int4(16, 16, 16, 16);
+ int4 _305 = make_int4(0, 0, 0, 0);
+ _303.x = (_304.x<_305.x);
+ _303.y = (_304.y<_305.y);
+ _303.z = (_304.z<_305.z);
+ _303.w = (_304.w<_305.w);
+ ushort4 _306;
+ int4 _307 = make_int4(0, 0, 0, 0);
+ _306.x = (_270.x<=_307.x);
+ _306.y = (_270.y<=_307.y);
+ _306.z = (_270.z<=_307.z);
+ _306.w = (_270.w<=_307.w);
+ _302.x = (_303.x&&_306.x);
+ _302.y = (_303.y&&_306.y);
+ _302.z = (_303.z&&_306.z);
+ _302.w = (_303.w&&_306.w);
+ _295.x = (_296.x||_302.x);
+ _295.y = (_296.y||_302.y);
+ _295.z = (_296.z||_302.z);
+ _295.w = (_296.w||_302.w);
+ int4 _308;
+ int4 _309 = make_int4(16, 16, 16, 16);
+ _308.x = (_270.x+_309.x);
+ _308.y = (_270.y+_309.y);
+ _308.z = (_270.z+_309.z);
+ _308.w = (_270.w+_309.w);
+ _294.x = (bool(_295.x)?_270.x:_308.x);
+ _294.y = (bool(_295.y)?_270.y:_308.y);
+ _294.z = (bool(_295.z)?_270.z:_308.z);
+ _294.w = (bool(_295.w)?_270.w:_308.w);
+ int4 _310 = make_int4(9, 9, 9, 9);
+ _269.x = (_294.x*_310.x);
+ _269.y = (_294.y*_310.y);
+ _269.z = (_294.z*_310.z);
+ _269.w = (_294.w*_310.w);
+ _267.x = (_268.x+_269.x);
+ _267.y = (_268.y+_269.y);
+ _267.z = (_268.z+_269.z);
+ _267.w = (_268.w+_269.w);
+ int4 _311 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _266.x = (_267.x+_311.x);
+ _266.y = (_267.y+_311.y);
+ _266.z = (_267.z+_311.z);
+ _266.w = (_267.w+_311.w);
+ int4 _312;
+ int4 _313 = make_int4(((((int)threadIdx.x) + 128))+(1*0), ((((int)threadIdx.x) + 128))+(1*1), ((((int)threadIdx.x) + 128))+(1*2), ((((int)threadIdx.x) + 128))+(1*3));
+ int4 _314 = make_int4(3, 3, 3, 3);
+ _312.x = (_313.x%_314.x);
+ _312.y = (_313.y%_314.y);
+ _312.z = (_313.z%_314.z);
+ _312.w = (_313.w%_314.w);
+ int4 _315;
+ ushort4 _316;
+ ushort4 _317;
+ ushort4 _318;
+ int4 _319 = make_int4(3, 3, 3, 3);
+ int4 _320 = make_int4(0, 0, 0, 0);
+ _318.x = (_319.x>=_320.x);
+ _318.y = (_319.y>=_320.y);
+ _318.z = (_319.z>=_320.z);
+ _318.w = (_319.w>=_320.w);
+ ushort4 _321;
+ int4 _322 = make_int4(0, 0, 0, 0);
+ _321.x = (_312.x>=_322.x);
+ _321.y = (_312.y>=_322.y);
+ _321.z = (_312.z>=_322.z);
+ _321.w = (_312.w>=_322.w);
+ _317.x = (_318.x&&_321.x);
+ _317.y = (_318.y&&_321.y);
+ _317.z = (_318.z&&_321.z);
+ _317.w = (_318.w&&_321.w);
+ ushort4 _323;
+ ushort4 _324;
+ int4 _325 = make_int4(3, 3, 3, 3);
+ int4 _326 = make_int4(0, 0, 0, 0);
+ _324.x = (_325.x<_326.x);
+ _324.y = (_325.y<_326.y);
+ _324.z = (_325.z<_326.z);
+ _324.w = (_325.w<_326.w);
+ ushort4 _327;
+ int4 _328 = make_int4(0, 0, 0, 0);
+ _327.x = (_312.x<=_328.x);
+ _327.y = (_312.y<=_328.y);
+ _327.z = (_312.z<=_328.z);
+ _327.w = (_312.w<=_328.w);
+ _323.x = (_324.x&&_327.x);
+ _323.y = (_324.y&&_327.y);
+ _323.z = (_324.z&&_327.z);
+ _323.w = (_324.w&&_327.w);
+ _316.x = (_317.x||_323.x);
+ _316.y = (_317.y||_323.y);
+ _316.z = (_317.z||_323.z);
+ _316.w = (_317.w||_323.w);
+ int4 _329;
+ int4 _330 = make_int4(3, 3, 3, 3);
+ _329.x = (_312.x+_330.x);
+ _329.y = (_312.y+_330.y);
+ _329.z = (_312.z+_330.z);
+ _329.w = (_312.w+_330.w);
+ _315.x = (bool(_316.x)?_312.x:_329.x);
+ _315.y = (bool(_316.y)?_312.y:_329.y);
+ _315.z = (bool(_316.z)?_312.z:_329.z);
+ _315.w = (bool(_316.w)?_312.w:_329.w);
+ _265.x = (_266.x+_315.x);
+ _265.y = (_266.y+_315.y);
+ _265.z = (_266.z+_315.z);
+ _265.w = (_266.w+_315.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 512)) = make_float4(kernel[_265.x],kernel[_265.y],kernel[_265.z],kernel[_265.w]);
+ int4 _331;
+ int4 _332;
+ int4 _333;
+ int4 _334 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 640) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 640) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 640) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 640) / 48) * 4608)) + (rc_outer_outer [...]
+ int4 _335;
+ int4 _336;
+ int4 _337;
+ int4 _338 = make_int4((((((int)threadIdx.x) * 4) + 640))+(1*0), (((((int)threadIdx.x) * 4) + 640))+(1*1), (((((int)threadIdx.x) * 4) + 640))+(1*2), (((((int)threadIdx.x) * 4) + 640))+(1*3));
+ int4 _339 = make_int4(3, 3, 3, 3);
+ _337.x = (_338.x%_339.x);
+ _337.y = (_338.y%_339.y);
+ _337.z = (_338.z%_339.z);
+ _337.w = (_338.w%_339.w);
+ int4 _340;
+ int4 _341 = make_int4((((((int)threadIdx.x) * 4) + 640))+(1*0), (((((int)threadIdx.x) * 4) + 640))+(1*1), (((((int)threadIdx.x) * 4) + 640))+(1*2), (((((int)threadIdx.x) * 4) + 640))+(1*3));
+ int4 _342 = make_int4(3, 3, 3, 3);
+ _340.x = (_341.x/_342.x);
+ _340.y = (_341.y/_342.y);
+ _340.z = (_341.z/_342.z);
+ _340.w = (_341.w/_342.w);
+ int4 _343;
+ ushort4 _344;
+ ushort4 _345;
+ ushort4 _346;
+ int4 _347 = make_int4(3, 3, 3, 3);
+ int4 _348 = make_int4(0, 0, 0, 0);
+ _346.x = (_347.x>=_348.x);
+ _346.y = (_347.y>=_348.y);
+ _346.z = (_347.z>=_348.z);
+ _346.w = (_347.w>=_348.w);
+ ushort4 _349;
+ int4 _350 = make_int4(0, 0, 0, 0);
+ _349.x = (_337.x>=_350.x);
+ _349.y = (_337.y>=_350.y);
+ _349.z = (_337.z>=_350.z);
+ _349.w = (_337.w>=_350.w);
+ _345.x = (_346.x&&_349.x);
+ _345.y = (_346.y&&_349.y);
+ _345.z = (_346.z&&_349.z);
+ _345.w = (_346.w&&_349.w);
+ ushort4 _351;
+ ushort4 _352;
+ int4 _353 = make_int4(3, 3, 3, 3);
+ int4 _354 = make_int4(0, 0, 0, 0);
+ _352.x = (_353.x<_354.x);
+ _352.y = (_353.y<_354.y);
+ _352.z = (_353.z<_354.z);
+ _352.w = (_353.w<_354.w);
+ ushort4 _355;
+ int4 _356 = make_int4(0, 0, 0, 0);
+ _355.x = (_337.x<=_356.x);
+ _355.y = (_337.y<=_356.y);
+ _355.z = (_337.z<=_356.z);
+ _355.w = (_337.w<=_356.w);
+ _351.x = (_352.x&&_355.x);
+ _351.y = (_352.y&&_355.y);
+ _351.z = (_352.z&&_355.z);
+ _351.w = (_352.w&&_355.w);
+ _344.x = (_345.x||_351.x);
+ _344.y = (_345.y||_351.y);
+ _344.z = (_345.z||_351.z);
+ _344.w = (_345.w||_351.w);
+ int4 _357;
+ int4 _358 = make_int4(1, 1, 1, 1);
+ _357.x = (_340.x-_358.x);
+ _357.y = (_340.y-_358.y);
+ _357.z = (_340.z-_358.z);
+ _357.w = (_340.w-_358.w);
+ _343.x = (bool(_344.x)?_340.x:_357.x);
+ _343.y = (bool(_344.y)?_340.y:_357.y);
+ _343.z = (bool(_344.z)?_340.z:_357.z);
+ _343.w = (bool(_344.w)?_340.w:_357.w);
+ int4 _359 = make_int4(16, 16, 16, 16);
+ _336.x = (_343.x%_359.x);
+ _336.y = (_343.y%_359.y);
+ _336.z = (_343.z%_359.z);
+ _336.w = (_343.w%_359.w);
+ int4 _360;
+ ushort4 _361;
+ ushort4 _362;
+ ushort4 _363;
+ int4 _364 = make_int4(16, 16, 16, 16);
+ int4 _365 = make_int4(0, 0, 0, 0);
+ _363.x = (_364.x>=_365.x);
+ _363.y = (_364.y>=_365.y);
+ _363.z = (_364.z>=_365.z);
+ _363.w = (_364.w>=_365.w);
+ ushort4 _366;
+ int4 _367 = make_int4(0, 0, 0, 0);
+ _366.x = (_336.x>=_367.x);
+ _366.y = (_336.y>=_367.y);
+ _366.z = (_336.z>=_367.z);
+ _366.w = (_336.w>=_367.w);
+ _362.x = (_363.x&&_366.x);
+ _362.y = (_363.y&&_366.y);
+ _362.z = (_363.z&&_366.z);
+ _362.w = (_363.w&&_366.w);
+ ushort4 _368;
+ ushort4 _369;
+ int4 _370 = make_int4(16, 16, 16, 16);
+ int4 _371 = make_int4(0, 0, 0, 0);
+ _369.x = (_370.x<_371.x);
+ _369.y = (_370.y<_371.y);
+ _369.z = (_370.z<_371.z);
+ _369.w = (_370.w<_371.w);
+ ushort4 _372;
+ int4 _373 = make_int4(0, 0, 0, 0);
+ _372.x = (_336.x<=_373.x);
+ _372.y = (_336.y<=_373.y);
+ _372.z = (_336.z<=_373.z);
+ _372.w = (_336.w<=_373.w);
+ _368.x = (_369.x&&_372.x);
+ _368.y = (_369.y&&_372.y);
+ _368.z = (_369.z&&_372.z);
+ _368.w = (_369.w&&_372.w);
+ _361.x = (_362.x||_368.x);
+ _361.y = (_362.y||_368.y);
+ _361.z = (_362.z||_368.z);
+ _361.w = (_362.w||_368.w);
+ int4 _374;
+ int4 _375 = make_int4(16, 16, 16, 16);
+ _374.x = (_336.x+_375.x);
+ _374.y = (_336.y+_375.y);
+ _374.z = (_336.z+_375.z);
+ _374.w = (_336.w+_375.w);
+ _360.x = (bool(_361.x)?_336.x:_374.x);
+ _360.y = (bool(_361.y)?_336.y:_374.y);
+ _360.z = (bool(_361.z)?_336.z:_374.z);
+ _360.w = (bool(_361.w)?_336.w:_374.w);
+ int4 _376 = make_int4(9, 9, 9, 9);
+ _335.x = (_360.x*_376.x);
+ _335.y = (_360.y*_376.y);
+ _335.z = (_360.z*_376.z);
+ _335.w = (_360.w*_376.w);
+ _333.x = (_334.x+_335.x);
+ _333.y = (_334.y+_335.y);
+ _333.z = (_334.z+_335.z);
+ _333.w = (_334.w+_335.w);
+ int4 _377 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _332.x = (_333.x+_377.x);
+ _332.y = (_333.y+_377.y);
+ _332.z = (_333.z+_377.z);
+ _332.w = (_333.w+_377.w);
+ int4 _378;
+ int4 _379 = make_int4(((((int)threadIdx.x) + 160))+(1*0), ((((int)threadIdx.x) + 160))+(1*1), ((((int)threadIdx.x) + 160))+(1*2), ((((int)threadIdx.x) + 160))+(1*3));
+ int4 _380 = make_int4(3, 3, 3, 3);
+ _378.x = (_379.x%_380.x);
+ _378.y = (_379.y%_380.y);
+ _378.z = (_379.z%_380.z);
+ _378.w = (_379.w%_380.w);
+ int4 _381;
+ ushort4 _382;
+ ushort4 _383;
+ ushort4 _384;
+ int4 _385 = make_int4(3, 3, 3, 3);
+ int4 _386 = make_int4(0, 0, 0, 0);
+ _384.x = (_385.x>=_386.x);
+ _384.y = (_385.y>=_386.y);
+ _384.z = (_385.z>=_386.z);
+ _384.w = (_385.w>=_386.w);
+ ushort4 _387;
+ int4 _388 = make_int4(0, 0, 0, 0);
+ _387.x = (_378.x>=_388.x);
+ _387.y = (_378.y>=_388.y);
+ _387.z = (_378.z>=_388.z);
+ _387.w = (_378.w>=_388.w);
+ _383.x = (_384.x&&_387.x);
+ _383.y = (_384.y&&_387.y);
+ _383.z = (_384.z&&_387.z);
+ _383.w = (_384.w&&_387.w);
+ ushort4 _389;
+ ushort4 _390;
+ int4 _391 = make_int4(3, 3, 3, 3);
+ int4 _392 = make_int4(0, 0, 0, 0);
+ _390.x = (_391.x<_392.x);
+ _390.y = (_391.y<_392.y);
+ _390.z = (_391.z<_392.z);
+ _390.w = (_391.w<_392.w);
+ ushort4 _393;
+ int4 _394 = make_int4(0, 0, 0, 0);
+ _393.x = (_378.x<=_394.x);
+ _393.y = (_378.y<=_394.y);
+ _393.z = (_378.z<=_394.z);
+ _393.w = (_378.w<=_394.w);
+ _389.x = (_390.x&&_393.x);
+ _389.y = (_390.y&&_393.y);
+ _389.z = (_390.z&&_393.z);
+ _389.w = (_390.w&&_393.w);
+ _382.x = (_383.x||_389.x);
+ _382.y = (_383.y||_389.y);
+ _382.z = (_383.z||_389.z);
+ _382.w = (_383.w||_389.w);
+ int4 _395;
+ int4 _396 = make_int4(3, 3, 3, 3);
+ _395.x = (_378.x+_396.x);
+ _395.y = (_378.y+_396.y);
+ _395.z = (_378.z+_396.z);
+ _395.w = (_378.w+_396.w);
+ _381.x = (bool(_382.x)?_378.x:_395.x);
+ _381.y = (bool(_382.y)?_378.y:_395.y);
+ _381.z = (bool(_382.z)?_378.z:_395.z);
+ _381.w = (bool(_382.w)?_378.w:_395.w);
+ _331.x = (_332.x+_381.x);
+ _331.y = (_332.y+_381.y);
+ _331.z = (_332.z+_381.z);
+ _331.w = (_332.w+_381.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 640)) = make_float4(kernel[_331.x],kernel[_331.y],kernel[_331.z],kernel[_331.w]);
+ int4 _397;
+ int4 _398;
+ int4 _399;
+ int4 _400 = make_int4((((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 73728), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 73728), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 73728), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 73728));
+ int4 _401;
+ int4 _402;
+ int4 _403;
+ int4 _404 = make_int4((((((int)threadIdx.x) * 4) + 768))+(1*0), (((((int)threadIdx.x) * 4) + 768))+(1*1), (((((int)threadIdx.x) * 4) + 768))+(1*2), (((((int)threadIdx.x) * 4) + 768))+(1*3));
+ int4 _405 = make_int4(3, 3, 3, 3);
+ _403.x = (_404.x%_405.x);
+ _403.y = (_404.y%_405.y);
+ _403.z = (_404.z%_405.z);
+ _403.w = (_404.w%_405.w);
+ int4 _406;
+ int4 _407 = make_int4((((((int)threadIdx.x) * 4) + 768))+(1*0), (((((int)threadIdx.x) * 4) + 768))+(1*1), (((((int)threadIdx.x) * 4) + 768))+(1*2), (((((int)threadIdx.x) * 4) + 768))+(1*3));
+ int4 _408 = make_int4(3, 3, 3, 3);
+ _406.x = (_407.x/_408.x);
+ _406.y = (_407.y/_408.y);
+ _406.z = (_407.z/_408.z);
+ _406.w = (_407.w/_408.w);
+ int4 _409;
+ ushort4 _410;
+ ushort4 _411;
+ ushort4 _412;
+ int4 _413 = make_int4(3, 3, 3, 3);
+ int4 _414 = make_int4(0, 0, 0, 0);
+ _412.x = (_413.x>=_414.x);
+ _412.y = (_413.y>=_414.y);
+ _412.z = (_413.z>=_414.z);
+ _412.w = (_413.w>=_414.w);
+ ushort4 _415;
+ int4 _416 = make_int4(0, 0, 0, 0);
+ _415.x = (_403.x>=_416.x);
+ _415.y = (_403.y>=_416.y);
+ _415.z = (_403.z>=_416.z);
+ _415.w = (_403.w>=_416.w);
+ _411.x = (_412.x&&_415.x);
+ _411.y = (_412.y&&_415.y);
+ _411.z = (_412.z&&_415.z);
+ _411.w = (_412.w&&_415.w);
+ ushort4 _417;
+ ushort4 _418;
+ int4 _419 = make_int4(3, 3, 3, 3);
+ int4 _420 = make_int4(0, 0, 0, 0);
+ _418.x = (_419.x<_420.x);
+ _418.y = (_419.y<_420.y);
+ _418.z = (_419.z<_420.z);
+ _418.w = (_419.w<_420.w);
+ ushort4 _421;
+ int4 _422 = make_int4(0, 0, 0, 0);
+ _421.x = (_403.x<=_422.x);
+ _421.y = (_403.y<=_422.y);
+ _421.z = (_403.z<=_422.z);
+ _421.w = (_403.w<=_422.w);
+ _417.x = (_418.x&&_421.x);
+ _417.y = (_418.y&&_421.y);
+ _417.z = (_418.z&&_421.z);
+ _417.w = (_418.w&&_421.w);
+ _410.x = (_411.x||_417.x);
+ _410.y = (_411.y||_417.y);
+ _410.z = (_411.z||_417.z);
+ _410.w = (_411.w||_417.w);
+ int4 _423;
+ int4 _424 = make_int4(1, 1, 1, 1);
+ _423.x = (_406.x-_424.x);
+ _423.y = (_406.y-_424.y);
+ _423.z = (_406.z-_424.z);
+ _423.w = (_406.w-_424.w);
+ _409.x = (bool(_410.x)?_406.x:_423.x);
+ _409.y = (bool(_410.y)?_406.y:_423.y);
+ _409.z = (bool(_410.z)?_406.z:_423.z);
+ _409.w = (bool(_410.w)?_406.w:_423.w);
+ int4 _425 = make_int4(16, 16, 16, 16);
+ _402.x = (_409.x%_425.x);
+ _402.y = (_409.y%_425.y);
+ _402.z = (_409.z%_425.z);
+ _402.w = (_409.w%_425.w);
+ int4 _426;
+ ushort4 _427;
+ ushort4 _428;
+ ushort4 _429;
+ int4 _430 = make_int4(16, 16, 16, 16);
+ int4 _431 = make_int4(0, 0, 0, 0);
+ _429.x = (_430.x>=_431.x);
+ _429.y = (_430.y>=_431.y);
+ _429.z = (_430.z>=_431.z);
+ _429.w = (_430.w>=_431.w);
+ ushort4 _432;
+ int4 _433 = make_int4(0, 0, 0, 0);
+ _432.x = (_402.x>=_433.x);
+ _432.y = (_402.y>=_433.y);
+ _432.z = (_402.z>=_433.z);
+ _432.w = (_402.w>=_433.w);
+ _428.x = (_429.x&&_432.x);
+ _428.y = (_429.y&&_432.y);
+ _428.z = (_429.z&&_432.z);
+ _428.w = (_429.w&&_432.w);
+ ushort4 _434;
+ ushort4 _435;
+ int4 _436 = make_int4(16, 16, 16, 16);
+ int4 _437 = make_int4(0, 0, 0, 0);
+ _435.x = (_436.x<_437.x);
+ _435.y = (_436.y<_437.y);
+ _435.z = (_436.z<_437.z);
+ _435.w = (_436.w<_437.w);
+ ushort4 _438;
+ int4 _439 = make_int4(0, 0, 0, 0);
+ _438.x = (_402.x<=_439.x);
+ _438.y = (_402.y<=_439.y);
+ _438.z = (_402.z<=_439.z);
+ _438.w = (_402.w<=_439.w);
+ _434.x = (_435.x&&_438.x);
+ _434.y = (_435.y&&_438.y);
+ _434.z = (_435.z&&_438.z);
+ _434.w = (_435.w&&_438.w);
+ _427.x = (_428.x||_434.x);
+ _427.y = (_428.y||_434.y);
+ _427.z = (_428.z||_434.z);
+ _427.w = (_428.w||_434.w);
+ int4 _440;
+ int4 _441 = make_int4(16, 16, 16, 16);
+ _440.x = (_402.x+_441.x);
+ _440.y = (_402.y+_441.y);
+ _440.z = (_402.z+_441.z);
+ _440.w = (_402.w+_441.w);
+ _426.x = (bool(_427.x)?_402.x:_440.x);
+ _426.y = (bool(_427.y)?_402.y:_440.y);
+ _426.z = (bool(_427.z)?_402.z:_440.z);
+ _426.w = (bool(_427.w)?_402.w:_440.w);
+ int4 _442 = make_int4(9, 9, 9, 9);
+ _401.x = (_426.x*_442.x);
+ _401.y = (_426.y*_442.y);
+ _401.z = (_426.z*_442.z);
+ _401.w = (_426.w*_442.w);
+ _399.x = (_400.x+_401.x);
+ _399.y = (_400.y+_401.y);
+ _399.z = (_400.z+_401.z);
+ _399.w = (_400.w+_401.w);
+ int4 _443 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _398.x = (_399.x+_443.x);
+ _398.y = (_399.y+_443.y);
+ _398.z = (_399.z+_443.z);
+ _398.w = (_399.w+_443.w);
+ int4 _444;
+ int4 _445 = make_int4(((((int)threadIdx.x) + 192))+(1*0), ((((int)threadIdx.x) + 192))+(1*1), ((((int)threadIdx.x) + 192))+(1*2), ((((int)threadIdx.x) + 192))+(1*3));
+ int4 _446 = make_int4(3, 3, 3, 3);
+ _444.x = (_445.x%_446.x);
+ _444.y = (_445.y%_446.y);
+ _444.z = (_445.z%_446.z);
+ _444.w = (_445.w%_446.w);
+ int4 _447;
+ ushort4 _448;
+ ushort4 _449;
+ ushort4 _450;
+ int4 _451 = make_int4(3, 3, 3, 3);
+ int4 _452 = make_int4(0, 0, 0, 0);
+ _450.x = (_451.x>=_452.x);
+ _450.y = (_451.y>=_452.y);
+ _450.z = (_451.z>=_452.z);
+ _450.w = (_451.w>=_452.w);
+ ushort4 _453;
+ int4 _454 = make_int4(0, 0, 0, 0);
+ _453.x = (_444.x>=_454.x);
+ _453.y = (_444.y>=_454.y);
+ _453.z = (_444.z>=_454.z);
+ _453.w = (_444.w>=_454.w);
+ _449.x = (_450.x&&_453.x);
+ _449.y = (_450.y&&_453.y);
+ _449.z = (_450.z&&_453.z);
+ _449.w = (_450.w&&_453.w);
+ ushort4 _455;
+ ushort4 _456;
+ int4 _457 = make_int4(3, 3, 3, 3);
+ int4 _458 = make_int4(0, 0, 0, 0);
+ _456.x = (_457.x<_458.x);
+ _456.y = (_457.y<_458.y);
+ _456.z = (_457.z<_458.z);
+ _456.w = (_457.w<_458.w);
+ ushort4 _459;
+ int4 _460 = make_int4(0, 0, 0, 0);
+ _459.x = (_444.x<=_460.x);
+ _459.y = (_444.y<=_460.y);
+ _459.z = (_444.z<=_460.z);
+ _459.w = (_444.w<=_460.w);
+ _455.x = (_456.x&&_459.x);
+ _455.y = (_456.y&&_459.y);
+ _455.z = (_456.z&&_459.z);
+ _455.w = (_456.w&&_459.w);
+ _448.x = (_449.x||_455.x);
+ _448.y = (_449.y||_455.y);
+ _448.z = (_449.z||_455.z);
+ _448.w = (_449.w||_455.w);
+ int4 _461;
+ int4 _462 = make_int4(3, 3, 3, 3);
+ _461.x = (_444.x+_462.x);
+ _461.y = (_444.y+_462.y);
+ _461.z = (_444.z+_462.z);
+ _461.w = (_444.w+_462.w);
+ _447.x = (bool(_448.x)?_444.x:_461.x);
+ _447.y = (bool(_448.y)?_444.y:_461.y);
+ _447.z = (bool(_448.z)?_444.z:_461.z);
+ _447.w = (bool(_448.w)?_444.w:_461.w);
+ _397.x = (_398.x+_447.x);
+ _397.y = (_398.y+_447.y);
+ _397.z = (_398.z+_447.z);
+ _397.w = (_398.w+_447.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 768)) = make_float4(kernel[_397.x],kernel[_397.y],kernel[_397.z],kernel[_397.w]);
+ int4 _463;
+ int4 _464;
+ int4 _465;
+ int4 _466 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 896) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 896) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 896) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 896) / 48) * 4608)) + (rc_outer_outer [...]
+ int4 _467;
+ int4 _468;
+ int4 _469;
+ int4 _470 = make_int4((((((int)threadIdx.x) * 4) + 896))+(1*0), (((((int)threadIdx.x) * 4) + 896))+(1*1), (((((int)threadIdx.x) * 4) + 896))+(1*2), (((((int)threadIdx.x) * 4) + 896))+(1*3));
+ int4 _471 = make_int4(3, 3, 3, 3);
+ _469.x = (_470.x%_471.x);
+ _469.y = (_470.y%_471.y);
+ _469.z = (_470.z%_471.z);
+ _469.w = (_470.w%_471.w);
+ int4 _472;
+ int4 _473 = make_int4((((((int)threadIdx.x) * 4) + 896))+(1*0), (((((int)threadIdx.x) * 4) + 896))+(1*1), (((((int)threadIdx.x) * 4) + 896))+(1*2), (((((int)threadIdx.x) * 4) + 896))+(1*3));
+ int4 _474 = make_int4(3, 3, 3, 3);
+ _472.x = (_473.x/_474.x);
+ _472.y = (_473.y/_474.y);
+ _472.z = (_473.z/_474.z);
+ _472.w = (_473.w/_474.w);
+ int4 _475;
+ ushort4 _476;
+ ushort4 _477;
+ ushort4 _478;
+ int4 _479 = make_int4(3, 3, 3, 3);
+ int4 _480 = make_int4(0, 0, 0, 0);
+ _478.x = (_479.x>=_480.x);
+ _478.y = (_479.y>=_480.y);
+ _478.z = (_479.z>=_480.z);
+ _478.w = (_479.w>=_480.w);
+ ushort4 _481;
+ int4 _482 = make_int4(0, 0, 0, 0);
+ _481.x = (_469.x>=_482.x);
+ _481.y = (_469.y>=_482.y);
+ _481.z = (_469.z>=_482.z);
+ _481.w = (_469.w>=_482.w);
+ _477.x = (_478.x&&_481.x);
+ _477.y = (_478.y&&_481.y);
+ _477.z = (_478.z&&_481.z);
+ _477.w = (_478.w&&_481.w);
+ ushort4 _483;
+ ushort4 _484;
+ int4 _485 = make_int4(3, 3, 3, 3);
+ int4 _486 = make_int4(0, 0, 0, 0);
+ _484.x = (_485.x<_486.x);
+ _484.y = (_485.y<_486.y);
+ _484.z = (_485.z<_486.z);
+ _484.w = (_485.w<_486.w);
+ ushort4 _487;
+ int4 _488 = make_int4(0, 0, 0, 0);
+ _487.x = (_469.x<=_488.x);
+ _487.y = (_469.y<=_488.y);
+ _487.z = (_469.z<=_488.z);
+ _487.w = (_469.w<=_488.w);
+ _483.x = (_484.x&&_487.x);
+ _483.y = (_484.y&&_487.y);
+ _483.z = (_484.z&&_487.z);
+ _483.w = (_484.w&&_487.w);
+ _476.x = (_477.x||_483.x);
+ _476.y = (_477.y||_483.y);
+ _476.z = (_477.z||_483.z);
+ _476.w = (_477.w||_483.w);
+ int4 _489;
+ int4 _490 = make_int4(1, 1, 1, 1);
+ _489.x = (_472.x-_490.x);
+ _489.y = (_472.y-_490.y);
+ _489.z = (_472.z-_490.z);
+ _489.w = (_472.w-_490.w);
+ _475.x = (bool(_476.x)?_472.x:_489.x);
+ _475.y = (bool(_476.y)?_472.y:_489.y);
+ _475.z = (bool(_476.z)?_472.z:_489.z);
+ _475.w = (bool(_476.w)?_472.w:_489.w);
+ int4 _491 = make_int4(16, 16, 16, 16);
+ _468.x = (_475.x%_491.x);
+ _468.y = (_475.y%_491.y);
+ _468.z = (_475.z%_491.z);
+ _468.w = (_475.w%_491.w);
+ int4 _492;
+ ushort4 _493;
+ ushort4 _494;
+ ushort4 _495;
+ int4 _496 = make_int4(16, 16, 16, 16);
+ int4 _497 = make_int4(0, 0, 0, 0);
+ _495.x = (_496.x>=_497.x);
+ _495.y = (_496.y>=_497.y);
+ _495.z = (_496.z>=_497.z);
+ _495.w = (_496.w>=_497.w);
+ ushort4 _498;
+ int4 _499 = make_int4(0, 0, 0, 0);
+ _498.x = (_468.x>=_499.x);
+ _498.y = (_468.y>=_499.y);
+ _498.z = (_468.z>=_499.z);
+ _498.w = (_468.w>=_499.w);
+ _494.x = (_495.x&&_498.x);
+ _494.y = (_495.y&&_498.y);
+ _494.z = (_495.z&&_498.z);
+ _494.w = (_495.w&&_498.w);
+ ushort4 _500;
+ ushort4 _501;
+ int4 _502 = make_int4(16, 16, 16, 16);
+ int4 _503 = make_int4(0, 0, 0, 0);
+ _501.x = (_502.x<_503.x);
+ _501.y = (_502.y<_503.y);
+ _501.z = (_502.z<_503.z);
+ _501.w = (_502.w<_503.w);
+ ushort4 _504;
+ int4 _505 = make_int4(0, 0, 0, 0);
+ _504.x = (_468.x<=_505.x);
+ _504.y = (_468.y<=_505.y);
+ _504.z = (_468.z<=_505.z);
+ _504.w = (_468.w<=_505.w);
+ _500.x = (_501.x&&_504.x);
+ _500.y = (_501.y&&_504.y);
+ _500.z = (_501.z&&_504.z);
+ _500.w = (_501.w&&_504.w);
+ _493.x = (_494.x||_500.x);
+ _493.y = (_494.y||_500.y);
+ _493.z = (_494.z||_500.z);
+ _493.w = (_494.w||_500.w);
+ int4 _506;
+ int4 _507 = make_int4(16, 16, 16, 16);
+ _506.x = (_468.x+_507.x);
+ _506.y = (_468.y+_507.y);
+ _506.z = (_468.z+_507.z);
+ _506.w = (_468.w+_507.w);
+ _492.x = (bool(_493.x)?_468.x:_506.x);
+ _492.y = (bool(_493.y)?_468.y:_506.y);
+ _492.z = (bool(_493.z)?_468.z:_506.z);
+ _492.w = (bool(_493.w)?_468.w:_506.w);
+ int4 _508 = make_int4(9, 9, 9, 9);
+ _467.x = (_492.x*_508.x);
+ _467.y = (_492.y*_508.y);
+ _467.z = (_492.z*_508.z);
+ _467.w = (_492.w*_508.w);
+ _465.x = (_466.x+_467.x);
+ _465.y = (_466.y+_467.y);
+ _465.z = (_466.z+_467.z);
+ _465.w = (_466.w+_467.w);
+ int4 _509 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _464.x = (_465.x+_509.x);
+ _464.y = (_465.y+_509.y);
+ _464.z = (_465.z+_509.z);
+ _464.w = (_465.w+_509.w);
+ int4 _510;
+ int4 _511 = make_int4(((((int)threadIdx.x) + 224))+(1*0), ((((int)threadIdx.x) + 224))+(1*1), ((((int)threadIdx.x) + 224))+(1*2), ((((int)threadIdx.x) + 224))+(1*3));
+ int4 _512 = make_int4(3, 3, 3, 3);
+ _510.x = (_511.x%_512.x);
+ _510.y = (_511.y%_512.y);
+ _510.z = (_511.z%_512.z);
+ _510.w = (_511.w%_512.w);
+ int4 _513;
+ ushort4 _514;
+ ushort4 _515;
+ ushort4 _516;
+ int4 _517 = make_int4(3, 3, 3, 3);
+ int4 _518 = make_int4(0, 0, 0, 0);
+ _516.x = (_517.x>=_518.x);
+ _516.y = (_517.y>=_518.y);
+ _516.z = (_517.z>=_518.z);
+ _516.w = (_517.w>=_518.w);
+ ushort4 _519;
+ int4 _520 = make_int4(0, 0, 0, 0);
+ _519.x = (_510.x>=_520.x);
+ _519.y = (_510.y>=_520.y);
+ _519.z = (_510.z>=_520.z);
+ _519.w = (_510.w>=_520.w);
+ _515.x = (_516.x&&_519.x);
+ _515.y = (_516.y&&_519.y);
+ _515.z = (_516.z&&_519.z);
+ _515.w = (_516.w&&_519.w);
+ ushort4 _521;
+ ushort4 _522;
+ int4 _523 = make_int4(3, 3, 3, 3);
+ int4 _524 = make_int4(0, 0, 0, 0);
+ _522.x = (_523.x<_524.x);
+ _522.y = (_523.y<_524.y);
+ _522.z = (_523.z<_524.z);
+ _522.w = (_523.w<_524.w);
+ ushort4 _525;
+ int4 _526 = make_int4(0, 0, 0, 0);
+ _525.x = (_510.x<=_526.x);
+ _525.y = (_510.y<=_526.y);
+ _525.z = (_510.z<=_526.z);
+ _525.w = (_510.w<=_526.w);
+ _521.x = (_522.x&&_525.x);
+ _521.y = (_522.y&&_525.y);
+ _521.z = (_522.z&&_525.z);
+ _521.w = (_522.w&&_525.w);
+ _514.x = (_515.x||_521.x);
+ _514.y = (_515.y||_521.y);
+ _514.z = (_515.z||_521.z);
+ _514.w = (_515.w||_521.w);
+ int4 _527;
+ int4 _528 = make_int4(3, 3, 3, 3);
+ _527.x = (_510.x+_528.x);
+ _527.y = (_510.y+_528.y);
+ _527.z = (_510.z+_528.z);
+ _527.w = (_510.w+_528.w);
+ _513.x = (bool(_514.x)?_510.x:_527.x);
+ _513.y = (bool(_514.y)?_510.y:_527.y);
+ _513.z = (bool(_514.z)?_510.z:_527.z);
+ _513.w = (bool(_514.w)?_510.w:_527.w);
+ _463.x = (_464.x+_513.x);
+ _463.y = (_464.y+_513.y);
+ _463.z = (_464.z+_513.z);
+ _463.w = (_464.w+_513.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 896)) = make_float4(kernel[_463.x],kernel[_463.y],kernel[_463.z],kernel[_463.w]);
+ int4 _529;
+ int4 _530;
+ int4 _531;
+ int4 _532 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1024) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1024) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1024) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1024) / 48) * 4608)) + (rc_outer_o [...]
+ int4 _533;
+ int4 _534;
+ int4 _535;
+ int4 _536 = make_int4((((((int)threadIdx.x) * 4) + 1024))+(1*0), (((((int)threadIdx.x) * 4) + 1024))+(1*1), (((((int)threadIdx.x) * 4) + 1024))+(1*2), (((((int)threadIdx.x) * 4) + 1024))+(1*3));
+ int4 _537 = make_int4(3, 3, 3, 3);
+ _535.x = (_536.x%_537.x);
+ _535.y = (_536.y%_537.y);
+ _535.z = (_536.z%_537.z);
+ _535.w = (_536.w%_537.w);
+ int4 _538;
+ int4 _539 = make_int4((((((int)threadIdx.x) * 4) + 1024))+(1*0), (((((int)threadIdx.x) * 4) + 1024))+(1*1), (((((int)threadIdx.x) * 4) + 1024))+(1*2), (((((int)threadIdx.x) * 4) + 1024))+(1*3));
+ int4 _540 = make_int4(3, 3, 3, 3);
+ _538.x = (_539.x/_540.x);
+ _538.y = (_539.y/_540.y);
+ _538.z = (_539.z/_540.z);
+ _538.w = (_539.w/_540.w);
+ int4 _541;
+ ushort4 _542;
+ ushort4 _543;
+ ushort4 _544;
+ int4 _545 = make_int4(3, 3, 3, 3);
+ int4 _546 = make_int4(0, 0, 0, 0);
+ _544.x = (_545.x>=_546.x);
+ _544.y = (_545.y>=_546.y);
+ _544.z = (_545.z>=_546.z);
+ _544.w = (_545.w>=_546.w);
+ ushort4 _547;
+ int4 _548 = make_int4(0, 0, 0, 0);
+ _547.x = (_535.x>=_548.x);
+ _547.y = (_535.y>=_548.y);
+ _547.z = (_535.z>=_548.z);
+ _547.w = (_535.w>=_548.w);
+ _543.x = (_544.x&&_547.x);
+ _543.y = (_544.y&&_547.y);
+ _543.z = (_544.z&&_547.z);
+ _543.w = (_544.w&&_547.w);
+ ushort4 _549;
+ ushort4 _550;
+ int4 _551 = make_int4(3, 3, 3, 3);
+ int4 _552 = make_int4(0, 0, 0, 0);
+ _550.x = (_551.x<_552.x);
+ _550.y = (_551.y<_552.y);
+ _550.z = (_551.z<_552.z);
+ _550.w = (_551.w<_552.w);
+ ushort4 _553;
+ int4 _554 = make_int4(0, 0, 0, 0);
+ _553.x = (_535.x<=_554.x);
+ _553.y = (_535.y<=_554.y);
+ _553.z = (_535.z<=_554.z);
+ _553.w = (_535.w<=_554.w);
+ _549.x = (_550.x&&_553.x);
+ _549.y = (_550.y&&_553.y);
+ _549.z = (_550.z&&_553.z);
+ _549.w = (_550.w&&_553.w);
+ _542.x = (_543.x||_549.x);
+ _542.y = (_543.y||_549.y);
+ _542.z = (_543.z||_549.z);
+ _542.w = (_543.w||_549.w);
+ int4 _555;
+ int4 _556 = make_int4(1, 1, 1, 1);
+ _555.x = (_538.x-_556.x);
+ _555.y = (_538.y-_556.y);
+ _555.z = (_538.z-_556.z);
+ _555.w = (_538.w-_556.w);
+ _541.x = (bool(_542.x)?_538.x:_555.x);
+ _541.y = (bool(_542.y)?_538.y:_555.y);
+ _541.z = (bool(_542.z)?_538.z:_555.z);
+ _541.w = (bool(_542.w)?_538.w:_555.w);
+ int4 _557 = make_int4(16, 16, 16, 16);
+ _534.x = (_541.x%_557.x);
+ _534.y = (_541.y%_557.y);
+ _534.z = (_541.z%_557.z);
+ _534.w = (_541.w%_557.w);
+ int4 _558;
+ ushort4 _559;
+ ushort4 _560;
+ ushort4 _561;
+ int4 _562 = make_int4(16, 16, 16, 16);
+ int4 _563 = make_int4(0, 0, 0, 0);
+ _561.x = (_562.x>=_563.x);
+ _561.y = (_562.y>=_563.y);
+ _561.z = (_562.z>=_563.z);
+ _561.w = (_562.w>=_563.w);
+ ushort4 _564;
+ int4 _565 = make_int4(0, 0, 0, 0);
+ _564.x = (_534.x>=_565.x);
+ _564.y = (_534.y>=_565.y);
+ _564.z = (_534.z>=_565.z);
+ _564.w = (_534.w>=_565.w);
+ _560.x = (_561.x&&_564.x);
+ _560.y = (_561.y&&_564.y);
+ _560.z = (_561.z&&_564.z);
+ _560.w = (_561.w&&_564.w);
+ ushort4 _566;
+ ushort4 _567;
+ int4 _568 = make_int4(16, 16, 16, 16);
+ int4 _569 = make_int4(0, 0, 0, 0);
+ _567.x = (_568.x<_569.x);
+ _567.y = (_568.y<_569.y);
+ _567.z = (_568.z<_569.z);
+ _567.w = (_568.w<_569.w);
+ ushort4 _570;
+ int4 _571 = make_int4(0, 0, 0, 0);
+ _570.x = (_534.x<=_571.x);
+ _570.y = (_534.y<=_571.y);
+ _570.z = (_534.z<=_571.z);
+ _570.w = (_534.w<=_571.w);
+ _566.x = (_567.x&&_570.x);
+ _566.y = (_567.y&&_570.y);
+ _566.z = (_567.z&&_570.z);
+ _566.w = (_567.w&&_570.w);
+ _559.x = (_560.x||_566.x);
+ _559.y = (_560.y||_566.y);
+ _559.z = (_560.z||_566.z);
+ _559.w = (_560.w||_566.w);
+ int4 _572;
+ int4 _573 = make_int4(16, 16, 16, 16);
+ _572.x = (_534.x+_573.x);
+ _572.y = (_534.y+_573.y);
+ _572.z = (_534.z+_573.z);
+ _572.w = (_534.w+_573.w);
+ _558.x = (bool(_559.x)?_534.x:_572.x);
+ _558.y = (bool(_559.y)?_534.y:_572.y);
+ _558.z = (bool(_559.z)?_534.z:_572.z);
+ _558.w = (bool(_559.w)?_534.w:_572.w);
+ int4 _574 = make_int4(9, 9, 9, 9);
+ _533.x = (_558.x*_574.x);
+ _533.y = (_558.y*_574.y);
+ _533.z = (_558.z*_574.z);
+ _533.w = (_558.w*_574.w);
+ _531.x = (_532.x+_533.x);
+ _531.y = (_532.y+_533.y);
+ _531.z = (_532.z+_533.z);
+ _531.w = (_532.w+_533.w);
+ int4 _575 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _530.x = (_531.x+_575.x);
+ _530.y = (_531.y+_575.y);
+ _530.z = (_531.z+_575.z);
+ _530.w = (_531.w+_575.w);
+ int4 _576;
+ int4 _577 = make_int4(((((int)threadIdx.x) + 256))+(1*0), ((((int)threadIdx.x) + 256))+(1*1), ((((int)threadIdx.x) + 256))+(1*2), ((((int)threadIdx.x) + 256))+(1*3));
+ int4 _578 = make_int4(3, 3, 3, 3);
+ _576.x = (_577.x%_578.x);
+ _576.y = (_577.y%_578.y);
+ _576.z = (_577.z%_578.z);
+ _576.w = (_577.w%_578.w);
+ int4 _579;
+ ushort4 _580;
+ ushort4 _581;
+ ushort4 _582;
+ int4 _583 = make_int4(3, 3, 3, 3);
+ int4 _584 = make_int4(0, 0, 0, 0);
+ _582.x = (_583.x>=_584.x);
+ _582.y = (_583.y>=_584.y);
+ _582.z = (_583.z>=_584.z);
+ _582.w = (_583.w>=_584.w);
+ ushort4 _585;
+ int4 _586 = make_int4(0, 0, 0, 0);
+ _585.x = (_576.x>=_586.x);
+ _585.y = (_576.y>=_586.y);
+ _585.z = (_576.z>=_586.z);
+ _585.w = (_576.w>=_586.w);
+ _581.x = (_582.x&&_585.x);
+ _581.y = (_582.y&&_585.y);
+ _581.z = (_582.z&&_585.z);
+ _581.w = (_582.w&&_585.w);
+ ushort4 _587;
+ ushort4 _588;
+ int4 _589 = make_int4(3, 3, 3, 3);
+ int4 _590 = make_int4(0, 0, 0, 0);
+ _588.x = (_589.x<_590.x);
+ _588.y = (_589.y<_590.y);
+ _588.z = (_589.z<_590.z);
+ _588.w = (_589.w<_590.w);
+ ushort4 _591;
+ int4 _592 = make_int4(0, 0, 0, 0);
+ _591.x = (_576.x<=_592.x);
+ _591.y = (_576.y<=_592.y);
+ _591.z = (_576.z<=_592.z);
+ _591.w = (_576.w<=_592.w);
+ _587.x = (_588.x&&_591.x);
+ _587.y = (_588.y&&_591.y);
+ _587.z = (_588.z&&_591.z);
+ _587.w = (_588.w&&_591.w);
+ _580.x = (_581.x||_587.x);
+ _580.y = (_581.y||_587.y);
+ _580.z = (_581.z||_587.z);
+ _580.w = (_581.w||_587.w);
+ int4 _593;
+ int4 _594 = make_int4(3, 3, 3, 3);
+ _593.x = (_576.x+_594.x);
+ _593.y = (_576.y+_594.y);
+ _593.z = (_576.z+_594.z);
+ _593.w = (_576.w+_594.w);
+ _579.x = (bool(_580.x)?_576.x:_593.x);
+ _579.y = (bool(_580.y)?_576.y:_593.y);
+ _579.z = (bool(_580.z)?_576.z:_593.z);
+ _579.w = (bool(_580.w)?_576.w:_593.w);
+ _529.x = (_530.x+_579.x);
+ _529.y = (_530.y+_579.y);
+ _529.z = (_530.z+_579.z);
+ _529.w = (_530.w+_579.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1024)) = make_float4(kernel[_529.x],kernel[_529.y],kernel[_529.z],kernel[_529.w]);
+ int4 _595;
+ int4 _596;
+ int4 _597;
+ int4 _598 = make_int4((((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 110592), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 110592), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 110592), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 110592));
+ int4 _599;
+ int4 _600;
+ int4 _601;
+ int4 _602 = make_int4((((((int)threadIdx.x) * 4) + 1152))+(1*0), (((((int)threadIdx.x) * 4) + 1152))+(1*1), (((((int)threadIdx.x) * 4) + 1152))+(1*2), (((((int)threadIdx.x) * 4) + 1152))+(1*3));
+ int4 _603 = make_int4(3, 3, 3, 3);
+ _601.x = (_602.x%_603.x);
+ _601.y = (_602.y%_603.y);
+ _601.z = (_602.z%_603.z);
+ _601.w = (_602.w%_603.w);
+ int4 _604;
+ int4 _605 = make_int4((((((int)threadIdx.x) * 4) + 1152))+(1*0), (((((int)threadIdx.x) * 4) + 1152))+(1*1), (((((int)threadIdx.x) * 4) + 1152))+(1*2), (((((int)threadIdx.x) * 4) + 1152))+(1*3));
+ int4 _606 = make_int4(3, 3, 3, 3);
+ _604.x = (_605.x/_606.x);
+ _604.y = (_605.y/_606.y);
+ _604.z = (_605.z/_606.z);
+ _604.w = (_605.w/_606.w);
+ int4 _607;
+ ushort4 _608;
+ ushort4 _609;
+ ushort4 _610;
+ int4 _611 = make_int4(3, 3, 3, 3);
+ int4 _612 = make_int4(0, 0, 0, 0);
+ _610.x = (_611.x>=_612.x);
+ _610.y = (_611.y>=_612.y);
+ _610.z = (_611.z>=_612.z);
+ _610.w = (_611.w>=_612.w);
+ ushort4 _613;
+ int4 _614 = make_int4(0, 0, 0, 0);
+ _613.x = (_601.x>=_614.x);
+ _613.y = (_601.y>=_614.y);
+ _613.z = (_601.z>=_614.z);
+ _613.w = (_601.w>=_614.w);
+ _609.x = (_610.x&&_613.x);
+ _609.y = (_610.y&&_613.y);
+ _609.z = (_610.z&&_613.z);
+ _609.w = (_610.w&&_613.w);
+ ushort4 _615;
+ ushort4 _616;
+ int4 _617 = make_int4(3, 3, 3, 3);
+ int4 _618 = make_int4(0, 0, 0, 0);
+ _616.x = (_617.x<_618.x);
+ _616.y = (_617.y<_618.y);
+ _616.z = (_617.z<_618.z);
+ _616.w = (_617.w<_618.w);
+ ushort4 _619;
+ int4 _620 = make_int4(0, 0, 0, 0);
+ _619.x = (_601.x<=_620.x);
+ _619.y = (_601.y<=_620.y);
+ _619.z = (_601.z<=_620.z);
+ _619.w = (_601.w<=_620.w);
+ _615.x = (_616.x&&_619.x);
+ _615.y = (_616.y&&_619.y);
+ _615.z = (_616.z&&_619.z);
+ _615.w = (_616.w&&_619.w);
+ _608.x = (_609.x||_615.x);
+ _608.y = (_609.y||_615.y);
+ _608.z = (_609.z||_615.z);
+ _608.w = (_609.w||_615.w);
+ int4 _621;
+ int4 _622 = make_int4(1, 1, 1, 1);
+ _621.x = (_604.x-_622.x);
+ _621.y = (_604.y-_622.y);
+ _621.z = (_604.z-_622.z);
+ _621.w = (_604.w-_622.w);
+ _607.x = (bool(_608.x)?_604.x:_621.x);
+ _607.y = (bool(_608.y)?_604.y:_621.y);
+ _607.z = (bool(_608.z)?_604.z:_621.z);
+ _607.w = (bool(_608.w)?_604.w:_621.w);
+ int4 _623 = make_int4(16, 16, 16, 16);
+ _600.x = (_607.x%_623.x);
+ _600.y = (_607.y%_623.y);
+ _600.z = (_607.z%_623.z);
+ _600.w = (_607.w%_623.w);
+ int4 _624;
+ ushort4 _625;
+ ushort4 _626;
+ ushort4 _627;
+ int4 _628 = make_int4(16, 16, 16, 16);
+ int4 _629 = make_int4(0, 0, 0, 0);
+ _627.x = (_628.x>=_629.x);
+ _627.y = (_628.y>=_629.y);
+ _627.z = (_628.z>=_629.z);
+ _627.w = (_628.w>=_629.w);
+ ushort4 _630;
+ int4 _631 = make_int4(0, 0, 0, 0);
+ _630.x = (_600.x>=_631.x);
+ _630.y = (_600.y>=_631.y);
+ _630.z = (_600.z>=_631.z);
+ _630.w = (_600.w>=_631.w);
+ _626.x = (_627.x&&_630.x);
+ _626.y = (_627.y&&_630.y);
+ _626.z = (_627.z&&_630.z);
+ _626.w = (_627.w&&_630.w);
+ ushort4 _632;
+ ushort4 _633;
+ int4 _634 = make_int4(16, 16, 16, 16);
+ int4 _635 = make_int4(0, 0, 0, 0);
+ _633.x = (_634.x<_635.x);
+ _633.y = (_634.y<_635.y);
+ _633.z = (_634.z<_635.z);
+ _633.w = (_634.w<_635.w);
+ ushort4 _636;
+ int4 _637 = make_int4(0, 0, 0, 0);
+ _636.x = (_600.x<=_637.x);
+ _636.y = (_600.y<=_637.y);
+ _636.z = (_600.z<=_637.z);
+ _636.w = (_600.w<=_637.w);
+ _632.x = (_633.x&&_636.x);
+ _632.y = (_633.y&&_636.y);
+ _632.z = (_633.z&&_636.z);
+ _632.w = (_633.w&&_636.w);
+ _625.x = (_626.x||_632.x);
+ _625.y = (_626.y||_632.y);
+ _625.z = (_626.z||_632.z);
+ _625.w = (_626.w||_632.w);
+ int4 _638;
+ int4 _639 = make_int4(16, 16, 16, 16);
+ _638.x = (_600.x+_639.x);
+ _638.y = (_600.y+_639.y);
+ _638.z = (_600.z+_639.z);
+ _638.w = (_600.w+_639.w);
+ _624.x = (bool(_625.x)?_600.x:_638.x);
+ _624.y = (bool(_625.y)?_600.y:_638.y);
+ _624.z = (bool(_625.z)?_600.z:_638.z);
+ _624.w = (bool(_625.w)?_600.w:_638.w);
+ int4 _640 = make_int4(9, 9, 9, 9);
+ _599.x = (_624.x*_640.x);
+ _599.y = (_624.y*_640.y);
+ _599.z = (_624.z*_640.z);
+ _599.w = (_624.w*_640.w);
+ _597.x = (_598.x+_599.x);
+ _597.y = (_598.y+_599.y);
+ _597.z = (_598.z+_599.z);
+ _597.w = (_598.w+_599.w);
+ int4 _641 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _596.x = (_597.x+_641.x);
+ _596.y = (_597.y+_641.y);
+ _596.z = (_597.z+_641.z);
+ _596.w = (_597.w+_641.w);
+ int4 _642;
+ int4 _643 = make_int4(((((int)threadIdx.x) + 288))+(1*0), ((((int)threadIdx.x) + 288))+(1*1), ((((int)threadIdx.x) + 288))+(1*2), ((((int)threadIdx.x) + 288))+(1*3));
+ int4 _644 = make_int4(3, 3, 3, 3);
+ _642.x = (_643.x%_644.x);
+ _642.y = (_643.y%_644.y);
+ _642.z = (_643.z%_644.z);
+ _642.w = (_643.w%_644.w);
+ int4 _645;
+ ushort4 _646;
+ ushort4 _647;
+ ushort4 _648;
+ int4 _649 = make_int4(3, 3, 3, 3);
+ int4 _650 = make_int4(0, 0, 0, 0);
+ _648.x = (_649.x>=_650.x);
+ _648.y = (_649.y>=_650.y);
+ _648.z = (_649.z>=_650.z);
+ _648.w = (_649.w>=_650.w);
+ ushort4 _651;
+ int4 _652 = make_int4(0, 0, 0, 0);
+ _651.x = (_642.x>=_652.x);
+ _651.y = (_642.y>=_652.y);
+ _651.z = (_642.z>=_652.z);
+ _651.w = (_642.w>=_652.w);
+ _647.x = (_648.x&&_651.x);
+ _647.y = (_648.y&&_651.y);
+ _647.z = (_648.z&&_651.z);
+ _647.w = (_648.w&&_651.w);
+ ushort4 _653;
+ ushort4 _654;
+ int4 _655 = make_int4(3, 3, 3, 3);
+ int4 _656 = make_int4(0, 0, 0, 0);
+ _654.x = (_655.x<_656.x);
+ _654.y = (_655.y<_656.y);
+ _654.z = (_655.z<_656.z);
+ _654.w = (_655.w<_656.w);
+ ushort4 _657;
+ int4 _658 = make_int4(0, 0, 0, 0);
+ _657.x = (_642.x<=_658.x);
+ _657.y = (_642.y<=_658.y);
+ _657.z = (_642.z<=_658.z);
+ _657.w = (_642.w<=_658.w);
+ _653.x = (_654.x&&_657.x);
+ _653.y = (_654.y&&_657.y);
+ _653.z = (_654.z&&_657.z);
+ _653.w = (_654.w&&_657.w);
+ _646.x = (_647.x||_653.x);
+ _646.y = (_647.y||_653.y);
+ _646.z = (_647.z||_653.z);
+ _646.w = (_647.w||_653.w);
+ int4 _659;
+ int4 _660 = make_int4(3, 3, 3, 3);
+ _659.x = (_642.x+_660.x);
+ _659.y = (_642.y+_660.y);
+ _659.z = (_642.z+_660.z);
+ _659.w = (_642.w+_660.w);
+ _645.x = (bool(_646.x)?_642.x:_659.x);
+ _645.y = (bool(_646.y)?_642.y:_659.y);
+ _645.z = (bool(_646.z)?_642.z:_659.z);
+ _645.w = (bool(_646.w)?_642.w:_659.w);
+ _595.x = (_596.x+_645.x);
+ _595.y = (_596.y+_645.y);
+ _595.z = (_596.z+_645.z);
+ _595.w = (_596.w+_645.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1152)) = make_float4(kernel[_595.x],kernel[_595.y],kernel[_595.z],kernel[_595.w]);
+ int4 _661;
+ int4 _662;
+ int4 _663;
+ int4 _664 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1280) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1280) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1280) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1280) / 48) * 4608)) + (rc_outer_o [...]
+ int4 _665;
+ int4 _666;
+ int4 _667;
+ int4 _668 = make_int4((((((int)threadIdx.x) * 4) + 1280))+(1*0), (((((int)threadIdx.x) * 4) + 1280))+(1*1), (((((int)threadIdx.x) * 4) + 1280))+(1*2), (((((int)threadIdx.x) * 4) + 1280))+(1*3));
+ int4 _669 = make_int4(3, 3, 3, 3);
+ _667.x = (_668.x%_669.x);
+ _667.y = (_668.y%_669.y);
+ _667.z = (_668.z%_669.z);
+ _667.w = (_668.w%_669.w);
+ int4 _670;
+ int4 _671 = make_int4((((((int)threadIdx.x) * 4) + 1280))+(1*0), (((((int)threadIdx.x) * 4) + 1280))+(1*1), (((((int)threadIdx.x) * 4) + 1280))+(1*2), (((((int)threadIdx.x) * 4) + 1280))+(1*3));
+ int4 _672 = make_int4(3, 3, 3, 3);
+ _670.x = (_671.x/_672.x);
+ _670.y = (_671.y/_672.y);
+ _670.z = (_671.z/_672.z);
+ _670.w = (_671.w/_672.w);
+ int4 _673;
+ ushort4 _674;
+ ushort4 _675;
+ ushort4 _676;
+ int4 _677 = make_int4(3, 3, 3, 3);
+ int4 _678 = make_int4(0, 0, 0, 0);
+ _676.x = (_677.x>=_678.x);
+ _676.y = (_677.y>=_678.y);
+ _676.z = (_677.z>=_678.z);
+ _676.w = (_677.w>=_678.w);
+ ushort4 _679;
+ int4 _680 = make_int4(0, 0, 0, 0);
+ _679.x = (_667.x>=_680.x);
+ _679.y = (_667.y>=_680.y);
+ _679.z = (_667.z>=_680.z);
+ _679.w = (_667.w>=_680.w);
+ _675.x = (_676.x&&_679.x);
+ _675.y = (_676.y&&_679.y);
+ _675.z = (_676.z&&_679.z);
+ _675.w = (_676.w&&_679.w);
+ ushort4 _681;
+ ushort4 _682;
+ int4 _683 = make_int4(3, 3, 3, 3);
+ int4 _684 = make_int4(0, 0, 0, 0);
+ _682.x = (_683.x<_684.x);
+ _682.y = (_683.y<_684.y);
+ _682.z = (_683.z<_684.z);
+ _682.w = (_683.w<_684.w);
+ ushort4 _685;
+ int4 _686 = make_int4(0, 0, 0, 0);
+ _685.x = (_667.x<=_686.x);
+ _685.y = (_667.y<=_686.y);
+ _685.z = (_667.z<=_686.z);
+ _685.w = (_667.w<=_686.w);
+ _681.x = (_682.x&&_685.x);
+ _681.y = (_682.y&&_685.y);
+ _681.z = (_682.z&&_685.z);
+ _681.w = (_682.w&&_685.w);
+ _674.x = (_675.x||_681.x);
+ _674.y = (_675.y||_681.y);
+ _674.z = (_675.z||_681.z);
+ _674.w = (_675.w||_681.w);
+ int4 _687;
+ int4 _688 = make_int4(1, 1, 1, 1);
+ _687.x = (_670.x-_688.x);
+ _687.y = (_670.y-_688.y);
+ _687.z = (_670.z-_688.z);
+ _687.w = (_670.w-_688.w);
+ _673.x = (bool(_674.x)?_670.x:_687.x);
+ _673.y = (bool(_674.y)?_670.y:_687.y);
+ _673.z = (bool(_674.z)?_670.z:_687.z);
+ _673.w = (bool(_674.w)?_670.w:_687.w);
+ int4 _689 = make_int4(16, 16, 16, 16);
+ _666.x = (_673.x%_689.x);
+ _666.y = (_673.y%_689.y);
+ _666.z = (_673.z%_689.z);
+ _666.w = (_673.w%_689.w);
+ int4 _690;
+ ushort4 _691;
+ ushort4 _692;
+ ushort4 _693;
+ int4 _694 = make_int4(16, 16, 16, 16);
+ int4 _695 = make_int4(0, 0, 0, 0);
+ _693.x = (_694.x>=_695.x);
+ _693.y = (_694.y>=_695.y);
+ _693.z = (_694.z>=_695.z);
+ _693.w = (_694.w>=_695.w);
+ ushort4 _696;
+ int4 _697 = make_int4(0, 0, 0, 0);
+ _696.x = (_666.x>=_697.x);
+ _696.y = (_666.y>=_697.y);
+ _696.z = (_666.z>=_697.z);
+ _696.w = (_666.w>=_697.w);
+ _692.x = (_693.x&&_696.x);
+ _692.y = (_693.y&&_696.y);
+ _692.z = (_693.z&&_696.z);
+ _692.w = (_693.w&&_696.w);
+ ushort4 _698;
+ ushort4 _699;
+ int4 _700 = make_int4(16, 16, 16, 16);
+ int4 _701 = make_int4(0, 0, 0, 0);
+ _699.x = (_700.x<_701.x);
+ _699.y = (_700.y<_701.y);
+ _699.z = (_700.z<_701.z);
+ _699.w = (_700.w<_701.w);
+ ushort4 _702;
+ int4 _703 = make_int4(0, 0, 0, 0);
+ _702.x = (_666.x<=_703.x);
+ _702.y = (_666.y<=_703.y);
+ _702.z = (_666.z<=_703.z);
+ _702.w = (_666.w<=_703.w);
+ _698.x = (_699.x&&_702.x);
+ _698.y = (_699.y&&_702.y);
+ _698.z = (_699.z&&_702.z);
+ _698.w = (_699.w&&_702.w);
+ _691.x = (_692.x||_698.x);
+ _691.y = (_692.y||_698.y);
+ _691.z = (_692.z||_698.z);
+ _691.w = (_692.w||_698.w);
+ int4 _704;
+ int4 _705 = make_int4(16, 16, 16, 16);
+ _704.x = (_666.x+_705.x);
+ _704.y = (_666.y+_705.y);
+ _704.z = (_666.z+_705.z);
+ _704.w = (_666.w+_705.w);
+ _690.x = (bool(_691.x)?_666.x:_704.x);
+ _690.y = (bool(_691.y)?_666.y:_704.y);
+ _690.z = (bool(_691.z)?_666.z:_704.z);
+ _690.w = (bool(_691.w)?_666.w:_704.w);
+ int4 _706 = make_int4(9, 9, 9, 9);
+ _665.x = (_690.x*_706.x);
+ _665.y = (_690.y*_706.y);
+ _665.z = (_690.z*_706.z);
+ _665.w = (_690.w*_706.w);
+ _663.x = (_664.x+_665.x);
+ _663.y = (_664.y+_665.y);
+ _663.z = (_664.z+_665.z);
+ _663.w = (_664.w+_665.w);
+ int4 _707 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _662.x = (_663.x+_707.x);
+ _662.y = (_663.y+_707.y);
+ _662.z = (_663.z+_707.z);
+ _662.w = (_663.w+_707.w);
+ int4 _708;
+ int4 _709 = make_int4(((((int)threadIdx.x) + 320))+(1*0), ((((int)threadIdx.x) + 320))+(1*1), ((((int)threadIdx.x) + 320))+(1*2), ((((int)threadIdx.x) + 320))+(1*3));
+ int4 _710 = make_int4(3, 3, 3, 3);
+ _708.x = (_709.x%_710.x);
+ _708.y = (_709.y%_710.y);
+ _708.z = (_709.z%_710.z);
+ _708.w = (_709.w%_710.w);
+ int4 _711;
+ ushort4 _712;
+ ushort4 _713;
+ ushort4 _714;
+ int4 _715 = make_int4(3, 3, 3, 3);
+ int4 _716 = make_int4(0, 0, 0, 0);
+ _714.x = (_715.x>=_716.x);
+ _714.y = (_715.y>=_716.y);
+ _714.z = (_715.z>=_716.z);
+ _714.w = (_715.w>=_716.w);
+ ushort4 _717;
+ int4 _718 = make_int4(0, 0, 0, 0);
+ _717.x = (_708.x>=_718.x);
+ _717.y = (_708.y>=_718.y);
+ _717.z = (_708.z>=_718.z);
+ _717.w = (_708.w>=_718.w);
+ _713.x = (_714.x&&_717.x);
+ _713.y = (_714.y&&_717.y);
+ _713.z = (_714.z&&_717.z);
+ _713.w = (_714.w&&_717.w);
+ ushort4 _719;
+ ushort4 _720;
+ int4 _721 = make_int4(3, 3, 3, 3);
+ int4 _722 = make_int4(0, 0, 0, 0);
+ _720.x = (_721.x<_722.x);
+ _720.y = (_721.y<_722.y);
+ _720.z = (_721.z<_722.z);
+ _720.w = (_721.w<_722.w);
+ ushort4 _723;
+ int4 _724 = make_int4(0, 0, 0, 0);
+ _723.x = (_708.x<=_724.x);
+ _723.y = (_708.y<=_724.y);
+ _723.z = (_708.z<=_724.z);
+ _723.w = (_708.w<=_724.w);
+ _719.x = (_720.x&&_723.x);
+ _719.y = (_720.y&&_723.y);
+ _719.z = (_720.z&&_723.z);
+ _719.w = (_720.w&&_723.w);
+ _712.x = (_713.x||_719.x);
+ _712.y = (_713.y||_719.y);
+ _712.z = (_713.z||_719.z);
+ _712.w = (_713.w||_719.w);
+ int4 _725;
+ int4 _726 = make_int4(3, 3, 3, 3);
+ _725.x = (_708.x+_726.x);
+ _725.y = (_708.y+_726.y);
+ _725.z = (_708.z+_726.z);
+ _725.w = (_708.w+_726.w);
+ _711.x = (bool(_712.x)?_708.x:_725.x);
+ _711.y = (bool(_712.y)?_708.y:_725.y);
+ _711.z = (bool(_712.z)?_708.z:_725.z);
+ _711.w = (bool(_712.w)?_708.w:_725.w);
+ _661.x = (_662.x+_711.x);
+ _661.y = (_662.y+_711.y);
+ _661.z = (_662.z+_711.z);
+ _661.w = (_662.w+_711.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1280)) = make_float4(kernel[_661.x],kernel[_661.y],kernel[_661.z],kernel[_661.w]);
+ int4 _727;
+ int4 _728;
+ int4 _729;
+ int4 _730 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1408) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1408) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1408) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1408) / 48) * 4608)) + (rc_outer_o [...]
+ int4 _731;
+ int4 _732;
+ int4 _733;
+ int4 _734 = make_int4((((((int)threadIdx.x) * 4) + 1408))+(1*0), (((((int)threadIdx.x) * 4) + 1408))+(1*1), (((((int)threadIdx.x) * 4) + 1408))+(1*2), (((((int)threadIdx.x) * 4) + 1408))+(1*3));
+ int4 _735 = make_int4(3, 3, 3, 3);
+ _733.x = (_734.x%_735.x);
+ _733.y = (_734.y%_735.y);
+ _733.z = (_734.z%_735.z);
+ _733.w = (_734.w%_735.w);
+ int4 _736;
+ int4 _737 = make_int4((((((int)threadIdx.x) * 4) + 1408))+(1*0), (((((int)threadIdx.x) * 4) + 1408))+(1*1), (((((int)threadIdx.x) * 4) + 1408))+(1*2), (((((int)threadIdx.x) * 4) + 1408))+(1*3));
+ int4 _738 = make_int4(3, 3, 3, 3);
+ _736.x = (_737.x/_738.x);
+ _736.y = (_737.y/_738.y);
+ _736.z = (_737.z/_738.z);
+ _736.w = (_737.w/_738.w);
+ int4 _739;
+ ushort4 _740;
+ ushort4 _741;
+ ushort4 _742;
+ int4 _743 = make_int4(3, 3, 3, 3);
+ int4 _744 = make_int4(0, 0, 0, 0);
+ _742.x = (_743.x>=_744.x);
+ _742.y = (_743.y>=_744.y);
+ _742.z = (_743.z>=_744.z);
+ _742.w = (_743.w>=_744.w);
+ ushort4 _745;
+ int4 _746 = make_int4(0, 0, 0, 0);
+ _745.x = (_733.x>=_746.x);
+ _745.y = (_733.y>=_746.y);
+ _745.z = (_733.z>=_746.z);
+ _745.w = (_733.w>=_746.w);
+ _741.x = (_742.x&&_745.x);
+ _741.y = (_742.y&&_745.y);
+ _741.z = (_742.z&&_745.z);
+ _741.w = (_742.w&&_745.w);
+ ushort4 _747;
+ ushort4 _748;
+ int4 _749 = make_int4(3, 3, 3, 3);
+ int4 _750 = make_int4(0, 0, 0, 0);
+ _748.x = (_749.x<_750.x);
+ _748.y = (_749.y<_750.y);
+ _748.z = (_749.z<_750.z);
+ _748.w = (_749.w<_750.w);
+ ushort4 _751;
+ int4 _752 = make_int4(0, 0, 0, 0);
+ _751.x = (_733.x<=_752.x);
+ _751.y = (_733.y<=_752.y);
+ _751.z = (_733.z<=_752.z);
+ _751.w = (_733.w<=_752.w);
+ _747.x = (_748.x&&_751.x);
+ _747.y = (_748.y&&_751.y);
+ _747.z = (_748.z&&_751.z);
+ _747.w = (_748.w&&_751.w);
+ _740.x = (_741.x||_747.x);
+ _740.y = (_741.y||_747.y);
+ _740.z = (_741.z||_747.z);
+ _740.w = (_741.w||_747.w);
+ int4 _753;
+ int4 _754 = make_int4(1, 1, 1, 1);
+ _753.x = (_736.x-_754.x);
+ _753.y = (_736.y-_754.y);
+ _753.z = (_736.z-_754.z);
+ _753.w = (_736.w-_754.w);
+ _739.x = (bool(_740.x)?_736.x:_753.x);
+ _739.y = (bool(_740.y)?_736.y:_753.y);
+ _739.z = (bool(_740.z)?_736.z:_753.z);
+ _739.w = (bool(_740.w)?_736.w:_753.w);
+ int4 _755 = make_int4(16, 16, 16, 16);
+ _732.x = (_739.x%_755.x);
+ _732.y = (_739.y%_755.y);
+ _732.z = (_739.z%_755.z);
+ _732.w = (_739.w%_755.w);
+ int4 _756;
+ ushort4 _757;
+ ushort4 _758;
+ ushort4 _759;
+ int4 _760 = make_int4(16, 16, 16, 16);
+ int4 _761 = make_int4(0, 0, 0, 0);
+ _759.x = (_760.x>=_761.x);
+ _759.y = (_760.y>=_761.y);
+ _759.z = (_760.z>=_761.z);
+ _759.w = (_760.w>=_761.w);
+ ushort4 _762;
+ int4 _763 = make_int4(0, 0, 0, 0);
+ _762.x = (_732.x>=_763.x);
+ _762.y = (_732.y>=_763.y);
+ _762.z = (_732.z>=_763.z);
+ _762.w = (_732.w>=_763.w);
+ _758.x = (_759.x&&_762.x);
+ _758.y = (_759.y&&_762.y);
+ _758.z = (_759.z&&_762.z);
+ _758.w = (_759.w&&_762.w);
+ ushort4 _764;
+ ushort4 _765;
+ int4 _766 = make_int4(16, 16, 16, 16);
+ int4 _767 = make_int4(0, 0, 0, 0);
+ _765.x = (_766.x<_767.x);
+ _765.y = (_766.y<_767.y);
+ _765.z = (_766.z<_767.z);
+ _765.w = (_766.w<_767.w);
+ ushort4 _768;
+ int4 _769 = make_int4(0, 0, 0, 0);
+ _768.x = (_732.x<=_769.x);
+ _768.y = (_732.y<=_769.y);
+ _768.z = (_732.z<=_769.z);
+ _768.w = (_732.w<=_769.w);
+ _764.x = (_765.x&&_768.x);
+ _764.y = (_765.y&&_768.y);
+ _764.z = (_765.z&&_768.z);
+ _764.w = (_765.w&&_768.w);
+ _757.x = (_758.x||_764.x);
+ _757.y = (_758.y||_764.y);
+ _757.z = (_758.z||_764.z);
+ _757.w = (_758.w||_764.w);
+ int4 _770;
+ int4 _771 = make_int4(16, 16, 16, 16);
+ _770.x = (_732.x+_771.x);
+ _770.y = (_732.y+_771.y);
+ _770.z = (_732.z+_771.z);
+ _770.w = (_732.w+_771.w);
+ _756.x = (bool(_757.x)?_732.x:_770.x);
+ _756.y = (bool(_757.y)?_732.y:_770.y);
+ _756.z = (bool(_757.z)?_732.z:_770.z);
+ _756.w = (bool(_757.w)?_732.w:_770.w);
+ int4 _772 = make_int4(9, 9, 9, 9);
+ _731.x = (_756.x*_772.x);
+ _731.y = (_756.y*_772.y);
+ _731.z = (_756.z*_772.z);
+ _731.w = (_756.w*_772.w);
+ _729.x = (_730.x+_731.x);
+ _729.y = (_730.y+_731.y);
+ _729.z = (_730.z+_731.z);
+ _729.w = (_730.w+_731.w);
+ int4 _773 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _728.x = (_729.x+_773.x);
+ _728.y = (_729.y+_773.y);
+ _728.z = (_729.z+_773.z);
+ _728.w = (_729.w+_773.w);
+ int4 _774;
+ int4 _775 = make_int4(((((int)threadIdx.x) + 352))+(1*0), ((((int)threadIdx.x) + 352))+(1*1), ((((int)threadIdx.x) + 352))+(1*2), ((((int)threadIdx.x) + 352))+(1*3));
+ int4 _776 = make_int4(3, 3, 3, 3);
+ _774.x = (_775.x%_776.x);
+ _774.y = (_775.y%_776.y);
+ _774.z = (_775.z%_776.z);
+ _774.w = (_775.w%_776.w);
+ int4 _777;
+ ushort4 _778;
+ ushort4 _779;
+ ushort4 _780;
+ int4 _781 = make_int4(3, 3, 3, 3);
+ int4 _782 = make_int4(0, 0, 0, 0);
+ _780.x = (_781.x>=_782.x);
+ _780.y = (_781.y>=_782.y);
+ _780.z = (_781.z>=_782.z);
+ _780.w = (_781.w>=_782.w);
+ ushort4 _783;
+ int4 _784 = make_int4(0, 0, 0, 0);
+ _783.x = (_774.x>=_784.x);
+ _783.y = (_774.y>=_784.y);
+ _783.z = (_774.z>=_784.z);
+ _783.w = (_774.w>=_784.w);
+ _779.x = (_780.x&&_783.x);
+ _779.y = (_780.y&&_783.y);
+ _779.z = (_780.z&&_783.z);
+ _779.w = (_780.w&&_783.w);
+ ushort4 _785;
+ ushort4 _786;
+ int4 _787 = make_int4(3, 3, 3, 3);
+ int4 _788 = make_int4(0, 0, 0, 0);
+ _786.x = (_787.x<_788.x);
+ _786.y = (_787.y<_788.y);
+ _786.z = (_787.z<_788.z);
+ _786.w = (_787.w<_788.w);
+ ushort4 _789;
+ int4 _790 = make_int4(0, 0, 0, 0);
+ _789.x = (_774.x<=_790.x);
+ _789.y = (_774.y<=_790.y);
+ _789.z = (_774.z<=_790.z);
+ _789.w = (_774.w<=_790.w);
+ _785.x = (_786.x&&_789.x);
+ _785.y = (_786.y&&_789.y);
+ _785.z = (_786.z&&_789.z);
+ _785.w = (_786.w&&_789.w);
+ _778.x = (_779.x||_785.x);
+ _778.y = (_779.y||_785.y);
+ _778.z = (_779.z||_785.z);
+ _778.w = (_779.w||_785.w);
+ int4 _791;
+ int4 _792 = make_int4(3, 3, 3, 3);
+ _791.x = (_774.x+_792.x);
+ _791.y = (_774.y+_792.y);
+ _791.z = (_774.z+_792.z);
+ _791.w = (_774.w+_792.w);
+ _777.x = (bool(_778.x)?_774.x:_791.x);
+ _777.y = (bool(_778.y)?_774.y:_791.y);
+ _777.z = (bool(_778.z)?_774.z:_791.z);
+ _777.w = (bool(_778.w)?_774.w:_791.w);
+ _727.x = (_728.x+_777.x);
+ _727.y = (_728.y+_777.y);
+ _727.z = (_728.z+_777.z);
+ _727.w = (_728.w+_777.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1408)) = make_float4(kernel[_727.x],kernel[_727.y],kernel[_727.z],kernel[_727.w]);
__syncthreads();
- for (int rc_outer_inner = 0; rc_outer_inner < 16; ++rc_outer_inner) {
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7))] * kernel_shared[(((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6))]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7))] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 384)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 1)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 385)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 2)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 386)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 3)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 387)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 4)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 388)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 5)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 389)]));
+ for (int rc_outer_inner = 0; rc_outer_inner < 8; ++rc_outer_inner) {
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(rc_outer_inner * 18)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 1)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 2)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 3)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 4)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 5)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 6)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 1)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 2)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 3)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 4)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 5)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 6)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 7)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 2)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 3)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 4)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 5)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 6)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 7)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 8)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 9)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 10)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 11)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 12)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 13)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 14)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 15)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 10)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 11)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 12)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 13)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 14)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 15)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 16)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 11)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 12)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 13)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 14)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 15)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 16)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 17)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
}
}
}
- compute[((((int)blockIdx.x) * 392) + ((int)threadIdx.x))] = max((conv2d_nchw[0] + bias[((((int)blockIdx.x) * 8) + (((int)threadIdx.x) / 49))]), 0.000000e+00f);
- compute[(((((int)blockIdx.x) * 392) + ((int)threadIdx.x)) + 196)] = max((conv2d_nchw[1] + bias[(((((int)blockIdx.x) * 8) + (((int)threadIdx.x) / 49)) + 4)]), 0.000000e+00f);
+ for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
+ compute[(((((((int)blockIdx.x) / 7) * 1568) + (((int)threadIdx.x) * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[i3_inner] + bias[(((((int)blockIdx.x) / 7) * 32) + ((int)threadIdx.x))]), 0.000000e+00f);
+ }
}
@@ -589,7 +3151,7 @@ In the example below we resume the status and do more 5 trials.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 2 minutes 35.530 seconds)
+ **Total running time of the script:** ( 2 minutes 34.321 seconds)
.. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
index 6e750f68b..f9381f273 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
@@ -646,7 +646,7 @@ so we can read the log file and load the best schedules.
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 9.6964 9.7275 9.7331 9.6288 0.0479
+ 9.8793 9.8894 9.9270 9.8216 0.0436
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
index f8ab78cf1..1df3ba618 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
@@ -665,7 +665,7 @@ so we can read the log file and load the best schedules.
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 752.6694 752.9576 753.1499 751.9008 0.5491
+ 757.9333 757.6261 759.1621 757.0116 0.9044
@@ -693,7 +693,7 @@ Other Tips
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 19.801 seconds)
+ **Total running time of the script:** ( 1 minutes 20.685 seconds)
.. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_x86.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
index f46917a5b..78a29bc42 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
@@ -396,12 +396,12 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [65536], []),
compute: Buffer(compute_2: Pointer(float32), float32, [65536], [])}
buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute}
- preflattened_buffer_map = {placeholder_5: placeholder_15: Buffer(placeholder_10, float32, [128, 256], []), placeholder_7: placeholder_16: Buffer(placeholder_12, int32, [4916], []), placeholder_8: placeholder_17: Buffer(placeholder_13, int32, [33], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_9: placeholder_18: Buffer(placeholder_14, float32, [128, 512], []), placeholder_6: placeholder_19: Buffer(placeholder_11, float32, [4916, 16, 1], [])} {
+ preflattened_buffer_map = {placeholder_7: placeholder_15: Buffer(placeholder_12, int32, [4916], []), placeholder_6: placeholder_16: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_8: placeholder_17: Buffer(placeholder_13, int32, [33], []), placeholder_5: placeholder_18: Buffer(placeholder_10, float32, [128, 256], []), placeholder_9: placeholder_19: Buffer(placeholder_14, float32, [128, 512], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], [])} {
for (i0.outer.i1.outer.fused: int32, 0, 32) "parallel" {
allocate(compute_4: Pointer(global float32), float32, [2048]), storage_scope = global {
- for (nb_j.inner: int32, 0, 2) {
- for (i.inner.init: int32, 0, 64) {
- let cse_var_1: int32 = ((i.inner.init*32) + (nb_j.inner*16))
+ for (i.outer.inner: int32, 0, 4) {
+ for (i.inner.init: int32, 0, 32) {
+ let cse_var_1: int32 = ((i.outer.inner*512) + (i.inner.init*16))
{
compute_5: Buffer(compute_4, float32, [2048], [])[cse_var_1] = 0f32
compute_5[(cse_var_1 + 1)] = 0f32
@@ -421,51 +421,78 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
compute_5[(cse_var_1 + 15)] = 0f32
}
}
- for (elem_idx: int32, 0, let cse_var_2: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner) in (placeholder_3[(cse_var_2 + 1)] - placeholder_3[cse_var_2])) {
- for (i.inner: int32, 0, 64) {
- let cse_var_21: int32 = (elem_idx*16)
- let cse_var_20: int32 = ((i.inner*32) + (nb_j.inner*16))
- let cse_var_19: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)
- let cse_var_18: int32 = ((floordiv(i0.outer.i1.outer.fused, 16)*16384) + (i.inner*256))
- let cse_var_17: int32 = (cse_var_20 + 9)
- let cse_var_16: int32 = (cse_var_20 + 8)
- let cse_var_15: int32 = (cse_var_20 + 7)
- let cse_var_14: int32 = (cse_var_20 + 6)
- let cse_var_13: int32 = (cse_var_20 + 5)
- let cse_var_12: int32 = (cse_var_20 + 4)
- let cse_var_11: int32 = (cse_var_20 + 3)
- let cse_var_10: int32 = (cse_var_20 + 2)
- let cse_var_9: int32 = (cse_var_20 + 15)
- let cse_var_8: int32 = (cse_var_20 + 14)
- let cse_var_7: int32 = (cse_var_20 + 13)
- let cse_var_6: int32 = (cse_var_20 + 12)
- let cse_var_5: int32 = (cse_var_20 + 11)
- let cse_var_4: int32 = (cse_var_20 + 10)
- let cse_var_3: int32 = (cse_var_20 + 1)
- {
- compute_5[cse_var_20] = (compute_5[cse_var_20] + (placeholder_1[((placeholder_3[cse_var_19]*16) + cse_var_21)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 1)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 2)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 3)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 4)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 5)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 6)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 7)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 8)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 9)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 10)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 11)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 12)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 13)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 14)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 15)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
+ for (elem_idx: int32, 0, (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])) {
+ for (i.inner: int32, 0, 32) {
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_2: int32 = ((i.outer.inner*512) + (i.inner*16))
+ compute_5[cse_var_2] = (compute_5[cse_var_2] + (placeholder_1[((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16))]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_3: int32 = (((i.outer.inner*512) + (i.inner*16)) + 1)
+ compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 1)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_4: int32 = (((i.outer.inner*512) + (i.inner*16)) + 2)
+ compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 2)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_5: int32 = (((i.outer.inner*512) + (i.inner*16)) + 3)
+ compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 3)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_6: int32 = (((i.outer.inner*512) + (i.inner*16)) + 4)
+ compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 4)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_7: int32 = (((i.outer.inner*512) + (i.inner*16)) + 5)
+ compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 5)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_8: int32 = (((i.outer.inner*512) + (i.inner*16)) + 6)
+ compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 6)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_9: int32 = (((i.outer.inner*512) + (i.inner*16)) + 7)
+ compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 7)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_10: int32 = (((i.outer.inner*512) + (i.inner*16)) + 8)
+ compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 8)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_11: int32 = (((i.outer.inner*512) + (i.inner*16)) + 9)
+ compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 9)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_12: int32 = (((i.outer.inner*512) + (i.inner*16)) + 10)
+ compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 10)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_13: int32 = (((i.outer.inner*512) + (i.inner*16)) + 11)
+ compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 11)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_14: int32 = (((i.outer.inner*512) + (i.inner*16)) + 12)
+ compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 12)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_15: int32 = (((i.outer.inner*512) + (i.inner*16)) + 13)
+ compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 13)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_16: int32 = (((i.outer.inner*512) + (i.inner*16)) + 14)
+ compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 14)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_17: int32 = (((i.outer.inner*512) + (i.inner*16)) + 15)
+ compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 15)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
}
}
}
}
- for (i0.inner: int32, 0, 64) {
- let cse_var_22: int32 = (((floordiv(i0.outer.i1.outer.fused, 16)*32768) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32))
- compute[ramp(cse_var_22, 1, 32)] = max((compute_5[ramp((i0.inner*32), 1, 32)] + placeholder_4[ramp(cse_var_22, 1, 32)]), broadcast(0f32, 32))
+ for (i0.inner: int32, 0, 128) {
+ let cse_var_18: int32 = ((i0.inner*512) + (i0.outer.i1.outer.fused*16))
+ compute[ramp(cse_var_18, 1, 16)] = max((compute_5[ramp((i0.inner*16), 1, 16)] + placeholder_4[ramp(cse_var_18, 1, 16)]), broadcast(0f32, 16))
}
}
}
@@ -521,7 +548,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 1.855 ms
+ Execution time of this operator: 1.730 ms
diff --git a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
index d6dc25f26..f50bf6718 100644
--- a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
Computation times
=================
-**00:43.277** total execution time for **how_to_tune_with_autotvm** files:
+**00:43.532** total execution time for **how_to_tune_with_autotvm** files:
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``) | 00:43.248 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``) | 00:43.499 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``) | 00:00.015 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``) | 00:00.019 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``) | 00:00.005 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
index 5bd98d84b..8d7b4fb1f 100644
--- a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
@@ -879,8 +879,8 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 4, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2885496
- No: 6 GFLOPS: 110.83/110.83 result: MeasureResult(costs=(0.002088788229166667,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.8140833377838135, timestamp=1655930458.0250723) [('tile_f', [-1, 1, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3754080
- No: 7 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+ No: 6 GFLOPS: 110.46/110.46 result: MeasureResult(costs=(0.00209571425,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.82222318649292, timestamp=1655930909.2240996) [('tile_f', [-1, 1, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3754080
+ No: 7 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1003,7 +1003,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 16, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 256, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6225319
- No: 8 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+ No: 8 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1126,7 +1126,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 1, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 64]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,943546
- No: 9 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+ No: 9 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1249,7 +1249,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 16, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2868708
- No: 10 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+ No: 10 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 142, in build
res = future.result()
File "/usr/lib/python3.7/concurrent/futures/_base.py", line 435, in result
@@ -1267,7 +1267,7 @@ for this template
TimeoutError
[('tile_f', [-1, 32, 2, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4691833
- No: 11 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+ No: 11 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1390,7 +1390,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 2, 64]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1042124
- No: 12 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+ No: 12 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1513,7 +1513,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 1, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10013405
- No: 13 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+ No: 13 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1636,7 +1636,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 8, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6732082
- No: 14 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+ No: 14 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1759,7 +1759,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 4, 32]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7536735
- No: 15 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+ No: 15 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1882,7 +1882,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 1, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 128, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,482121
- No: 16 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+ No: 16 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2005,7 +2005,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 1, 16]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 32, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2824525
- No: 17 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+ No: 17 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2128,7 +2128,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 64, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4559286
- No: 18 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+ No: 18 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2251,7 +2251,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 32, 16]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 512]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9677544
- No: 19 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+ No: 19 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 738, in __call__
yield remote, remote.load_module(os.path.split(build_result.filename)[1])
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 702, in run_through_rpc
@@ -2339,7 +2339,7 @@ for this template
15: _PyEval_EvalFrameDefault
14: 0x0000000000537c30
13: _PyObject_FastCallKeywords
- 12: 0x00007ffab91b6fa2
+ 12: 0x00007f97971cffa2
11: _ctypes_callproc
10: ffi_call
9: ffi_call_unix64
@@ -2404,7 +2404,7 @@ for this template
21: _PyFunction_FastCallKeywords
20: _PyEval_EvalFrameDefault
19: _PyFunction_FastCall [('tile_f', [-1, 8, 2, 16]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6390073
- No: 20 GFLOPS: 144.17/144.17 result: MeasureResult(costs=(0.00160570644,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4077048301696777, timestamp=1655930484.4690123) [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
+ No: 20 GFLOPS: 144.77/144.77 result: MeasureResult(costs=(0.00159906648,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4326999187469482, timestamp=1655930935.759078) [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
@@ -2461,7 +2461,7 @@ and measure running time.
Best config:
[('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
Finish loading 20 records
- Time cost of this operator: 0.001957
+ Time cost of this operator: 0.002022
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
index 277337fc1..db3b92130 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
@@ -328,10 +328,10 @@ Timing the untuned program
########## Build without Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs
--------- --- -------- ------- ----- ------ -------
- tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 318.6 98.76 (1, 2, 10, 10, 3) 2 1
- tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.076 0.953 (1, 6, 10, 10) 1 1
- tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.924 0.286 (1, 1, 10, 10, 3) 1 1
- Total_time - 322.6 - - - -
+ tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 315.3 98.749 (1, 2, 10, 10, 3) 2 1
+ tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.079 0.964 (1, 6, 10, 10) 1 1
+ tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.916 0.287 (1, 1, 10, 10, 3) 1 1
+ Total_time - 319.295 - - - -
@@ -397,10 +397,10 @@ Timing the tuned program
########## Build with Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs
--------- --- -------- ------- ----- ------ -------
- tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 190.6 98.591 (1, 1, 10, 10, 6) 2 1
- tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.902 0.984 (1, 6, 10, 10) 1 1
- tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.823 0.426 (1, 3, 10, 10, 1) 1 1
- Total_time - 193.325 - - - -
+ tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 135.1 98.066 (1, 6, 10, 10, 1) 2 1
+ tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.748 1.269 (1, 6, 10, 10) 1 1
+ tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.916 0.665 (1, 1, 10, 10, 3) 1 1
+ Total_time - 137.765 - - - -
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
index 3d4e94f14..af47ad316 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
@@ -225,7 +225,7 @@ take about **2 minutes** to download the Stanford Cars, while COCO 2017 validati
.. code-block:: none
- '/tmp/tmp5adly3xq/images/random'
+ '/tmp/tmptfwkoswh/images/random'
@@ -325,8 +325,8 @@ objects to other stuff? We can display some examples from our datasets using ``m
.. code-block:: none
- /tmp/tmp5adly3xq/images/target contains 8144 images
- /tmp/tmp5adly3xq/images/random contains 5000 images
+ /tmp/tmptfwkoswh/images/target contains 8144 images
+ /tmp/tmptfwkoswh/images/random contains 5000 images
@@ -501,13 +501,13 @@ the time on our validation set).
.. code-block:: none
Epoch 1/3
- 328/328 - 55s - loss: 0.2082 - accuracy: 0.9277 - val_loss: 0.1597 - val_accuracy: 0.9535
+ 328/328 - 55s - loss: 0.2163 - accuracy: 0.9270 - val_loss: 0.1386 - val_accuracy: 0.9528
Epoch 2/3
- 328/328 - 52s - loss: 0.0971 - accuracy: 0.9631 - val_loss: 0.1233 - val_accuracy: 0.9630
+ 328/328 - 52s - loss: 0.0909 - accuracy: 0.9670 - val_loss: 0.1189 - val_accuracy: 0.9581
Epoch 3/3
- 328/328 - 52s - loss: 0.0650 - accuracy: 0.9757 - val_loss: 0.1162 - val_accuracy: 0.9600
+ 328/328 - 52s - loss: 0.0696 - accuracy: 0.9729 - val_loss: 0.1105 - val_accuracy: 0.9641
- <keras.callbacks.History object at 0x7f7ed75d5e10>
+ <keras.callbacks.History object at 0x7f489d506f10>
@@ -864,7 +864,7 @@ Arduino tutorial for how to do that `on GitHub <https://github.com/guberti/tvm-a
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 8 minutes 7.957 seconds)
+ **Total running time of the script:** ( 10 minutes 12.692 seconds)
.. _sphx_glr_download_how_to_work_with_microtvm_micro_train.py:
diff --git a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
index e8d504d06..2d363b24e 100644
--- a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
Computation times
=================
-**08:52.919** total execution time for **how_to_work_with_microtvm** files:
+**10:58.917** total execution time for **how_to_work_with_microtvm** files:
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``) | 08:07.957 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``) | 10:12.692 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``) | 00:41.550 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``) | 00:42.778 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``) | 00:03.412 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``) | 00:03.447 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_microtvm_micro_ethosu.py` (``micro_ethosu.py``) | 00:00.000 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
index f6f05fc14..debf159b7 100644
--- a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
Computation times
=================
-**00:11.361** total execution time for **how_to_work_with_relay** files:
+**00:11.381** total execution time for **how_to_work_with_relay** files:
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``) | 00:09.853 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``) | 00:09.879 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``) | 00:01.502 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``) | 00:01.496 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_relay_using_relay_viz.py` (``using_relay_viz.py``) | 00:00.006 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
index 02586ca07..d14bdadc6 100644
--- a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
@@ -259,7 +259,7 @@ The following example customizes CUDA lowering rule for :code:`exp`.
.. code-block:: none
- <function my_cuda_math_rule at 0x7f7e3fb81440>
+ <function my_cuda_math_rule at 0x7f4810e0c950>
diff --git a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
index 98ae84c67..d73fb9320 100644
--- a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
@@ -5,22 +5,22 @@
Computation times
=================
-**00:04.002** total execution time for **how_to_work_with_schedules** files:
+**00:03.981** total execution time for **how_to_work_with_schedules** files:
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``) | 00:01.863 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``) | 00:01.854 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``) | 00:00.949 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``) | 00:00.930 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``) | 00:00.514 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``) | 00:00.522 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``) | 00:00.505 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``) | 00:00.503 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``) | 00:00.099 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``) | 00:00.033 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``) | 00:00.026 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``) | 00:00.027 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``) | 00:00.013 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``) | 00:00.012 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
index 44a9f0b57..733c8c49b 100644
--- a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
@@ -346,7 +346,7 @@ The importing needs to happen before the tensorized GEMV being executed.
C: Buffer(C_2: Pointer(float32), float32, [524288], [])}
buffer_map = {A_1: A, B_1: B, C_1: C}
preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [1024, 64], []), B_1: B_3: Buffer(B_2, float32, [512, 64], []), C_1: C_3: Buffer(C_2, float32, [1024, 512], [])} {
- attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpa_jm11ei/input0.cc'\nsource_filename = \"/tmp/tmpa_jm11ei/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = alloca float*, align 8\n %8 = alloca float*, align 8\n %9 = alloca floa [...]
+ attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpbs1xqx8t/input0.cc'\nsource_filename = \"/tmp/tmpbs1xqx8t/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = alloca float*, align 8\n %8 = alloca float*, align 8\n %9 = alloca floa [...]
for (i, 0, 1024) {
for (j.outer: int32, 0, 32) {
@tir.call_extern("gemv_update", @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_2, ((i*512) + (j.outer*16)), 16, 2, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_2, (i*64), 64, 1, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_2, (j.outer*1024), 1024, 1, dtype=handle), 16, 64, 64, dtype=int32)
diff --git a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
index d4b6d5080..18ff1a40c 100644
--- a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:20.522** total execution time for **topic_vta_tutorials_autotvm** files:
+**00:20.859** total execution time for **topic_vta_tutorials_autotvm** files:
+---------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:20.515 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:20.852 | 0.0 MB |
+---------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``) | 00:00.006 | 0.0 MB |
+---------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
index 05803d903..062b34a16 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
@@ -291,7 +291,7 @@ The compilation steps are:
DeprecationWarning,
/workspace/vta/tutorials/frontend/deploy_classification.py:213: DeprecationWarning: legacy graph executor behavior of producing json / lib / params will be removed in the next release. Please see documents of tvm.contrib.graph_executor.GraphModule for the new recommended usage.
relay_prog, target=tvm.target.Target(target, host=env.target_host), params=params
- resnet18_v1 inference graph built in 21.99s!
+ resnet18_v1 inference graph built in 22.46s!
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
index 15d700130..9cd5de20d 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
@@ -335,7 +335,7 @@ The compilation steps are:
"target_host parameter is going to be deprecated. "
/workspace/python/tvm/relay/build_module.py:389: DeprecationWarning: Please use input parameter mod (tvm.IRModule) instead of deprecated parameter mod (tvm.relay.function.Function)
DeprecationWarning,
- yolov3-tiny inference graph built in 15.41s!
+ yolov3-tiny inference graph built in 15.72s!
diff --git a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
index 216ab2850..be0509189 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**01:29.548** total execution time for **topic_vta_tutorials_frontend** files:
+**01:29.341** total execution time for **topic_vta_tutorials_frontend** files:
+------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``) | 00:47.517 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``) | 00:47.066 | 0.0 MB |
+------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:42.031 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:42.275 | 0.0 MB |
+------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
index fe4fe00cf..48b3815be 100644
--- a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:03.245** total execution time for **topic_vta_tutorials_optimize** files:
+**00:03.243** total execution time for **topic_vta_tutorials_optimize** files:
+--------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``) | 00:02.862 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.383 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.381 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
index 952e0ed5e..dac94a0f1 100644
--- a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:00.709** total execution time for **topic_vta_tutorials** files:
+**00:00.690** total execution time for **topic_vta_tutorials** files:
+---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.383 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.369 | 0.0 MB |
+---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.326 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.322 | 0.0 MB |
+---------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
index 8e775daa3..f2ab90a40 100644
--- a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
@@ -204,13 +204,6 @@ trials, we can load the best schedule from the log file and apply it.
-.. rst-class:: sphx-glr-script-out
-
- .. code-block:: none
-
- *E*E
-
-
@@ -334,7 +327,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 93.585 ms
+ Execution time of this operator: 93.717 ms
@@ -452,7 +445,7 @@ operations.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 32.785 seconds)
+ **Total running time of the script:** ( 1 minutes 13.161 seconds)
.. _sphx_glr_download_tutorial_auto_scheduler_matmul_x86.py:
diff --git a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
index 3d5fd78d1..30a8f8096 100644
--- a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
@@ -449,16 +449,16 @@ reduce variance, we take 5 measurements and average them.
waiting for device...
device available
Get devices for measurement successfully!
- No: 1 GFLOPS: 9.49/9.49 result: MeasureResult(costs=(0.028283824800000003,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5838947296142578, timestamp=1655929265.7724662) [('tile_y', [-1, 1]), ('tile_x', [-1, 256])],None,80
- No: 2 GFLOPS: 2.47/9.49 result: MeasureResult(costs=(0.10865412719999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.8847618103027344, timestamp=1655929267.6746712) [('tile_y', [-1, 4]), ('tile_x', [-1, 8])],None,32
- No: 3 GFLOPS: 11.79/11.79 result: MeasureResult(costs=(0.022765395,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5643162727355957, timestamp=1655929268.7010412) [('tile_y', [-1, 64]), ('tile_x', [-1, 32])],None,56
- No: 4 GFLOPS: 1.48/11.79 result: MeasureResult(costs=(0.18186187920000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.0187907218933105, timestamp=1655929272.2771149) [('tile_y', [-1, 1]), ('tile_x', [-1, 4])],None,20
- No: 5 GFLOPS: 3.63/11.79 result: MeasureResult(costs=(0.07400362120000001,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.3209781646728516, timestamp=1655929273.7276344) [('tile_y', [-1, 256]), ('tile_x', [-1, 16])],None,48
- No: 6 GFLOPS: 1.77/11.79 result: MeasureResult(costs=(0.151268727,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.6089162826538086, timestamp=1655929276.384843) [('tile_y', [-1, 512]), ('tile_x', [-1, 4])],None,29
- No: 7 GFLOPS: 0.87/11.79 result: MeasureResult(costs=(0.3093742644,), error_no=MeasureErrorNo.NO_ERROR, all_cost=5.074639320373535, timestamp=1655929281.990054) [('tile_y', [-1, 512]), ('tile_x', [-1, 2])],None,19
- No: 8 GFLOPS: 10.63/11.79 result: MeasureResult(costs=(0.0252409828,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5472111701965332, timestamp=1655929282.556275) [('tile_y', [-1, 4]), ('tile_x', [-1, 64])],None,62
- No: 9 GFLOPS: 1.90/11.79 result: MeasureResult(costs=(0.140921808,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.353353977203369, timestamp=1655929285.0292299) [('tile_y', [-1, 2]), ('tile_x', [-1, 2])],None,11
- No: 10 GFLOPS: 2.73/11.79 result: MeasureResult(costs=(0.098435465,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.6813583374023438, timestamp=1655929286.7710526) [('tile_y', [-1, 4]), ('tile_x', [-1, 4])],None,22
+ No: 1 GFLOPS: 9.76/9.76 result: MeasureResult(costs=(0.0275145092,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5797626972198486, timestamp=1655929709.3550327) [('tile_y', [-1, 1]), ('tile_x', [-1, 256])],None,80
+ No: 2 GFLOPS: 2.69/9.76 result: MeasureResult(costs=(0.0997637056,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.743319034576416, timestamp=1655929711.115279) [('tile_y', [-1, 4]), ('tile_x', [-1, 8])],None,32
+ No: 3 GFLOPS: 11.75/11.75 result: MeasureResult(costs=(0.0228372184,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5632002353668213, timestamp=1655929712.157808) [('tile_y', [-1, 64]), ('tile_x', [-1, 32])],None,56
+ No: 4 GFLOPS: 1.85/11.75 result: MeasureResult(costs=(0.1453076866,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.452601671218872, timestamp=1655929715.1464045) [('tile_y', [-1, 1]), ('tile_x', [-1, 4])],None,20
+ No: 5 GFLOPS: 3.60/11.75 result: MeasureResult(costs=(0.07447396660000001,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.3235578536987305, timestamp=1655929716.597595) [('tile_y', [-1, 256]), ('tile_x', [-1, 16])],None,48
+ No: 6 GFLOPS: 1.79/11.75 result: MeasureResult(costs=(0.14989059440000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.566101312637329, timestamp=1655929719.2095697) [('tile_y', [-1, 512]), ('tile_x', [-1, 4])],None,29
+ No: 7 GFLOPS: 0.86/11.75 result: MeasureResult(costs=(0.31043540859999996,), error_no=MeasureErrorNo.NO_ERROR, all_cost=5.087138652801514, timestamp=1655929724.8370092) [('tile_y', [-1, 512]), ('tile_x', [-1, 2])],None,19
+ No: 8 GFLOPS: 10.57/11.75 result: MeasureResult(costs=(0.0253910224,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5524759292602539, timestamp=1655929725.4101107) [('tile_y', [-1, 4]), ('tile_x', [-1, 64])],None,62
+ No: 9 GFLOPS: 1.84/11.75 result: MeasureResult(costs=(0.145611129,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.4285926818847656, timestamp=1655929727.9581838) [('tile_y', [-1, 2]), ('tile_x', [-1, 2])],None,11
+ No: 10 GFLOPS: 2.76/11.75 result: MeasureResult(costs=(0.097386445,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.669588565826416, timestamp=1655929729.6872263) [('tile_y', [-1, 4]), ('tile_x', [-1, 4])],None,22
diff --git a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
index 5dbc8538a..97d5e1222 100644
--- a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
@@ -314,7 +314,7 @@ standard deviation.
.. code-block:: none
- {'mean': 494.4699098900002, 'median': 494.50663965000103, 'std': 0.9450088039594712}
+ {'mean': 496.8772450499864, 'median': 496.6672595999853, 'std': 1.4294658444955355}
@@ -550,31 +550,31 @@ the tuning data to.
/workspace/python/tvm/driver/build_module.py:264: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
-
[Task 1/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 1/25] Current/Best: 17.53/ 17.53 GFLOPS | Progress: (4/20) | 6.17 s
[Task 1/25] Current/Best: 6.16/ 17.53 GFLOPS | Progress: (8/20) | 9.11 s
[Task 1/25] Current/Best: 11.49/ 22.81 GFLOPS | Progress: (12/20) | 11.59 s
[Task 1/25] Current/Best: 16.83/ 22.81 GFLOPS | Progress: (16/20) | 13.27 s
[Task 1/25] Current/Best: 11.54/ 23.79 GFLOPS | Progress: (20/20) | 15.00 s Done.
-
[Task 2/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 2/25] Current/Best: 12.26/ 12.98 GFLOPS | Progress: (4/20) | 3.83 s
[Task 2/25] Current/Best: 13.02/ 18.40 GFLOPS | Progress: (8/20) | 5.16 s
[Task 2/25] Current/Best: 20.95/ 20.95 GFLOPS | Progress: (12/20) | 6.46 s
[Task 2/25] Current/Best: 12.27/ 20.95 GFLOPS | Progress: (16/20) | 7.76 s
[Task 2/25] Current/Best: 19.37/ 20.95 GFLOPS | Progress: (20/20) | 9.35 s Done.
-
[Task 3/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 3/25] Current/Best: 1.63/ 10.57 GFLOPS | Progress: (4/20) | 5.80 s
[Task 3/25] Current/Best: 15.61/ 16.89 GFLOPS | Progress: (8/20) | 7.70 s
[Task 3/25] Current/Best: 14.93/ 16.89 GFLOPS | Progress: (12/20) | 9.43 s
[Task 3/25] Current/Best: 7.20/ 23.78 GFLOPS | Progress: (16/20) | 11.37 s
[Task 3/25] Current/Best: 12.32/ 23.78 GFLOPS | Progress: (20/20) | 15.96 s Done.
-
[Task 4/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 4/25] Current/Best: 9.55/ 20.52 GFLOPS | Progress: (4/20) | 2.33 s
[Task 4/25] Current/Best: 6.75/ 20.52 GFLOPS | Progress: (8/20) | 7.03 s
[Task 4/25] Current/Best: 22.36/ 22.36 GFLOPS | Progress: (12/20) | 12.00 s
[Task 4/25] Current/Best: 16.24/ 22.36 GFLOPS | Progress: (16/20) | 14.38 s
[Task 4/25] Current/Best: 13.35/ 22.36 GFLOPS | Progress: (20/20) | 16.34 s Done.
-
[Task 5/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 5/25] Current/Best: 9.61/ 10.29 GFLOPS | Progress: (4/20) | 2.54 s
[Task 5/25] Current/Best: 11.65/ 12.31 GFLOPS | Progress: (8/20) | 4.63 s
[Task 5/25] Current/Best: 11.81/ 18.02 GFLOPS | Progress: (12/20) | 7.79 s
[Task 5/25] Current/Best: 11.54/ 22.76 GFLOPS | Progress: (16/20) | 9.26 s
[Task 5/25] Current/Best: 12.02/ 22.76 GFLOPS | Progress: (20/20) | 11.15 s Done.
-
[Task 6/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 6/25] Current/Best: 12.14/ 20.74 GFLOPS | Progress: (4/20) | 4.05 s
[Task 6/25] Current/Best: 18.93/ 20.74 GFLOPS | Progress: (8/20) | 5.82 s
[Task 6/25] Current/Best: 13.35/ 20.74 GFLOPS | Progress: (12/20) | 7.76 s
[Task 6/25] Current/Best: 19.98/ 20.74 GFLOPS | Progress: (16/20) | 10.04 s
[Task 6/25] Current/Best: 3.71/ 20.74 GFLOPS | Progress: (20/20) | 12.57 s Done.
-
[Task 7/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 7/25] Current/Best: 11.12/ 12.99 GFLOPS | Progress: (4/20) | 3.48 s
[Task 7/25] Current/Best: 20.22/ 21.08 GFLOPS | Progress: (8/20) | 4.99 s
[Task 7/25] Current/Best: 16.19/ 21.08 GFLOPS | Progress: (12/20) | 6.87 s
[Task 7/25] Current/Best: 12.23/ 21.08 GFLOPS | Progress: (16/20) | 8.92 s
[Task 7/25] Current/Best: 6.38/ 21.75 GFLOPS | Progress: (20/20) | 11.40 s Done.
-
[Task 8/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 8/25] Current/Best: 9.89/ 14.11 GFLOPS | Progress: (4/20) | 2.84 s
[Task 8/25] Current/Best: 9.97/ 14.11 GFLOPS | Progress: (8/20) | 7.83 s
[Task 8/25] Current/Best: 12.59/ 14.11 GFLOPS | Progress: (12/20) | 14.31 s
[Task 8/25] Current/Best: 18.78/ 18.78 GFLOPS | Progress: (16/20) | 16.43 s
[Task 8/25] Current/Best: 20.16/ 20.16 GFLOPS | Progress: (20/20) | 23.47 s Done.
-
[Task 9/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 9/25] Current/Best: 14.34/ 15.73 GFLOPS | Progress: (4/20) | 11.87 s
[Task 9/25] Current/Best: 23.44/ 23.44 GFLOPS | Progress: (8/20) | 13.59 s
[Task 9/25] Current/Best: 8.28/ 23.44 GFLOPS | Progress: (12/20) | 16.07 s
[Task 9/25] Current/Best: 17.85/ 23.44 GFLOPS | Progress: (16/20) | 18.93 s
[Task 9/25] Current/Best: 9.02/ 23.44 GFLOPS | Progress: (20/20) | 27.44 s
[Task 10/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 10/25] Current/Best: 18.31/ 18.31 GFLOPS | Progress: (4/20) | 2.53 s
[Task 10/25] Current/Best: 15.62/ 18.31 GFLOPS | Progress: (8/20) | 4.15 s
[Task 10/25] Current/Best: 12.42/ 18.93 GFLOPS | Progress: (12/20) | 5.68 s
[Task 10/25] Current/Best: 19.08/ 20.41 GFLOPS | Progress: (16/20) | 6.79 s
[Task 10/25] Current/Best: 8.86/ 20.41 GFLOPS | Progress: (20/20
) | 8.31 s Done.
-
[Task 11/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 11/25] Current/Best: 12.33/ 18.13 GFLOPS | Progress: (4/20) | 3.28 s
[Task 11/25] Current/Best: 16.81/ 18.13 GFLOPS | Progress: (8/20) | 6.10 s
[Task 11/25] Current/Best: 18.28/ 18.28 GFLOPS | Progress: (12/20) | 8.16 s
[Task 11/25] Current/Best: 13.54/ 21.20 GFLOPS | Progress: (16/20) | 11.10 s
[Task 11/25] Current/Best: 19.53/ 21.55 GFLOPS | Progress: (20/20) | 13.20 s Done.
-
[Task 12/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 12/25] Current/Best: 7.82/ 17.96 GFLOPS | Progress: (4/20) | 5.61 s
[Task 12/25] Current/Best: 5.20/ 17.96 GFLOPS | Progress: (8/20) | 9.51 s
[Task 12/25] Current/Best: 18.96/ 18.96 GFLOPS | Progress: (12/20) | 11.51 s
[Task 12/25] Current/Best: 15.47/ 18.96 GFLOPS | Progress: (16/20) | 14.40 s
[Task 12/25] Current/Best: 15.06/ 18.96 GFLOPS | Progress: (20/20) | 16.31 s Done.
-
[Task 13/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 13/25] Current/Best: 8.27/ 17.38 GFLOPS | Progress: (4/20) | 3.71 s
[Task 13/25] Current/Best: 16.08/ 21.03 GFLOPS | Progress: (8/20) | 6.27 s
[Task 13/25] Current/Best: 19.63/ 21.80 GFLOPS | Progress: (12/20) | 9.32 s
[Task 13/25] Current/Best: 12.31/ 21.80 GFLOPS | Progress: (16/20) | 12.80 s
[Task 13/25] Current/Best: 18.52/ 21.80 GFLOPS | Progress: (20/20) | 15.12 s Done.
-
[Task 14/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 14/25] Current/Best: 13.58/ 13.58 GFLOPS | Progress: (4/20) | 3.36 s
[Task 14/25] Current/Best: 6.11/ 13.58 GFLOPS | Progress: (8/20) | 5.59 s
[Task 14/25] Current/Best: 20.28/ 20.28 GFLOPS | Progress: (12/20) | 8.23 s
[Task 14/25] Current/Best: 16.09/ 20.28 GFLOPS | Progress: (16/20) | 9.90 s Done.
-
[Task 14/25] Current/Best: 16.91/ 20.28 GFLOPS | Progress: (20/20) | 11.61 s
[Task 15/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 15/25] Current/Best: 16.17/ 17.64 GFLOPS | Progress: (4/20) | 2.61 s
[Task 15/25] Current/Best: 14.33/ 17.90 GFLOPS | Progress: (8/20) | 3.94 s
[Task 15/25] Current/Best: 10.39/ 22.38 GFLOPS | Progress: (12/20) | 6.16 s
[Task 15/25] Current/Best: 20.35/ 22.38 GFLOPS | Progress: (16/20) | 9.21 s
[Task 15/25] Current/Best: 9.66/ 22.38 GFLOPS | Progress: (20/20) | 10.23 s
[Task 16/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 16/25] Current/Best: 20.34/ 20.34 GFLOPS | Progress: (4/20) | 2.84 s
[Task 16/25] Current/Best: 3.04/ 20.34 GFLOPS | Progress: (8/20) | 4.44 s
[Task 16/25] Current/Best: 19.42/ 20.34 GFLOPS | Progress: (12/20) | 5.64 s
[Task 16/25] Current/Best: 17.71/ 20.34 GFLOPS | Progress: (16/20) |
7.01 s
[Task 16/25] Current/Best: 10.00/ 22.27 GFLOPS | Progress: (20/20) | 9.15 s Done.
-
[Task 17/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 17/25] Current/Best: 12.96/ 18.83 GFLOPS | Progress: (4/20) | 4.75 s
[Task 17/25] Current/Best: 14.47/ 23.41 GFLOPS | Progress: (8/20) | 7.60 s
[Task 17/25] Current/Best: 16.99/ 23.41 GFLOPS | Progress: (12/20) | 9.64 s
[Task 17/25] Current/Best: 16.49/ 23.41 GFLOPS | Progress: (16/20) | 11.88 s
[Task 17/25] Current/Best: 9.92/ 23.41 GFLOPS | Progress: (20/20) | 14.02 s Done.
-
[Task 18/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 18/25] Current/Best: 11.24/ 17.82 GFLOPS | Progress: (4/20) | 3.72 s
[Task 18/25] Current/Best: 10.48/ 19.75 GFLOPS | Progress: (8/20) | 7.37 s
[Task 18/25] Current/Best: 19.35/ 19.75 GFLOPS | Progress: (12/20) | 9.30 s
[Task 18/25] Current/Best: 10.09/ 19.75 GFLOPS | Progress: (16/20) | 13.10 s
[Task 18/25] Current/Best: 20.38/ 20.38 GFLOPS | Progress: (20/20) | 14.62 s Done.
-
[Task 19/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 19/25] Current/Best: 7.14/ 20.44 GFLOPS | Progress: (4/20) | 6.08 s
[Task 19/25] Current/Best: 2.61/ 20.44 GFLOPS | Progress: (8/20) | 9.43 s
[Task 19/25] Current/Best: 20.41/ 21.88 GFLOPS | Progress: (12/20) | 12.40 s
[Task 19/25] Current/Best: 14.26/ 22.24 GFLOPS | Progress: (16/20) | 15.43 s
[Task 19/25] Current/Best: 2.70/ 23.19 GFLOPS | Progress: (20/20) | 18.24 s Done.
-
[Task 20/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 20/25] Current/Best: 9.04/ 15.20 GFLOPS | Progress: (4/20) | 3.29 s Done.
+
[Task 1/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 1/25] Current/Best: 17.49/ 17.49 GFLOPS | Progress: (4/20) | 6.10 s
[Task 1/25] Current/Best: 6.16/ 17.49 GFLOPS | Progress: (8/20) | 9.04 s
[Task 1/25] Current/Best: 11.53/ 22.73 GFLOPS | Progress: (12/20) | 11.53 s
[Task 1/25] Current/Best: 16.82/ 22.82 GFLOPS | Progress: (16/20) | 13.21 s
[Task 1/25] Current/Best: 11.58/ 23.73 GFLOPS | Progress: (20/20) | 14.94 s Done.
+
[Task 2/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 2/25] Current/Best: 12.16/ 12.80 GFLOPS | Progress: (4/20) | 3.78 s
[Task 2/25] Current/Best: 14.18/ 17.42 GFLOPS | Progress: (8/20) | 5.11 s
[Task 2/25] Current/Best: 21.42/ 21.42 GFLOPS | Progress: (12/20) | 6.42 s
[Task 2/25] Current/Best: 12.44/ 21.42 GFLOPS | Progress: (16/20) | 7.67 s
[Task 2/25] Current/Best: 20.07/ 21.42 GFLOPS | Progress: (20/20) | 9.29 s Done.
+
[Task 3/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 3/25] Current/Best: 1.63/ 10.55 GFLOPS | Progress: (4/20) | 5.82 s
[Task 3/25] Current/Best: 15.58/ 16.86 GFLOPS | Progress: (8/20) | 7.76 s
[Task 3/25] Current/Best: 14.87/ 16.86 GFLOPS | Progress: (12/20) | 9.50 s
[Task 3/25] Current/Best: 7.20/ 23.80 GFLOPS | Progress: (16/20) | 11.41 s
[Task 3/25] Current/Best: 12.13/ 23.80 GFLOPS | Progress: (20/20) | 15.98 s Done.
+
[Task 4/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 4/25] Current/Best: 9.54/ 20.11 GFLOPS | Progress: (4/20) | 2.34 s
[Task 4/25] Current/Best: 6.61/ 20.11 GFLOPS | Progress: (8/20) | 7.05 s
[Task 4/25] Current/Best: 21.58/ 21.58 GFLOPS | Progress: (12/20) | 11.94 s
[Task 4/25] Current/Best: 17.29/ 21.58 GFLOPS | Progress: (16/20) | 14.32 s
[Task 4/25] Current/Best: 13.28/ 21.58 GFLOPS | Progress: (20/20) | 16.38 s Done.
+
[Task 5/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 5/25] Current/Best: 9.62/ 10.22 GFLOPS | Progress: (4/20) | 2.55 s
[Task 5/25] Current/Best: 11.65/ 12.71 GFLOPS | Progress: (8/20) | 4.61 s
[Task 5/25] Current/Best: 11.16/ 18.08 GFLOPS | Progress: (12/20) | 7.66 s
[Task 5/25] Current/Best: 11.63/ 22.78 GFLOPS | Progress: (16/20) | 9.10 s
[Task 5/25] Current/Best: 11.95/ 22.78 GFLOPS | Progress: (20/20) | 11.03 s Done.
+
[Task 6/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 6/25] Current/Best: 12.21/ 20.77 GFLOPS | Progress: (4/20) | 4.07 s
[Task 6/25] Current/Best: 18.77/ 20.77 GFLOPS | Progress: (8/20) | 5.85 s
[Task 6/25] Current/Best: 13.21/ 20.77 GFLOPS | Progress: (12/20) | 7.80 s
[Task 6/25] Current/Best: 19.93/ 20.77 GFLOPS | Progress: (16/20) | 10.06 s
[Task 6/25] Current/Best: 3.70/ 20.77 GFLOPS | Progress: (20/20) | 12.57 s Done.
+
[Task 7/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 7/25] Current/Best: 10.92/ 12.28 GFLOPS | Progress: (4/20) | 3.59 s
[Task 7/25] Current/Best: 20.22/ 20.85 GFLOPS | Progress: (8/20) | 5.10 s
[Task 7/25] Current/Best: 15.73/ 20.89 GFLOPS | Progress: (12/20) | 7.01 s
[Task 7/25] Current/Best: 12.23/ 20.89 GFLOPS | Progress: (16/20) | 9.06 s
[Task 7/25] Current/Best: 6.39/ 21.75 GFLOPS | Progress: (20/20) | 11.51 s Done.
+
[Task 8/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 8/25] Current/Best: 9.95/ 14.15 GFLOPS | Progress: (4/20) | 2.86 s
[Task 8/25] Current/Best: 10.30/ 14.15 GFLOPS | Progress: (8/20) | 7.96 s
[Task 8/25] Current/Best: 12.64/ 14.15 GFLOPS | Progress: (12/20) | 14.54 s
[Task 8/25] Current/Best: 18.88/ 18.88 GFLOPS | Progress: (16/20) | 16.64 s
[Task 8/25] Current/Best: 19.95/ 19.95 GFLOPS | Progress: (20/20) | 23.68 s Done.
+
[Task 9/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 9/25] Current/Best: 14.13/ 14.38 GFLOPS | Progress: (4/20) | 11.90 s
[Task 9/25] Current/Best: 23.38/ 23.38 GFLOPS | Progress: (8/20) | 13.73 s
[Task 9/25] Current/Best: 8.23/ 23.38 GFLOPS | Progress: (12/20) | 16.29 s
[Task 9/25] Current/Best: 17.99/ 23.38 GFLOPS | Progress: (16/20) | 19.13 s
[Task 9/25] Current/Best: 9.01/ 23.38 GFLOPS | Progress: (20/20) | 27.66 s
[Task 10/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 10/25] Current/Best: 18.39/ 18.39 GFLOPS | Progress: (4/20) | 2.47 s
[Task 10/25] Current/Best: 15.68/ 18.39 GFLOPS | Progress: (8/20) | 4.13 s
[Task 10/25] Current/Best: 12.56/ 19.13 GFLOPS | Progress: (12/20) | 5.68 s
[Task 10/25] Current/Best: 19.00/ 20.20 GFLOPS | Progress: (16/20) | 6.78 s
[Task 10/25] Current/Best: 8.92/ 20.20 GFLOPS | Progress: (20/20
) | 8.34 s Done.
+
[Task 11/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 11/25] Current/Best: 12.17/ 18.05 GFLOPS | Progress: (4/20) | 3.31 s
[Task 11/25] Current/Best: 16.81/ 18.05 GFLOPS | Progress: (8/20) | 6.10 s
[Task 11/25] Current/Best: 18.16/ 18.16 GFLOPS | Progress: (12/20) | 8.18 s
[Task 11/25] Current/Best: 13.40/ 21.09 GFLOPS | Progress: (16/20) | 11.16 s
[Task 11/25] Current/Best: 19.42/ 21.59 GFLOPS | Progress: (20/20) | 13.23 s Done.
+
[Task 12/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 12/25] Current/Best: 7.80/ 18.14 GFLOPS | Progress: (4/20) | 5.65 s
[Task 12/25] Current/Best: 5.23/ 18.14 GFLOPS | Progress: (8/20) | 9.58 s
[Task 12/25] Current/Best: 18.88/ 18.88 GFLOPS | Progress: (12/20) | 11.57 s
[Task 12/25] Current/Best: 15.26/ 18.88 GFLOPS | Progress: (16/20) | 14.52 s
[Task 12/25] Current/Best: 15.13/ 18.99 GFLOPS | Progress: (20/20) | 16.44 s Done.
+
[Task 13/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 13/25] Current/Best: 8.66/ 17.31 GFLOPS | Progress: (4/20) | 3.71 s
[Task 13/25] Current/Best: 16.04/ 20.93 GFLOPS | Progress: (8/20) | 6.32 s
[Task 13/25] Current/Best: 19.45/ 21.48 GFLOPS | Progress: (12/20) | 9.42 s
[Task 13/25] Current/Best: 12.24/ 21.48 GFLOPS | Progress: (16/20) | 12.87 s
[Task 13/25] Current/Best: 18.62/ 21.48 GFLOPS | Progress: (20/20) | 15.18 s Done.
+
[Task 14/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 14/25] Current/Best: 13.56/ 13.56 GFLOPS | Progress: (4/20) | 3.28 s
[Task 14/25] Current/Best: 6.10/ 13.56 GFLOPS | Progress: (8/20) | 5.51 s
[Task 14/25] Current/Best: 20.48/ 20.48 GFLOPS | Progress: (12/20) | 8.20 s
[Task 14/25] Current/Best: 16.65/ 20.48 GFLOPS | Progress: (16/20) | 9.84 s Done.
+
[Task 14/25] Current/Best: 17.12/ 20.48 GFLOPS | Progress: (20/20) | 11.64 s
[Task 15/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 15/25] Current/Best: 16.16/ 17.66 GFLOPS | Progress: (4/20) | 2.64 s
[Task 15/25] Current/Best: 14.28/ 18.02 GFLOPS | Progress: (8/20) | 4.00 s
[Task 15/25] Current/Best: 10.37/ 22.21 GFLOPS | Progress: (12/20) | 6.23 s
[Task 15/25] Current/Best: 20.38/ 22.21 GFLOPS | Progress: (16/20) | 9.57 s
[Task 15/25] Current/Best: 9.70/ 22.21 GFLOPS | Progress: (20/20) | 10.59 s
[Task 16/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 16/25] Current/Best: 19.70/ 19.70 GFLOPS | Progress: (4/20) | 2.88 s
[Task 16/25] Current/Best: 3.04/ 19.70 GFLOPS | Progress: (8/20) | 4.51 s
[Task 16/25] Current/Best: 19.64/ 19.70 GFLOPS | Progress: (12/20) | 5.74 s
[Task 16/25] Current/Best: 17.98/ 19.70 GFLOPS | Progress: (16/20) |
7.10 s
[Task 16/25] Current/Best: 10.01/ 19.70 GFLOPS | Progress: (20/20) | 9.27 s Done.
+
[Task 17/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 17/25] Current/Best: 13.39/ 18.84 GFLOPS | Progress: (4/20) | 4.77 s
[Task 17/25] Current/Best: 14.43/ 23.23 GFLOPS | Progress: (8/20) | 7.65 s
[Task 17/25] Current/Best: 17.08/ 23.23 GFLOPS | Progress: (12/20) | 9.71 s
[Task 17/25] Current/Best: 16.53/ 23.23 GFLOPS | Progress: (16/20) | 11.91 s
[Task 17/25] Current/Best: 10.03/ 23.23 GFLOPS | Progress: (20/20) | 14.08 s Done.
+
[Task 18/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 18/25] Current/Best: 11.33/ 17.84 GFLOPS | Progress: (4/20) | 3.77 s
[Task 18/25] Current/Best: 10.55/ 19.85 GFLOPS | Progress: (8/20) | 7.48 s
[Task 18/25] Current/Best: 19.19/ 19.85 GFLOPS | Progress: (12/20) | 9.40 s
[Task 18/25] Current/Best: 10.05/ 19.85 GFLOPS | Progress: (16/20) | 13.23 s
[Task 18/25] Current/Best: 20.90/ 20.90 GFLOPS | Progress: (20/20) | 14.74 s Done.
+
[Task 19/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 19/25] Current/Best: 7.06/ 20.32 GFLOPS | Progress: (4/20) | 6.08 s
[Task 19/25] Current/Best: 2.60/ 20.32 GFLOPS | Progress: (8/20) | 9.45 s
[Task 19/25] Current/Best: 19.76/ 21.53 GFLOPS | Progress: (12/20) | 12.40 s
[Task 19/25] Current/Best: 15.38/ 21.64 GFLOPS | Progress: (16/20) | 15.38 s
[Task 19/25] Current/Best: 2.70/ 23.46 GFLOPS | Progress: (20/20) | 18.19 s Done.
+
[Task 20/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 20/25] Current/Best: 9.24/ 15.19 GFLOPS | Progress: (4/20) | 3.32 s Done.
Done.
-
[Task 20/25] Current/Best: 9.75/ 15.20 GFLOPS | Progress: (8/20) | 6.86 s
[Task 20/25] Current/Best: 2.32/ 16.72 GFLOPS | Progress: (12/20) | 10.80 s
[Task 20/25] Current/Best: 12.28/ 16.72 GFLOPS | Progress: (16/20) | 14.59 s
[Task 20/25] Current/Best: 13.02/ 22.23 GFLOPS | Progress: (20/20) | 16.70 s
[Task 21/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 21/25] Current/Best: 6.42/ 17.59 GFLOPS | Progress: (4/20) | 3.21 s
[Task 21/25] Current/Best: 14.64/ 17.59 GFLOPS | Progress: (8/20) | 4.85 s
[Task 21/25] Current/Best: 1.61/ 17.59 GFLOPS | Progress: (12/20) | 6.94 s
[Task 21/25] Current/Best: 18.15/ 18.15 GFLOPS | Progress: (16/20) | 10.47 s
[Task 21/25] Current/Best: 4.46/ 18.15 GFLOPS | Progress: (20/20) | 17.74 s
[Task 22/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 22/25] Current/Best: 2.70/ 17.06 GFLOPS | Progress: (4/20
) | 2.63 s
[Task 22/25] Current/Best: 8.75/ 21.78 GFLOPS | Progress: (8/20) | 4.70 s
[Task 22/25] Current/Best: 20.02/ 21.78 GFLOPS | Progress: (12/20) | 7.06 s
[Task 22/25] Current/Best: 15.38/ 21.78 GFLOPS | Progress: (16/20) | 9.18 s
[Task 22/25] Current/Best: 14.29/ 21.78 GFLOPS | Progress: (20/20) | 10.92 s Done.
-
[Task 23/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 23/25] Current/Best: 17.17/ 20.65 GFLOPS | Progress: (4/20) | 3.22 s
[Task 23/25] Current/Best: 13.80/ 20.65 GFLOPS | Progress: (8/20) | 6.72 s
[Task 23/25] Current/Best: 20.95/ 21.57 GFLOPS | Progress: (12/20) | 8.62 s
[Task 23/25] Current/Best: 6.42/ 21.57 GFLOPS | Progress: (16/20) | 15.77 s
[Task 23/25] Current/Best: 7.84/ 21.57 GFLOPS | Progress: (20/20) | 20.02 s Done.
-
[Task 24/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 24/25] Current/Best: 8.38/ 8.38 GFLOPS | Progress: (4/20) | 11.75 s
[Task 24/25] Current/Best: 2.13/ 8.38 GFLOPS | Progress: (8/20) | 22.71 s
[Task 24/25] Current/Best: 4.40/ 8.38 GFLOPS | Progress: (12/20) | 34.20 s Done.
+
[Task 20/25] Current/Best: 10.05/ 15.19 GFLOPS | Progress: (8/20) | 6.73 s
[Task 20/25] Current/Best: 2.32/ 16.76 GFLOPS | Progress: (12/20) | 10.69 s
[Task 20/25] Current/Best: 12.52/ 16.76 GFLOPS | Progress: (16/20) | 14.65 s
[Task 20/25] Current/Best: 13.42/ 22.02 GFLOPS | Progress: (20/20) | 16.78 s
[Task 21/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 21/25] Current/Best: 6.40/ 17.61 GFLOPS | Progress: (4/20) | 3.24 s
[Task 21/25] Current/Best: 14.60/ 17.61 GFLOPS | Progress: (8/20) | 4.84 s
[Task 21/25] Current/Best: 1.61/ 17.61 GFLOPS | Progress: (12/20) | 6.99 s
[Task 21/25] Current/Best: 18.00/ 18.00 GFLOPS | Progress: (16/20) | 10.51 s
[Task 21/25] Current/Best: 4.47/ 18.00 GFLOPS | Progress: (20/20) | 17.92 s
[Task 22/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 22/25] Current/Best: 2.70/ 17.02 GFLOPS | Progress: (4/20
) | 2.64 s
[Task 22/25] Current/Best: 8.91/ 21.90 GFLOPS | Progress: (8/20) | 4.70 s
[Task 22/25] Current/Best: 19.94/ 21.90 GFLOPS | Progress: (12/20) | 7.08 s
[Task 22/25] Current/Best: 15.34/ 21.90 GFLOPS | Progress: (16/20) | 9.25 s
[Task 22/25] Current/Best: 14.68/ 21.90 GFLOPS | Progress: (20/20) | 10.99 s Done.
+
[Task 23/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 23/25] Current/Best: 17.45/ 20.55 GFLOPS | Progress: (4/20) | 3.18 s
[Task 23/25] Current/Best: 15.65/ 20.55 GFLOPS | Progress: (8/20) | 6.56 s
[Task 23/25] Current/Best: 20.84/ 21.61 GFLOPS | Progress: (12/20) | 8.43 s
[Task 23/25] Current/Best: 6.33/ 21.61 GFLOPS | Progress: (16/20) | 15.49 s
[Task 23/25] Current/Best: 7.83/ 21.61 GFLOPS | Progress: (20/20) | 19.75 s Done.
+
[Task 24/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 24/25] Current/Best: 8.21/ 8.21 GFLOPS | Progress: (4/20) | 11.76 s
[Task 24/25] Current/Best: 3.60/ 8.21 GFLOPS | Progress: (8/20) | 22.97 s
[Task 24/25] Current/Best: 4.07/ 8.21 GFLOPS | Progress: (12/20) | 33.71 s Done.
Done.
-
[Task 24/25] Current/Best: 6.65/ 8.60 GFLOPS | Progress: (16/20) | 40.04 s
[Task 24/25] Current/Best: 3.39/ 8.60 GFLOPS | Progress: (20/20) | 46.07 s Done.
-
[Task 25/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 25/25] Current/Best: 1.55/ 2.72 GFLOPS | Progress: (4/20) | 11.53 s
[Task 25/25] Current/Best: 6.03/ 7.79 GFLOPS | Progress: (8/20) | 22.73 s
[Task 25/25] Current/Best: 5.94/ 7.79 GFLOPS | Progress: (12/20) | 33.99 s
[Task 25/25] Current/Best: 5.83/ 8.53 GFLOPS | Progress: (16/20) | 35.82 s
[Task 25/25] Current/Best: 2.87/ 8.70 GFLOPS | Progress: (20/20) | 46.49 s
+
[Task 24/25] Current/Best: 6.77/ 8.80 GFLOPS | Progress: (16/20) | 39.47 s
[Task 24/25] Current/Best: 3.31/ 9.11 GFLOPS | Progress: (20/20) | 45.59 s Done.
+
[Task 25/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 25/25] Current/Best: 1.55/ 2.78 GFLOPS | Progress: (4/20) | 11.55 s
[Task 25/25] Current/Best: 5.65/ 7.91 GFLOPS | Progress: (8/20) | 22.76 s
[Task 25/25] Current/Best: 6.04/ 7.91 GFLOPS | Progress: (12/20) | 34.03 s
[Task 25/25] Current/Best: 5.83/ 9.44 GFLOPS | Progress: (16/20) | 35.75 s
[Task 25/25] Current/Best: 2.94/ 9.44 GFLOPS | Progress: (20/20) | 46.44 s
@@ -735,8 +735,8 @@ improvement in comparing the optimized model to the unoptimized model.
.. code-block:: none
- optimized: {'mean': 411.37254193999524, 'median': 411.352962549995, 'std': 0.3513869974955923}
- unoptimized: {'mean': 494.4699098900002, 'median': 494.50663965000103, 'std': 0.9450088039594712}
+ optimized: {'mean': 414.72895694999806, 'median': 414.71009494998725, 'std': 0.2884264773767498}
+ unoptimized: {'mean': 496.8772450499864, 'median': 496.6672595999853, 'std': 1.4294658444955355}
@@ -759,7 +759,7 @@ profiling/benchmarking.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 10 minutes 19.842 seconds)
+ **Total running time of the script:** ( 10 minutes 23.018 seconds)
.. _sphx_glr_download_tutorial_autotvm_relay_x86.py:
diff --git a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
index 66e687fb9..baba208db 100644
--- a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
+++ b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
@@ -269,7 +269,7 @@ device and returns the measured cost. Network overhead is excluded.
.. code-block:: none
- 1.298e-07 secs/op
+ 1.292e-07 secs/op
diff --git a/docs/_sources/tutorial/intro_topi.rst.txt b/docs/_sources/tutorial/intro_topi.rst.txt
index ccbad5116..7f05afcca 100644
--- a/docs/_sources/tutorial/intro_topi.rst.txt
+++ b/docs/_sources/tutorial/intro_topi.rst.txt
@@ -262,7 +262,7 @@ As you can see, scheduled stages of computation have been accumulated and we can
.. code-block:: none
- [stage(a, placeholder(a, 0xce7d210)), stage(b, placeholder(b, 0xbc13d50)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min= [...]
+ [stage(a, placeholder(a, 0x219218b0)), stage(b, placeholder(b, 0xe250130)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min [...]
diff --git a/docs/_sources/tutorial/sg_execution_times.rst.txt b/docs/_sources/tutorial/sg_execution_times.rst.txt
index ec1499958..092ac15e5 100644
--- a/docs/_sources/tutorial/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorial/sg_execution_times.rst.txt
@@ -5,29 +5,29 @@
Computation times
=================
-**13:47.370** total execution time for **tutorial** files:
+**13:30.797** total execution time for **tutorial** files:
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``) | 10:19.842 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``) | 10:23.018 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 01:32.785 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 01:13.161 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``) | 01:00.519 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``) | 01:01.382 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``) | 00:27.674 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``) | 00:28.210 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``) | 00:24.343 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``) | 00:23.705 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``) | 00:01.404 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``) | 00:00.666 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``) | 00:00.658 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``) | 00:00.512 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.144 | 0.0 MB |
-+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``) | 00:00.000 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.143 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorial_introduction.py` (``introduction.py``) | 00:00.000 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_tutorial_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``) | 00:00.000 | 0.0 MB |
++------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorial_tvmc_python.py` (``tvmc_python.py``) | 00:00.000 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorial_install.py` (``install.py``) | 00:00.000 | 0.0 MB |
diff --git a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
index eb22ecf27..b4a8c5f98 100644
--- a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
+++ b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
@@ -288,8 +288,8 @@ helper function to run a profile of the TVM generated code.
.. code-block:: none
- Numpy running time: 0.000009
- naive: 0.000006
+ Numpy running time: 0.000008
+ naive: 0.000007
@@ -499,10 +499,10 @@ We can now compare the different schedules
.. code-block:: none
Operator Timing Performance
- numpy 8.620539999810716e-06 1.0
- naive 5.8315e-06 0.6764657434601595
- parallel 6.9705e-06 0.8085920371755195
- vector 2.46388e-05 2.858150417554005
+ numpy 8.178030002454761e-06 1.0
+ naive 6.837e-06 0.8360204105325817
+ parallel 6.962200000000001e-06 0.8513297209609392
+ vector 2.4714099999999998e-05 3.0220114125995723
@@ -923,7 +923,7 @@ matrix multiplication.
.. code-block:: none
- Numpy running time: 0.018293
+ Numpy running time: 0.018520
@@ -983,7 +983,7 @@ optimizations.
/workspace/python/tvm/driver/build_module.py:264: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
- none: 3.389984
+ none: 3.452831
@@ -1088,7 +1088,7 @@ schedule.
/workspace/python/tvm/driver/build_module.py:264: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
- blocking: 0.301645
+ blocking: 0.292013
@@ -1186,7 +1186,7 @@ already cache friendly from our previous optimizations.
/workspace/python/tvm/driver/build_module.py:264: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
- vectorization: 0.334908
+ vectorization: 0.336959
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1262,7 +1262,7 @@ more cache friendly.
/workspace/python/tvm/driver/build_module.py:264: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
- loop permutation: 0.115885
+ loop permutation: 0.121260
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1363,7 +1363,7 @@ optimized schedule.
/workspace/python/tvm/driver/build_module.py:264: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
- array packing: 0.108837
+ array packing: 0.111406
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1458,7 +1458,7 @@ to `C` when all the block results are ready.
/workspace/python/tvm/driver/build_module.py:264: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
- block caching: 0.111126
+ block caching: 0.111786
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1546,7 +1546,7 @@ of thread-level parallelization.
/workspace/python/tvm/driver/build_module.py:264: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
- parallelization: 0.144419
+ parallelization: 0.145750
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1627,13 +1627,13 @@ working, we can compare the results.
.. code-block:: none
Operator Timing Performance
- none 3.3899836249 1.0
- blocking 0.3016445303 0.08898111721967332
- vectorization 0.3349076508 0.09879329455754504
- loop permutation 0.11588464980000002 0.03418442760277889
- array packing 0.1088372546 0.032105539920774855
- block caching 0.11112594270000001 0.03278067241498197
- parallelization 0.1444187422 0.04260160466240015
+ none 3.4528312396000005 1.0
+ blocking 0.2920132543 0.08457211894718285
+ vectorization 0.33695932559999997 0.09758928317592366
+ loop permutation 0.12125966699999999 0.035118909261851884
+ array packing 0.1114056159 0.03226500462064459
+ block caching 0.1117857104 0.032375086600800684
+ parallelization 0.14575047530000002 0.042211873441253024
@@ -1675,7 +1675,7 @@ the computation for specific platforms.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 0.519 seconds)
+ **Total running time of the script:** ( 1 minutes 1.382 seconds)
.. _sphx_glr_download_tutorial_tensor_expr_get_started.py:
diff --git a/docs/commit_hash b/docs/commit_hash
index 2622cd117..1d81f4bcb 100644
--- a/docs/commit_hash
+++ b/docs/commit_hash
@@ -1 +1 @@
-caa0d59c335713d29b1e63714395fc2ba3d979dc
+c334790bf88694db8d748d2299f50f2b04c46486
diff --git a/docs/how_to/compile_models/from_mxnet.html b/docs/how_to/compile_models/from_mxnet.html
index b8ac33426..45d0bdb23 100644
--- a/docs/how_to/compile_models/from_mxnet.html
+++ b/docs/how_to/compile_models/from_mxnet.html
@@ -422,7 +422,7 @@ to download the full example code</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"x"</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#tuple" title="builtins.tuple" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">x</span><span class="o">.</span><span class="n">shape</span></a><span class="p">)</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip812ab175-3adc-45fc-a776-1bd65330f280 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip2cde5cc7-4677-4c09-92ea-a4a047eb10a2 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
x (1, 3, 224, 224)
</pre></div>
</div>
diff --git a/docs/how_to/compile_models/from_oneflow.html b/docs/how_to/compile_models/from_oneflow.html
index 6991f26a1..adb3a72c9 100644
--- a/docs/how_to/compile_models/from_oneflow.html
+++ b/docs/how_to/compile_models/from_oneflow.html
@@ -427,105 +427,143 @@ python3 -m pip install -f https://release.oneflow.info <span class="nv">oneflow<
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
0%| | 0.00/41.5M [00:00<?, ?B/s]
- 0%| | 16.0k/41.5M [00:00<07:57, 91.1kB/s]
- 0%| | 40.0k/41.5M [00:00<06:09, 117kB/s]
- 0%| | 72.0k/41.5M [00:00<04:56, 147kB/s]
- 0%| | 96.0k/41.5M [00:00<05:05, 142kB/s]
- 0%| | 128k/41.5M [00:00<04:37, 156kB/s]
- 0%| | 160k/41.5M [00:01<04:23, 165kB/s]
- 0%| | 192k/41.5M [00:01<04:14, 170kB/s]
- 1%| | 232k/41.5M [00:01<03:50, 188kB/s]
- 1%| | 264k/41.5M [00:01<03:52, 186kB/s]
- 1%| | 304k/41.5M [00:01<03:37, 198kB/s]
- 1%| | 352k/41.5M [00:01<03:15, 221kB/s]
- 1%| | 392k/41.5M [00:02<03:13, 222kB/s]
- 1%|1 | 440k/41.5M [00:02<03:01, 237kB/s]
- 1%|1 | 488k/41.5M [00:02<02:53, 248kB/s]
- 1%|1 | 536k/41.5M [00:02<02:48, 255kB/s]
- 1%|1 | 584k/41.5M [00:02<02:45, 260kB/s]
- 2%|1 | 640k/41.5M [00:03<02:34, 277kB/s]
- 2%|1 | 696k/41.5M [00:03<02:28, 289kB/s]
- 2%|1 | 760k/41.5M [00:03<02:17, 311kB/s]
- 2%|1 | 824k/41.5M [00:03<02:10, 326kB/s]
- 2%|2 | 888k/41.5M [00:03<02:06, 337kB/s]
- 2%|2 | 960k/41.5M [00:03<01:58, 358kB/s]
- 2%|2 | 1.01M/41.5M [00:04<01:53, 373kB/s]
- 3%|2 | 1.09M/41.5M [00:04<01:46, 397kB/s]
- 3%|2 | 1.16M/41.5M [00:04<01:42, 413kB/s]
- 3%|2 | 1.24M/41.5M [00:04<01:39, 425kB/s]
- 3%|3 | 1.33M/41.5M [00:04<01:34, 447kB/s]
- 3%|3 | 1.42M/41.5M [00:05<01:28, 476kB/s]
- 4%|3 | 1.52M/41.5M [00:05<01:24, 496kB/s]
- 4%|3 | 1.62M/41.5M [00:05<01:19, 524kB/s]
- 4%|4 | 1.73M/41.5M [00:05<01:14, 557kB/s]
- 4%|4 | 1.84M/41.5M [00:05<01:11, 580kB/s]
- 5%|4 | 1.95M/41.5M [00:05<01:08, 610kB/s]
- 5%|5 | 2.09M/41.5M [00:06<01:02, 658kB/s]
- 5%|5 | 2.23M/41.5M [00:06<00:58, 705kB/s]
- 6%|5 | 2.38M/41.5M [00:06<00:54, 751kB/s]
- 6%|6 | 2.54M/41.5M [00:06<00:50, 811kB/s]
- 7%|6 | 2.71M/41.5M [00:06<00:46, 866kB/s]
- 7%|6 | 2.90M/41.5M [00:07<00:43, 932kB/s]
- 7%|7 | 3.09M/41.5M [00:07<00:40, 992kB/s]
- 8%|7 | 3.31M/41.5M [00:07<00:37, 1.07MB/s]
- 9%|8 | 3.54M/41.5M [00:07<00:34, 1.15MB/s]
- 9%|9 | 3.78M/41.5M [00:07<00:32, 1.22MB/s]
- 10%|9 | 4.04M/41.5M [00:07<00:30, 1.30MB/s]
- 10%|# | 4.31M/41.5M [00:08<00:28, 1.39MB/s]
- 11%|#1 | 4.61M/41.5M [00:08<00:25, 1.49MB/s]
- 12%|#1 | 4.91M/41.5M [00:08<00:22, 1.74MB/s]
- 13%|#2 | 5.24M/41.5M [00:08<00:19, 1.93MB/s]
- 13%|#3 | 5.44M/41.5M [00:08<00:19, 1.95MB/s]
- 14%|#3 | 5.63M/41.5M [00:08<00:22, 1.66MB/s]
- 14%|#4 | 5.98M/41.5M [00:08<00:18, 1.99MB/s]
- 15%|#5 | 6.37M/41.5M [00:09<00:16, 2.27MB/s]
- 16%|#5 | 6.59M/41.5M [00:09<00:16, 2.29MB/s]
- 16%|#6 | 6.82M/41.5M [00:09<00:18, 1.93MB/s]
- 17%|#7 | 7.23M/41.5M [00:09<00:15, 2.36MB/s]
- 19%|#8 | 7.69M/41.5M [00:09<00:12, 2.91MB/s]
- 19%|#9 | 7.99M/41.5M [00:09<00:12, 2.71MB/s]
- 20%|#9 | 8.27M/41.5M [00:09<00:15, 2.32MB/s]
- 21%|##1 | 8.72M/41.5M [00:10<00:12, 2.72MB/s]
- 22%|##2 | 9.26M/41.5M [00:10<00:09, 3.40MB/s]
- 23%|##3 | 9.62M/41.5M [00:10<00:10, 3.16MB/s]
- 24%|##3 | 9.95M/41.5M [00:10<00:12, 2.71MB/s]
- 25%|##5 | 10.5M/41.5M [00:10<00:11, 2.92MB/s]
- 27%|##6 | 11.1M/41.5M [00:10<00:08, 3.61MB/s]
- 28%|##8 | 11.8M/41.5M [00:10<00:07, 4.27MB/s]
- 29%|##9 | 12.2M/41.5M [00:11<00:08, 3.82MB/s]
- 30%|### | 12.6M/41.5M [00:11<00:09, 3.28MB/s]
- 32%|###1 | 13.2M/41.5M [00:11<00:08, 3.58MB/s]
- 34%|###3 | 14.0M/41.5M [00:11<00:06, 4.40MB/s]
- 36%|###5 | 14.8M/41.5M [00:11<00:05, 5.20MB/s]
- 37%|###7 | 15.4M/41.5M [00:11<00:05, 4.65MB/s]
- 38%|###8 | 15.8M/41.5M [00:11<00:06, 4.01MB/s]
- 40%|###9 | 16.6M/41.5M [00:12<00:05, 4.86MB/s]
- 42%|####2 | 17.5M/41.5M [00:12<00:04, 5.71MB/s]
- 44%|####3 | 18.1M/41.5M [00:12<00:04, 5.20MB/s]
- 45%|####4 | 18.6M/41.5M [00:12<00:05, 4.49MB/s]
- 47%|####7 | 19.5M/41.5M [00:12<00:04, 5.68MB/s]
- 50%|####9 | 20.6M/41.5M [00:12<00:03, 6.64MB/s]
- 51%|#####1 | 21.2M/41.5M [00:12<00:03, 6.02MB/s]
- 53%|#####2 | 21.9M/41.5M [00:13<00:03, 5.19MB/s]
- 55%|#####5 | 23.0M/41.5M [00:13<00:02, 6.58MB/s]
- 58%|#####8 | 24.1M/41.5M [00:13<00:02, 7.67MB/s]
- 60%|###### | 24.9M/41.5M [00:13<00:02, 6.96MB/s]
- 62%|######1 | 25.6M/41.5M [00:13<00:02, 6.02MB/s]
- 65%|######4 | 26.9M/41.5M [00:13<00:02, 7.55MB/s]
- 68%|######8 | 28.2M/41.5M [00:13<00:01, 8.86MB/s]
- 70%|####### | 29.2M/41.5M [00:13<00:01, 8.01MB/s]
- 72%|#######2 | 30.0M/41.5M [00:14<00:01, 6.94MB/s]
- 75%|#######5 | 31.3M/41.5M [00:14<00:01, 8.35MB/s]
- 79%|#######8 | 32.6M/41.5M [00:14<00:00, 9.59MB/s]
- 81%|########1 | 33.6M/41.5M [00:14<00:00, 8.55MB/s]
- 83%|########3 | 34.5M/41.5M [00:14<00:00, 7.40MB/s]
- 86%|########6 | 35.7M/41.5M [00:14<00:00, 8.45MB/s]
- 89%|########9 | 37.1M/41.5M [00:14<00:00, 9.58MB/s]
- 92%|#########1| 38.0M/41.5M [00:15<00:00, 8.60MB/s]
- 94%|#########3| 38.9M/41.5M [00:15<00:00, 7.43MB/s]
- 97%|#########6| 40.1M/41.5M [00:15<00:00, 8.47MB/s]
-100%|#########9| 41.5M/41.5M [00:15<00:00, 9.60MB/s]
-100%|##########| 41.5M/41.5M [00:15<00:00, 2.82MB/s]
+ 0%| | 16.0k/41.5M [00:00<08:17, 87.5kB/s]
+ 0%| | 32.0k/41.5M [00:00<08:18, 87.2kB/s]
+ 0%| | 48.0k/41.5M [00:00<08:18, 87.1kB/s]
+ 0%| | 64.0k/41.5M [00:00<08:18, 87.1kB/s]
+ 0%| | 80.0k/41.5M [00:00<08:19, 87.0kB/s]
+ 0%| | 96.0k/41.5M [00:01<08:18, 87.0kB/s]
+ 0%| | 112k/41.5M [00:01<08:18, 87.0kB/s]
+ 0%| | 128k/41.5M [00:01<08:18, 87.0kB/s]
+ 0%| | 144k/41.5M [00:01<08:18, 87.0kB/s]
+ 0%| | 168k/41.5M [00:01<07:11, 100kB/s]
+ 0%| | 184k/41.5M [00:02<07:29, 96.3kB/s]
+ 0%| | 208k/41.5M [00:02<06:45, 107kB/s]
+ 1%| | 232k/41.5M [00:02<06:19, 114kB/s]
+ 1%| | 256k/41.5M [00:02<06:03, 119kB/s]
+ 1%| | 280k/41.5M [00:02<05:53, 122kB/s]
+ 1%| | 304k/41.5M [00:03<05:45, 125kB/s]
+ 1%| | 336k/41.5M [00:03<05:09, 140kB/s]
+ 1%| | 368k/41.5M [00:03<04:47, 150kB/s]
+ 1%| | 400k/41.5M [00:03<04:34, 157kB/s]
+ 1%|1 | 440k/41.5M [00:03<04:05, 175kB/s]
+ 1%|1 | 480k/41.5M [00:03<03:48, 188kB/s]
+ 1%|1 | 528k/41.5M [00:04<03:24, 210kB/s]
+ 1%|1 | 584k/41.5M [00:04<03:00, 238kB/s]
+ 2%|1 | 640k/41.5M [00:04<02:45, 258kB/s]
+ 2%|1 | 696k/41.5M [00:04<02:37, 272kB/s]
+ 2%|1 | 768k/41.5M [00:04<02:18, 308kB/s]
+ 2%|1 | 848k/41.5M [00:05<02:03, 346kB/s]
+ 2%|2 | 928k/41.5M [00:05<01:54, 373kB/s]
+ 2%|2 | 1.00M/41.5M [00:05<01:41, 418kB/s]
+ 3%|2 | 1.10M/41.5M [00:05<01:31, 461kB/s]
+ 3%|2 | 1.15M/41.5M [00:06<02:48, 250kB/s]
+ 3%|3 | 1.43M/41.5M [00:06<01:19, 529kB/s]
+ 4%|3 | 1.52M/41.5M [00:06<01:21, 516kB/s]
+ 4%|3 | 1.60M/41.5M [00:06<01:22, 507kB/s]
+ 4%|4 | 1.69M/41.5M [00:06<01:23, 500kB/s]
+ 4%|4 | 1.79M/41.5M [00:07<01:20, 517kB/s]
+ 5%|4 | 1.88M/41.5M [00:07<01:19, 519kB/s]
+ 5%|4 | 1.98M/41.5M [00:07<01:17, 533kB/s]
+ 5%|5 | 2.09M/41.5M [00:07<01:16, 542kB/s]
+ 5%|5 | 2.20M/41.5M [00:07<01:13, 562kB/s]
+ 6%|5 | 2.30M/41.5M [00:08<01:13, 563kB/s]
+ 6%|5 | 2.41M/41.5M [00:08<01:11, 577kB/s]
+ 6%|6 | 2.52M/41.5M [00:08<01:09, 586kB/s]
+ 6%|6 | 2.63M/41.5M [00:08<01:07, 606kB/s]
+ 7%|6 | 2.74M/41.5M [00:08<01:06, 607kB/s]
+ 7%|6 | 2.85M/41.5M [00:09<01:06, 608kB/s]
+ 7%|7 | 2.97M/41.5M [00:09<01:05, 621kB/s]
+ 7%|7 | 3.09M/41.5M [00:09<01:03, 631kB/s]
+ 8%|7 | 3.20M/41.5M [00:09<01:04, 624kB/s]
+ 8%|7 | 3.31M/41.5M [00:09<01:03, 633kB/s]
+ 8%|8 | 3.43M/41.5M [00:09<01:02, 639kB/s]
+ 9%|8 | 3.54M/41.5M [00:10<01:03, 630kB/s]
+ 9%|8 | 3.66M/41.5M [00:10<01:02, 637kB/s]
+ 9%|9 | 3.77M/41.5M [00:10<01:02, 628kB/s]
+ 9%|9 | 3.88M/41.5M [00:10<01:02, 636kB/s]
+ 10%|9 | 3.99M/41.5M [00:10<01:02, 628kB/s]
+ 10%|9 | 4.11M/41.5M [00:11<01:01, 635kB/s]
+ 10%|# | 4.23M/41.5M [00:11<01:01, 641kB/s]
+ 10%|# | 4.34M/41.5M [00:11<01:01, 631kB/s]
+ 11%|# | 4.45M/41.5M [00:11<01:00, 638kB/s]
+ 11%|#1 | 4.57M/41.5M [00:11<01:00, 642kB/s]
+ 11%|#1 | 4.69M/41.5M [00:12<00:59, 645kB/s]
+ 12%|#1 | 4.80M/41.5M [00:12<00:59, 648kB/s]
+ 12%|#1 | 4.92M/41.5M [00:12<00:59, 649kB/s]
+ 12%|#2 | 5.04M/41.5M [00:12<00:58, 650kB/s]
+ 12%|#2 | 5.16M/41.5M [00:12<00:58, 651kB/s]
+ 13%|#2 | 5.28M/41.5M [00:12<00:57, 665kB/s]
+ 13%|#3 | 5.41M/41.5M [00:13<00:56, 674kB/s]
+ 13%|#3 | 5.53M/41.5M [00:13<00:55, 681kB/s]
+ 14%|#3 | 5.66M/41.5M [00:13<00:53, 698kB/s]
+ 14%|#3 | 5.80M/41.5M [00:13<00:52, 711kB/s]
+ 14%|#4 | 5.94M/41.5M [00:13<00:50, 732kB/s]
+ 15%|#4 | 6.08M/41.5M [00:14<00:49, 748kB/s]
+ 15%|#4 | 6.22M/41.5M [00:14<00:48, 758kB/s]
+ 15%|#5 | 6.37M/41.5M [00:14<00:47, 779kB/s]
+ 16%|#5 | 6.52M/41.5M [00:14<00:45, 806kB/s]
+ 16%|#6 | 6.69M/41.5M [00:14<00:43, 839kB/s]
+ 17%|#6 | 6.85M/41.5M [00:15<00:42, 861kB/s]
+ 17%|#6 | 7.03M/41.5M [00:15<00:40, 903kB/s]
+ 17%|#7 | 7.21M/41.5M [00:15<00:38, 932kB/s]
+ 18%|#7 | 7.41M/41.5M [00:15<00:36, 979kB/s]
+ 18%|#8 | 7.60M/41.5M [00:15<00:35, 1.01MB/s]
+ 19%|#8 | 7.81M/41.5M [00:16<00:33, 1.06MB/s]
+ 19%|#9 | 8.02M/41.5M [00:16<00:32, 1.10MB/s]
+ 20%|#9 | 8.26M/41.5M [00:16<00:30, 1.16MB/s]
+ 20%|## | 8.49M/41.5M [00:16<00:28, 1.20MB/s]
+ 21%|##1 | 8.74M/41.5M [00:16<00:27, 1.26MB/s]
+ 22%|##1 | 9.01M/41.5M [00:16<00:25, 1.33MB/s]
+ 22%|##2 | 9.28M/41.5M [00:17<00:24, 1.38MB/s]
+ 23%|##3 | 9.57M/41.5M [00:17<00:23, 1.45MB/s]
+ 24%|##3 | 9.87M/41.5M [00:17<00:21, 1.51MB/s]
+ 25%|##4 | 10.2M/41.5M [00:17<00:20, 1.58MB/s]
+ 25%|##5 | 10.5M/41.5M [00:17<00:19, 1.67MB/s]
+ 26%|##6 | 10.9M/41.5M [00:18<00:18, 1.77MB/s]
+ 27%|##7 | 11.2M/41.5M [00:18<00:17, 1.85MB/s]
+ 28%|##8 | 11.6M/41.5M [00:18<00:16, 1.95MB/s]
+ 29%|##9 | 12.0M/41.5M [00:18<00:15, 2.06MB/s]
+ 30%|### | 12.5M/41.5M [00:18<00:14, 2.17MB/s]
+ 31%|###1 | 12.9M/41.5M [00:19<00:13, 2.28MB/s]
+ 32%|###2 | 13.4M/41.5M [00:19<00:12, 2.39MB/s]
+ 34%|###3 | 13.9M/41.5M [00:19<00:11, 2.51MB/s]
+ 35%|###4 | 14.4M/41.5M [00:19<00:10, 2.62MB/s]
+ 36%|###6 | 15.0M/41.5M [00:19<00:10, 2.76MB/s]
+ 38%|###7 | 15.6M/41.5M [00:19<00:09, 2.90MB/s]
+ 39%|###8 | 16.2M/41.5M [00:20<00:08, 3.06MB/s]
+ 41%|#### | 16.8M/41.5M [00:20<00:08, 3.21MB/s]
+ 42%|####2 | 17.5M/41.5M [00:20<00:07, 3.37MB/s]
+ 44%|####3 | 18.2M/41.5M [00:20<00:06, 3.53MB/s]
+ 46%|####5 | 18.9M/41.5M [00:20<00:06, 3.70MB/s]
+ 47%|####7 | 19.7M/41.5M [00:21<00:05, 3.87MB/s]
+ 49%|####9 | 20.5M/41.5M [00:21<00:04, 4.42MB/s]
+ 51%|#####1 | 21.2M/41.5M [00:21<00:04, 5.06MB/s]
+ 52%|#####2 | 21.7M/41.5M [00:21<00:04, 4.66MB/s]
+ 54%|#####3 | 22.2M/41.5M [00:21<00:05, 3.98MB/s]
+ 56%|#####5 | 23.0M/41.5M [00:21<00:04, 4.70MB/s]
+ 58%|#####7 | 23.9M/41.5M [00:21<00:03, 5.57MB/s]
+ 59%|#####8 | 24.5M/41.5M [00:22<00:03, 5.12MB/s]
+ 60%|###### | 25.0M/41.5M [00:22<00:03, 4.91MB/s]
+ 62%|######2 | 25.9M/41.5M [00:22<00:02, 6.01MB/s]
+ 64%|######3 | 26.5M/41.5M [00:22<00:02, 5.46MB/s]
+ 65%|######5 | 27.1M/41.5M [00:22<00:02, 5.30MB/s]
+ 68%|######7 | 28.1M/41.5M [00:22<00:02, 6.58MB/s]
+ 69%|######9 | 28.8M/41.5M [00:22<00:02, 5.94MB/s]
+ 71%|####### | 29.4M/41.5M [00:22<00:02, 5.76MB/s]
+ 74%|#######3 | 30.5M/41.5M [00:23<00:01, 7.16MB/s]
+ 75%|#######5 | 31.3M/41.5M [00:23<00:01, 6.45MB/s]
+ 77%|#######7 | 32.0M/41.5M [00:23<00:01, 6.23MB/s]
+ 80%|#######9 | 33.1M/41.5M [00:23<00:01, 7.76MB/s]
+ 82%|########1 | 33.9M/41.5M [00:23<00:01, 6.96MB/s]
+ 84%|########3 | 34.7M/41.5M [00:23<00:01, 6.73MB/s]
+ 87%|########6 | 35.9M/41.5M [00:23<00:00, 8.30MB/s]
+ 89%|########8 | 36.8M/41.5M [00:23<00:00, 7.44MB/s]
+ 90%|######### | 37.5M/41.5M [00:24<00:00, 7.20MB/s]
+ 94%|#########3| 38.9M/41.5M [00:24<00:00, 8.84MB/s]
+ 96%|#########5| 39.8M/41.5M [00:24<00:00, 7.91MB/s]
+ 98%|#########7| 40.6M/41.5M [00:24<00:00, 6.69MB/s]
+100%|##########| 41.5M/41.5M [00:24<00:00, 1.78MB/s]
</pre></div>
</div>
</div>
diff --git a/docs/how_to/compile_models/from_paddle.html b/docs/how_to/compile_models/from_paddle.html
index 359517caa..3fbc8a5a5 100644
--- a/docs/how_to/compile_models/from_paddle.html
+++ b/docs/how_to/compile_models/from_paddle.html
@@ -488,7 +488,7 @@ A quick solution is</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>TVM prediction top-1 id: 282, class name: 282: 'tiger cat',
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 7.006 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 7.747 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-paddle-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/16269b77359771348d507395692524cf/from_paddle.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_paddle.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/from_pytorch.html b/docs/how_to/compile_models/from_pytorch.html
index 8c131dbdf..12766057a 100644
--- a/docs/how_to/compile_models/from_pytorch.html
+++ b/docs/how_to/compile_models/from_pytorch.html
@@ -409,16 +409,20 @@ be unstable.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
0%| | 0.00/44.7M [00:00<?, ?B/s]
- 6%|5 | 2.66M/44.7M [00:00<00:01, 27.8MB/s]
- 12%|#2 | 5.52M/44.7M [00:00<00:01, 28.9MB/s]
- 23%|##2 | 10.3M/44.7M [00:00<00:00, 38.3MB/s]
- 35%|###4 | 15.5M/44.7M [00:00<00:00, 44.5MB/s]
- 45%|####5 | 20.2M/44.7M [00:00<00:00, 46.4MB/s]
- 58%|#####7 | 25.8M/44.7M [00:00<00:00, 49.5MB/s]
- 70%|######9 | 31.1M/44.7M [00:00<00:00, 51.7MB/s]
- 82%|########1 | 36.5M/44.7M [00:00<00:00, 53.2MB/s]
- 94%|#########3| 41.9M/44.7M [00:00<00:00, 54.1MB/s]
-100%|##########| 44.7M/44.7M [00:00<00:00, 49.1MB/s]
+ 7%|7 | 3.23M/44.7M [00:00<00:01, 33.9MB/s]
+ 14%|#4 | 6.47M/44.7M [00:00<00:01, 33.3MB/s]
+ 22%|##1 | 9.75M/44.7M [00:00<00:01, 33.7MB/s]
+ 29%|##9 | 13.0M/44.7M [00:00<00:01, 31.9MB/s]
+ 38%|###8 | 17.1M/44.7M [00:00<00:00, 35.3MB/s]
+ 46%|####5 | 20.5M/44.7M [00:00<00:00, 34.8MB/s]
+ 53%|#####3 | 23.8M/44.7M [00:00<00:01, 21.6MB/s]
+ 59%|#####9 | 26.5M/44.7M [00:01<00:00, 21.8MB/s]
+ 67%|######6 | 29.8M/44.7M [00:01<00:00, 24.9MB/s]
+ 75%|#######4 | 33.3M/44.7M [00:01<00:00, 27.8MB/s]
+ 83%|########2 | 37.0M/44.7M [00:01<00:00, 29.9MB/s]
+ 90%|########9 | 40.1M/44.7M [00:01<00:00, 28.3MB/s]
+ 96%|#########6| 43.0M/44.7M [00:01<00:00, 28.8MB/s]
+100%|##########| 44.7M/44.7M [00:01<00:00, 28.6MB/s]
</pre></div>
</div>
</div>
diff --git a/docs/how_to/compile_models/from_tensorflow.html b/docs/how_to/compile_models/from_tensorflow.html
index e3f9d3502..6a1dcc90a 100644
--- a/docs/how_to/compile_models/from_tensorflow.html
+++ b/docs/how_to/compile_models/from_tensorflow.html
@@ -631,7 +631,7 @@ banana (score = 0.00022)
desk (score = 0.00019)
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 4.119 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 1.170 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-tensorflow-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7f1d3d1b878694c201c614c807cdebc8/from_tensorflow.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_tensorflow.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/sg_execution_times.html b/docs/how_to/compile_models/sg_execution_times.html
index c73c688af..f5be286bf 100644
--- a/docs/how_to/compile_models/sg_execution_times.html
+++ b/docs/how_to/compile_models/sg_execution_times.html
@@ -322,7 +322,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-compile-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>05:47.852</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
+<p><strong>05:55.609</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 81%" />
@@ -331,43 +331,43 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="from_paddle.html#sphx-glr-how-to-compile-models-from-paddle-py"><span class="std std-ref">Compile PaddlePaddle Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_paddle.py</span></code>)</p></td>
-<td><p>01:07.006</p></td>
+<td><p>01:07.747</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_tensorflow.html#sphx-glr-how-to-compile-models-from-tensorflow-py"><span class="std std-ref">Compile Tensorflow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tensorflow.py</span></code>)</p></td>
-<td><p>01:04.119</p></td>
+<td><p>01:01.170</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_darknet.html#sphx-glr-how-to-compile-models-from-darknet-py"><span class="std std-ref">Compile YOLO-V2 and YOLO-V3 in DarkNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_darknet.py</span></code>)</p></td>
-<td><p>00:56.601</p></td>
+<td><p>00:57.992</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_oneflow.html#sphx-glr-how-to-compile-models-from-oneflow-py"><span class="std std-ref">Compile OneFlow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_oneflow.py</span></code>)</p></td>
-<td><p>00:40.653</p></td>
+<td><p>00:50.009</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_tflite.html#sphx-glr-how-to-compile-models-from-tflite-py"><span class="std std-ref">Compile TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tflite.py</span></code>)</p></td>
-<td><p>00:38.824</p></td>
+<td><p>00:36.725</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></td>
-<td><p>00:22.452</p></td>
+<td><p>00:22.695</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_coreml.html#sphx-glr-how-to-compile-models-from-coreml-py"><span class="std std-ref">Compile CoreML Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_coreml.py</span></code>)</p></td>
-<td><p>00:21.643</p></td>
+<td><p>00:21.516</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_pytorch.html#sphx-glr-how-to-compile-models-from-pytorch-py"><span class="std std-ref">Compile PyTorch Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_pytorch.py</span></code>)</p></td>
-<td><p>00:19.948</p></td>
+<td><p>00:20.885</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_keras.html#sphx-glr-how-to-compile-models-from-keras-py"><span class="std std-ref">Compile Keras Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_keras.py</span></code>)</p></td>
-<td><p>00:14.254</p></td>
+<td><p>00:14.522</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_onnx.html#sphx-glr-how-to-compile-models-from-onnx-py"><span class="std std-ref">Compile ONNX Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_onnx.py</span></code>)</p></td>
-<td><p>00:02.351</p></td>
+<td><p>00:02.349</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/deploy_models/deploy_model_on_android.html b/docs/how_to/deploy_models/deploy_model_on_android.html
index b838d0eb4..4790bbf8d 100644
--- a/docs/how_to/deploy_models/deploy_model_on_android.html
+++ b/docs/how_to/deploy_models/deploy_model_on_android.html
@@ -648,7 +648,7 @@ to the remote android device.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 16.0557 15.8693 16.5941 15.7658 0.3119
+ 15.9465 15.9620 16.0734 15.7781 0.0909
</pre></div>
</div>
</div>
diff --git a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
index 2cffae506..c9ce36593 100644
--- a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
+++ b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
@@ -431,17 +431,15 @@ be unstable.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
0%| | 0.00/170M [00:00<?, ?B/s]
- 6%|5 | 9.44M/170M [00:00<00:01, 98.6MB/s]
- 15%|#5 | 25.9M/170M [00:00<00:01, 142MB/s]
- 25%|##5 | 42.5M/170M [00:00<00:00, 156MB/s]
- 35%|###4 | 58.8M/170M [00:00<00:00, 162MB/s]
- 44%|####4 | 75.3M/170M [00:00<00:00, 166MB/s]
- 54%|#####4 | 91.9M/170M [00:00<00:00, 169MB/s]
- 64%|######3 | 108M/170M [00:00<00:00, 170MB/s]
- 74%|#######3 | 125M/170M [00:00<00:00, 171MB/s]
- 83%|########3 | 141M/170M [00:00<00:00, 171MB/s]
- 93%|#########2| 158M/170M [00:01<00:00, 172MB/s]
-100%|##########| 170M/170M [00:01<00:00, 166MB/s]
+ 8%|8 | 14.2M/170M [00:00<00:01, 148MB/s]
+ 21%|## | 35.6M/170M [00:00<00:00, 193MB/s]
+ 34%|###3 | 56.9M/170M [00:00<00:00, 207MB/s]
+ 46%|####6 | 78.3M/170M [00:00<00:00, 213MB/s]
+ 59%|#####8 | 99.7M/170M [00:00<00:00, 217MB/s]
+ 71%|#######1 | 121M/170M [00:00<00:00, 218MB/s]
+ 84%|########3 | 142M/170M [00:00<00:00, 219MB/s]
+ 96%|#########6| 163M/170M [00:00<00:00, 221MB/s]
+100%|##########| 170M/170M [00:00<00:00, 214MB/s]
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:3878: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
for i in range(dim)
/usr/local/lib/python3.7/dist-packages/torchvision/models/detection/anchor_utils.py:127: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
@@ -536,7 +534,7 @@ torchvision rcnn models.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Get 9 valid boxes
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 51.510 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 55.836 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-object-detection-pytorch-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7795da4b258c8feff986668b95ef57ad/deploy_object_detection_pytorch.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_object_detection_pytorch.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized.html b/docs/how_to/deploy_models/deploy_prequantized.html
index a831892ea..add09ffc8 100644
--- a/docs/how_to/deploy_models/deploy_prequantized.html
+++ b/docs/how_to/deploy_models/deploy_prequantized.html
@@ -472,7 +472,8 @@ training. Other models require a full post training calibration.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
0%| | 0.00/13.6M [00:00<?, ?B/s]
-100%|##########| 13.6M/13.6M [00:00<00:00, 153MB/s]
+ 68%|######7 | 9.20M/13.6M [00:00<00:00, 95.6MB/s]
+100%|##########| 13.6M/13.6M [00:00<00:00, 110MB/s]
</pre></div>
</div>
</div>
@@ -561,7 +562,7 @@ output values are identical out of 1000 outputs from mobilenet v2.</p>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 90.5119 90.4735 93.1524 90.1430 0.3397
+ 90.3645 90.2495 96.8307 90.1843 0.6696
</pre></div>
</div>
<div class="admonition note">
@@ -600,7 +601,7 @@ This includes support for the VNNI 8 bit dot product instruction (CascadeLake or
<div class="section" id="deploy-a-quantized-tflite-model">
<h2>Deploy a quantized TFLite Model<a class="headerlink" href="#deploy-a-quantized-tflite-model" title="Permalink to this headline">¶</a></h2>
<p>TODO</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 5.729 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 7.076 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/fb8217c13f4351224c6cf3aacf1a87fc/deploy_prequantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized_tflite.html b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
index 959b110fe..f299b9d09 100644
--- a/docs/how_to/deploy_models/deploy_prequantized_tflite.html
+++ b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
@@ -565,7 +565,7 @@ TFLite Top-5 labels: [387 102 386 341 349]
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 118.8315 118.7852 125.2959 117.9721 0.7412
+ 119.8498 119.8288 121.2662 119.0162 0.3249
</pre></div>
</div>
<div class="admonition note">
@@ -593,7 +593,7 @@ network for ARM CPU</span></a>.</p></li>
</ul>
</div></blockquote>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 9.625 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 3.701 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-tflite-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/56691c7a27d45da61d112276334640d3/deploy_prequantized_tflite.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized_tflite.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_quantized.html b/docs/how_to/deploy_models/deploy_quantized.html
index 244f0c0b3..e30c094dc 100644
--- a/docs/how_to/deploy_models/deploy_quantized.html
+++ b/docs/how_to/deploy_models/deploy_quantized.html
@@ -504,7 +504,7 @@ for calibration. But the accuracy might be impacted.</p>
DeprecationWarning,
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 14.689 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 32.299 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-quantized-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7810ecf51bfc05f7d5e8a400ac3e815d/deploy_quantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_quantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
index c583e9586..99ebcf575 100644
--- a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
+++ b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
@@ -436,24 +436,23 @@ to your device.</p>
Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
0%| | 0/132723 [00:00<?, ?KB/s]
- 2%|1 | 2047/132723 [00:00<00:06, 20406.67KB/s]
- 5%|4 | 6578/132723 [00:00<00:03, 35035.25KB/s]
- 11%|# | 14180/132723 [00:00<00:02, 53734.70KB/s]
- 17%|#6 | 22154/132723 [00:00<00:01, 63994.50KB/s]
- 23%|##2 | 30073/132723 [00:00<00:01, 69470.33KB/s]
- 29%|##8 | 38102/132723 [00:00<00:01, 73147.09KB/s]
- 35%|###4 | 46097/132723 [00:00<00:01, 75365.68KB/s]
- 41%|#### | 54077/132723 [00:00<00:01, 76774.94KB/s]
- 47%|####6 | 62103/132723 [00:00<00:00, 77862.69KB/s]
- 53%|#####2 | 70157/132723 [00:01<00:00, 78685.87KB/s]
- 59%|#####8 | 78228/132723 [00:01<00:00, 79301.38KB/s]
- 65%|######5 | 86317/132723 [00:01<00:00, 79781.27KB/s]
- 71%|#######1 | 94447/132723 [00:01<00:00, 80239.53KB/s]
- 77%|#######7 | 102596/132723 [00:01<00:00, 80616.06KB/s]
- 83%|########3 | 110731/132723 [00:01<00:00, 80836.08KB/s]
- 90%|########9 | 118922/132723 [00:01<00:00, 81158.05KB/s]
- 96%|#########5| 127088/132723 [00:01<00:00, 81306.03KB/s]
-100%|##########| 132723/132723 [00:01<00:00, 74969.03KB/s]
+ 2%|2 | 3124/132723 [00:00<00:04, 31238.61KB/s]
+ 8%|7 | 10039/132723 [00:00<00:02, 53537.03KB/s]
+ 14%|#3 | 18569/132723 [00:00<00:01, 68032.11KB/s]
+ 21%|## | 27284/132723 [00:00<00:01, 75576.21KB/s]
+ 27%|##7 | 36034/132723 [00:00<00:01, 79872.26KB/s]
+ 34%|###3 | 44669/132723 [00:00<00:01, 82069.42KB/s]
+ 40%|#### | 53394/132723 [00:00<00:00, 83759.28KB/s]
+ 47%|####6 | 62119/132723 [00:00<00:00, 84868.29KB/s]
+ 53%|#####3 | 70833/132723 [00:00<00:00, 85572.69KB/s]
+ 60%|#####9 | 79608/132723 [00:01<00:00, 86241.46KB/s]
+ 67%|######6 | 88369/132723 [00:01<00:00, 86655.55KB/s]
+ 73%|#######3 | 97076/132723 [00:01<00:00, 86778.49KB/s]
+ 80%|#######9 | 105828/132723 [00:01<00:00, 87001.18KB/s]
+ 86%|########6 | 114574/132723 [00:01<00:00, 87137.46KB/s]
+ 93%|#########2| 123289/132723 [00:01<00:00, 87138.26KB/s]
+ 99%|#########9| 132003/132723 [00:01<00:00, 87121.86KB/s]
+100%|##########| 132723/132723 [00:01<00:00, 82401.45KB/s]
</pre></div>
</div>
<p>Create TVM runtime and do inference
@@ -496,7 +495,7 @@ Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from h
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 15.415 seconds)</p>
+<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 17.409 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-ssd-gluoncv-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/cccb17d28e5e8b2e94ea8cd5ec59f6ed/deploy_ssd_gluoncv.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_ssd_gluoncv.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/sg_execution_times.html b/docs/how_to/deploy_models/sg_execution_times.html
index 586333aba..ead859a5d 100644
--- a/docs/how_to/deploy_models/sg_execution_times.html
+++ b/docs/how_to/deploy_models/sg_execution_times.html
@@ -322,7 +322,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-deploy-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>10:27.491</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
+<p><strong>10:47.671</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 86%" />
@@ -331,31 +331,31 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_object_detection_pytorch.html#sphx-glr-how-to-deploy-models-deploy-object-detection-pytorch-py"><span class="std std-ref">Compile PyTorch Object Detection Models</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_object_detection_pytorch.py</span></code>)</p></td>
-<td><p>02:51.510</p></td>
+<td><p>02:55.836</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_ssd_gluoncv.html#sphx-glr-how-to-deploy-models-deploy-ssd-gluoncv-py"><span class="std std-ref">Deploy Single Shot Multibox Detector(SSD) model</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_ssd_gluoncv.py</span></code>)</p></td>
-<td><p>02:15.415</p></td>
+<td><p>02:17.409</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized_tflite.html#sphx-glr-how-to-deploy-models-deploy-prequantized-tflite-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite)</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized_tflite.py</span></code>)</p></td>
-<td><p>02:09.625</p></td>
+<td><p>02:03.701</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_quantized.html#sphx-glr-how-to-deploy-models-deploy-quantized-py"><span class="std std-ref">Deploy a Quantized Model on Cuda</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_quantized.py</span></code>)</p></td>
-<td><p>01:14.689</p></td>
+<td><p>01:32.299</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized.html#sphx-glr-how-to-deploy-models-deploy-prequantized-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized.py</span></code>)</p></td>
-<td><p>01:05.729</p></td>
+<td><p>01:07.076</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_android.html#sphx-glr-how-to-deploy-models-deploy-model-on-android-py"><span class="std std-ref">Deploy the Pretrained Model on Android</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_android.py</span></code>)</p></td>
-<td><p>00:28.241</p></td>
+<td><p>00:28.877</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_model_on_rasp.html#sphx-glr-how-to-deploy-models-deploy-model-on-rasp-py"><span class="std std-ref">Deploy the Pretrained Model on Raspberry Pi</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_rasp.py</span></code>)</p></td>
-<td><p>00:22.277</p></td>
+<td><p>00:22.467</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_sparse.html#sphx-glr-how-to-deploy-models-deploy-sparse-py"><span class="std std-ref">Deploy a Hugging Face Pruned Model on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_sparse.py</span></code>)</p></td>
diff --git a/docs/how_to/extend_tvm/bring_your_own_datatypes.html b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
index 842dd5a60..141021fdf 100644
--- a/docs/how_to/extend_tvm/bring_your_own_datatypes.html
+++ b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
@@ -604,7 +604,7 @@ In this alpha state of the Bring Your Own Datatypes framework, we have not imple
<span class="n">module</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#dict" title="builtins.dict" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">params</span></a> <span class="o">=</span> <span class="n">get_mobilenet</span><span class="p">()</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip28234e9f-61c7-4f97-854a-564afd9983fc from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip8ca90524-9141-4870-9ca3-1f55665bdfc5 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
</pre></div>
</div>
<p>It’s easy to execute MobileNet with native TVM:</p>
diff --git a/docs/how_to/extend_tvm/sg_execution_times.html b/docs/how_to/extend_tvm/sg_execution_times.html
index ca5877a1d..bd634d8cc 100644
--- a/docs/how_to/extend_tvm/sg_execution_times.html
+++ b/docs/how_to/extend_tvm/sg_execution_times.html
@@ -322,7 +322,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-extend-tvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:41.355</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
+<p><strong>00:40.008</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 84%" />
@@ -331,15 +331,15 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="bring_your_own_datatypes.html#sphx-glr-how-to-extend-tvm-bring-your-own-datatypes-py"><span class="std std-ref">Bring Your Own Datatypes to TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">bring_your_own_datatypes.py</span></code>)</p></td>
-<td><p>00:38.244</p></td>
+<td><p>00:36.865</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="use_pass_instrument.html#sphx-glr-how-to-extend-tvm-use-pass-instrument-py"><span class="std std-ref">How to Use TVM Pass Instrument</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_instrument.py</span></code>)</p></td>
-<td><p>00:02.187</p></td>
+<td><p>00:02.214</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="use_pass_infra.html#sphx-glr-how-to-extend-tvm-use-pass-infra-py"><span class="std std-ref">How to Use TVM Pass Infra</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_infra.py</span></code>)</p></td>
-<td><p>00:00.917</p></td>
+<td><p>00:00.923</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="low_level_custom_pass.html#sphx-glr-how-to-extend-tvm-low-level-custom-pass-py"><span class="std std-ref">Writing a Customized Pass</span></a> (<code class="docutils literal notranslate"><span class="pre">low_level_custom_pass.py</span></code>)</p></td>
diff --git a/docs/how_to/extend_tvm/use_pass_instrument.html b/docs/how_to/extend_tvm/use_pass_instrument.html
index 60e9646cf..86c6369e5 100644
--- a/docs/how_to/extend_tvm/use_pass_instrument.html
+++ b/docs/how_to/extend_tvm/use_pass_instrument.html
@@ -507,10 +507,10 @@ profile the execution time of each passes.</p>
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 6848us [6848us] (45.94%; 45.94%)
-FoldScaleAxis: 8058us [6us] (54.06%; 54.06%)
- FoldConstant: 8051us [1612us] (54.02%; 99.92%)
- InferType: 6439us [6439us] (43.20%; 79.98%)
+InferType: 6849us [6849us] (46.58%; 46.58%)
+FoldScaleAxis: 7855us [6us] (53.42%; 53.42%)
+ FoldConstant: 7849us [1571us] (53.38%; 99.92%)
+ InferType: 6278us [6278us] (42.70%; 79.99%)
</pre></div>
</div>
</div>
@@ -532,10 +532,10 @@ Refer to following sections and <a class="reference internal" href="../../refere
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 6432us [6432us] (44.63%; 44.63%)
-FoldScaleAxis: 7981us [6us] (55.37%; 55.37%)
- FoldConstant: 7975us [1667us] (55.33%; 99.92%)
- InferType: 6308us [6308us] (43.77%; 79.10%)
+InferType: 6312us [6312us] (44.55%; 44.55%)
+FoldScaleAxis: 7857us [5us] (55.45%; 55.45%)
+ FoldConstant: 7852us [1593us] (55.41%; 99.93%)
+ InferType: 6259us [6259us] (44.17%; 79.72%)
</pre></div>
</div>
<p>Register empty list to clear existing instruments.</p>
diff --git a/docs/how_to/optimize_operators/opt_conv_cuda.html b/docs/how_to/optimize_operators/opt_conv_cuda.html
index 31ad7a377..bfaf6e4c7 100644
--- a/docs/how_to/optimize_operators/opt_conv_cuda.html
+++ b/docs/how_to/optimize_operators/opt_conv_cuda.html
@@ -556,7 +556,7 @@ latency of convolution.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Convolution: </span><span class="si">%f</span><span class="s2"> ms"</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">b</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">*</span> <span cl [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 54.167125 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 43.185978 ms
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-optimize-operators-opt-conv-cuda-py">
diff --git a/docs/how_to/optimize_operators/opt_conv_tensorcore.html b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
index ff4289265..4e9695122 100644
--- a/docs/how_to/optimize_operators/opt_conv_tensorcore.html
+++ b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
@@ -898,7 +898,7 @@ be able to run on our build server</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"conv2d with tensor core: </span><span class="si">%f</span><span class="s2"> ms"</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">* [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 6.873242 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 11.878459 ms
</pre></div>
</div>
</div>
diff --git a/docs/how_to/optimize_operators/opt_gemm.html b/docs/how_to/optimize_operators/opt_gemm.html
index 440b044a2..888820480 100644
--- a/docs/how_to/optimize_operators/opt_gemm.html
+++ b/docs/how_to/optimize_operators/opt_gemm.html
@@ -453,8 +453,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
<span class="nb">print</span><span class="p">(</span><span class="s2">"Baseline: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018172
-Baseline: 3.395604
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018950
+Baseline: 3.338291
</pre></div>
</div>
<p>In TVM, we can always inspect lower level IR to debug or optimize our schedule.
@@ -514,7 +514,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt1: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.300426
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.308090
</pre></div>
</div>
<p>Here is the generated IR after blocking.</p>
@@ -581,7 +581,7 @@ vastly.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt2: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.332632
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.331753
</pre></div>
</div>
<p>Here is the generated IR after vectorization.</p>
@@ -642,7 +642,7 @@ the access pattern for A matrix is more cache friendly.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt3: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.117883
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.121683
</pre></div>
</div>
<p>Here is the generated IR after loop permutation.</p>
@@ -725,7 +725,7 @@ flattening.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt4: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.110434
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.111281
</pre></div>
</div>
<p>Here is the generated IR after array packing.</p>
@@ -811,7 +811,7 @@ write to C when all the block results are ready.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt5: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.111427
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.111284
</pre></div>
</div>
<p>Here is the generated IR after blocking.</p>
@@ -901,7 +901,7 @@ write to C when all the block results are ready.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt6: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">opt6_time</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.145331
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.145534
</pre></div>
</div>
<p>Here is the generated IR after parallelization.</p>
diff --git a/docs/how_to/optimize_operators/sg_execution_times.html b/docs/how_to/optimize_operators/sg_execution_times.html
index 872171522..f621f57ec 100644
--- a/docs/how_to/optimize_operators/sg_execution_times.html
+++ b/docs/how_to/optimize_operators/sg_execution_times.html
@@ -322,7 +322,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-optimize-operators-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:34.322</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
+<p><strong>00:34.419</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -331,15 +331,15 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="opt_gemm.html#sphx-glr-how-to-optimize-operators-opt-gemm-py"><span class="std std-ref">How to optimize GEMM on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_gemm.py</span></code>)</p></td>
-<td><p>00:32.069</p></td>
+<td><p>00:32.140</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="opt_conv_tensorcore.html#sphx-glr-how-to-optimize-operators-opt-conv-tensorcore-py"><span class="std std-ref">How to optimize convolution using TensorCores</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_tensorcore.py</span></code>)</p></td>
-<td><p>00:01.234</p></td>
+<td><p>00:01.283</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="opt_conv_cuda.html#sphx-glr-how-to-optimize-operators-opt-conv-cuda-py"><span class="std std-ref">How to optimize convolution on GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_cuda.py</span></code>)</p></td>
-<td><p>00:01.020</p></td>
+<td><p>00:00.996</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
index 149a24b1e..5cc0ef3d3 100644
--- a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
+++ b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
@@ -322,7 +322,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-tune-with-autoscheduler-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>05:12.442</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
+<p><strong>05:12.594</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 85%" />
@@ -331,27 +331,27 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_layer_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py"><span class="std std-ref">Auto-scheduling a Convolution Layer for GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_layer_cuda.py</span></code>)</p></td>
-<td><p>02:35.530</p></td>
+<td><p>02:34.321</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_network_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-x86-py"><span class="std std-ref">Auto-scheduling a Neural Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_x86.py</span></code>)</p></td>
-<td><p>01:19.801</p></td>
+<td><p>01:20.685</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_network_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-cuda-py"><span class="std std-ref">Auto-scheduling a Neural Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_cuda.py</span></code>)</p></td>
-<td><p>00:42.629</p></td>
+<td><p>00:43.097</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_sparse_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-sparse-x86-py"><span class="std std-ref">Auto-scheduling Sparse Matrix Multiplication on CPU with Custom Sketch Rule</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_sparse_x86.py</span></code>)</p></td>
-<td><p>00:17.693</p></td>
+<td><p>00:17.412</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_network_mali.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-mali-py"><span class="std std-ref">Auto-scheduling a Neural Network for mali GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_mali.py</span></code>)</p></td>
-<td><p>00:08.451</p></td>
+<td><p>00:08.696</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_network_arm.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-arm-py"><span class="std std-ref">Auto-scheduling a Neural Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_arm.py</span></code>)</p></td>
-<td><p>00:08.338</p></td>
+<td><p>00:08.384</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
index 5f628f38c..07fe78b8a 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
@@ -486,72 +486,128 @@ cooperative fetching, unrolling and operator fusion.</p>
compute: Buffer(compute_2: Pointer(float32), float32, [25088], [])}
buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute}
preflattened_buffer_map = {data_1: data_3: Buffer(data_2, float32, [1, 512, 7, 7], []), kernel_1: kernel_3: Buffer(kernel_2, float32, [512, 512, 3, 3], []), bias_1: bias_3: Buffer(bias_2, float32, [1, 512, 1, 1], []), compute_1: compute_3: Buffer(compute_2, float32, [1, 512, 7, 7], [])} {
- attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 64;
- allocate(conv2d_nchw: Pointer(local float32), float32, [2]), storage_scope = local;
- allocate(pad_temp.shared: Pointer(shared float32), float32, [2016]), storage_scope = shared;
- allocate(kernel.shared: Pointer(shared float32), float32, [768]), storage_scope = shared;
- attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196 {
- conv2d_nchw_1: Buffer(conv2d_nchw, float32, [1], [], scope="local", align=4)[0] = 0f32
+ attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 112;
+ allocate(conv2d_nchw: Pointer(local float32), float32, [7]), storage_scope = local;
+ allocate(pad_temp.shared: Pointer(shared float32), float32, [144]), storage_scope = shared;
+ allocate(kernel.shared: Pointer(shared float32), float32, [1536]), storage_scope = shared;
+ attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32 {
+ conv2d_nchw_1: Buffer(conv2d_nchw, float32, [7], [], scope="local", align=16)[0] = 0f32
conv2d_nchw_1[1] = 0f32
- for (rc.outer.outer: int32, 0, 16) {
+ conv2d_nchw_1[2] = 0f32
+ conv2d_nchw_1[3] = 0f32
+ conv2d_nchw_1[4] = 0f32
+ conv2d_nchw_1[5] = 0f32
+ conv2d_nchw_1[6] = 0f32
+ for (rc.outer.outer: int32, 0, 32) {
for (ry.outer.outer: int32, 0, 3) {
- let cse_var_2: int32 = (rc.outer.outer*1568)
- let cse_var_1: int32 = (ry.outer.outer*7)
+ let cse_var_4: int32 = (rc.outer.outer*784)
+ let cse_var_3: int32 = (ry.outer.outer*7)
+ let cse_var_2: int32 = (rc.outer.outer*144)
+ let cse_var_1: int32 = (ry.outer.outer*3)
{
- attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1: Buffer(pad_temp.shared, float32, [2016], [], scope="shared")[threadIdx.x_1] = @tir.if_then_else(((((1 <= (floordiv(floormod(threadIdx.x_1, 63), 9) + ry.outer.outer)) && ((floordiv(floormod(threadIdx.x_1, 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 9)*7)) + cse_var_1) + floormod(threadIdx.x_1, 9)) - 8)], 0f [...]
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1[(threadIdx.x_1 + 196)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 196), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 196), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 7), 9))) && (floormod((threadIdx.x_1 + 7), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 196), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 7), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1[(threadIdx.x_1 + 392)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 392), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 392), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 5), 9))) && (floormod((threadIdx.x_1 + 5), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 392), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 5), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1[(threadIdx.x_1 + 588)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 588), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 588), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 3), 9))) && (floormod((threadIdx.x_1 + 3), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 588), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 3), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1[(threadIdx.x_1 + 784)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 784), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 784), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 1), 9))) && (floormod((threadIdx.x_1 + 1), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 784), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 1), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1[(threadIdx.x_1 + 980)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 980), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 980), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 8), 9))) && (floormod((threadIdx.x_1 + 8), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 980), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 8), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1[(threadIdx.x_1 + 1176)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 1176), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 1176), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 6), 9))) && (floormod((threadIdx.x_1 + 6), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1176), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 6), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1[(threadIdx.x_1 + 1372)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 1372), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 1372), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 4), 9))) && (floormod((threadIdx.x_1 + 4), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1372), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 4), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1[(threadIdx.x_1 + 1568)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 1568), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 1568), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 2), 9))) && (floormod((threadIdx.x_1 + 2), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1568), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 2), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- pad_temp.shared_1[(threadIdx.x_1 + 1764)] = @tir.if_then_else(((((1 <= (floordiv(floormod(threadIdx.x_1, 63), 9) + ry.outer.outer)) && ((floordiv(floormod(threadIdx.x_1, 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 9)*7)) + cse_var_1) + floormod(threadIdx.x_1, 9)) + 1364)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- if @tir.likely((threadIdx.x_1 < 56), dtype=bool) {
- pad_temp.shared_1[(threadIdx.x_1 + 1960)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 1960), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 1960), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 7), 9))) && (floormod((threadIdx.x_1 + 7), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1960), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 7), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ pad_temp.shared_1: Buffer(pad_temp.shared, float32, [144], [], scope="shared")[threadIdx.x_1] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data[(((((cse_var_4 + (floordiv(threadIdx.x_1, 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod(threadIdx.x_1, 9)) - 8)], 0f32 [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ pad_temp.shared_1[(threadIdx.x_1 + 32)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 5), 9))) && (floormod((threadIdx.x_1 + 5), 9) < 8)), data[(((((cse_var_4 + (floordiv((threadIdx.x_1 + 32), 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1 + 5), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ pad_temp.shared_1[(threadIdx.x_1 + 64)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 1), 9))) && (floormod((threadIdx.x_1 + 1), 9) < 8)), data[(((((cse_var_4 + (floordiv((threadIdx.x_1 + 64), 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1 + 1), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ pad_temp.shared_1[(threadIdx.x_1 + 96)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 6), 9))) && (floormod((threadIdx.x_1 + 6), 9) < 8)), data[(((((cse_var_4 + (floordiv((threadIdx.x_1 + 96), 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1 + 6), 9)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ if @tir.likely((threadIdx.x_1 < 16), dtype=bool) {
+ pad_temp.shared_1[(threadIdx.x_1 + 128)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 2), 9))) && (floormod((threadIdx.x_1 + 2), 9) < 8)), data[(((((cse_var_4 + (floordiv((threadIdx.x_1 + 128), 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1 + 2), 9)) - 8)], 0f32, dtype=float32)
}
- attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196 {
- kernel.shared_1: Buffer(kernel.shared, float32, [768], [], scope="shared")[(threadIdx.x_2*3)] = kernel[(((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 32)*4608)) + (rc.outer.outer*288)) + (floormod(threadIdx.x_2, 32)*9)) + (ry.outer.outer*3))]
- kernel.shared_1[((threadIdx.x_2*3) + 1)] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 32)*4608)) + (rc.outer.outer*288)) + (floormod(threadIdx.x_2, 32)*9)) + (ry.outer.outer*3)) + 1)]
- kernel.shared_1[((threadIdx.x_2*3) + 2)] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 32)*4608)) + (rc.outer.outer*288)) + (floormod(threadIdx.x_2, 32)*9)) + (ry.outer.outer*3)) + 2)]
- }
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
- if @tir.likely((threadIdx.x_2 < 60), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*3) + 588)] = kernel[(((((blockIdx.x*36864) + (floordiv((floordiv(threadIdx.x_2, 4) + 49), 8)*4608)) + (rc.outer.outer*288)) + (floormod((threadIdx.x_2 + 4), 32)*9)) + (ry.outer.outer*3))]
- kernel.shared_1[((threadIdx.x_2*3) + 589)] = kernel[((((((blockIdx.x*36864) + (floordiv((floordiv(threadIdx.x_2, 4) + 49), 8)*4608)) + (rc.outer.outer*288)) + (floormod((threadIdx.x_2 + 4), 32)*9)) + (ry.outer.outer*3)) + 1)]
- kernel.shared_1[((threadIdx.x_2*3) + 590)] = kernel[((((((blockIdx.x*36864) + (floordiv((floordiv(threadIdx.x_2, 4) + 49), 8)*4608)) + (rc.outer.outer*288)) + (floormod((threadIdx.x_2 + 4), 32)*9)) + (ry.outer.outer*3)) + 2)]
- }
- for (rc.outer.inner: int32, 0, 16) {
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7))]*kernel.shared_1[((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6))]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7))]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 384)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 1)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 1)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 1)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 385)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 2)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 386)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 3)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 387)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 64)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 4)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 64)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 388)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 65)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 5)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 65)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 389)]))
+ attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1: Buffer(kernel.shared, float32, [1536], [], scope="shared")[ramp((threadIdx.x_2*4), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(threadIdx.x_2, 12)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp((threadIdx.x_2*4), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp(threadIdx.x_2, 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 128), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 128), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 128), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 32), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 256), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 256), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 256), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 64), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 384), 1, 4)] = kernel[(((broadcast(((((floordiv(blockIdx.x, 7)*147456) + (floordiv(threadIdx.x_2, 12)*4608)) + cse_var_2) + 36864), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 384), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 96), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 512), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 512), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 512), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 128), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 640), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 640), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 640), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 160), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 768), 1, 4)] = kernel[(((broadcast(((((floordiv(blockIdx.x, 7)*147456) + (floordiv(threadIdx.x_2, 12)*4608)) + cse_var_2) + 73728), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 768), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 192), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 896), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 896), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 896), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 224), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 1024), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 1024), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1024), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 256), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 1152), 1, 4)] = kernel[(((broadcast(((((floordiv(blockIdx.x, 7)*147456) + (floordiv(threadIdx.x_2, 12)*4608)) + cse_var_2) + 110592), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1152), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 288), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 1280), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 1280), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1280), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 320), 1, 4), broadcast(3, 4)))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+ kernel.shared_1[ramp(((threadIdx.x_2*4) + 1408), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 1408), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1408), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 352), 1, 4), broadcast(3, 4)))]
+ for (rc.outer.inner: int32, 0, 8) {
+ let cse_var_19: int32 = (rc.outer.inner*18)
+ let cse_var_18: int32 = (cse_var_19 + 7)
+ let cse_var_17: int32 = (cse_var_19 + 6)
+ let cse_var_16: int32 = (cse_var_19 + 5)
+ let cse_var_15: int32 = (cse_var_19 + 4)
+ let cse_var_14: int32 = (cse_var_19 + 3)
+ let cse_var_13: int32 = (cse_var_19 + 2)
+ let cse_var_12: int32 = (cse_var_19 + 16)
+ let cse_var_11: int32 = (cse_var_19 + 15)
+ let cse_var_10: int32 = (cse_var_19 + 14)
+ let cse_var_9: int32 = (cse_var_19 + 13)
+ let cse_var_8: int32 = (cse_var_19 + 12)
+ let cse_var_7: int32 = (cse_var_19 + 11)
+ let cse_var_6: int32 = (cse_var_19 + 10)
+ let cse_var_5: int32 = (cse_var_19 + 1)
+ {
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_19]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_5]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_13]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_14]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_15]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_16]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_17]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_5]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_13]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_14]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_15]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_16]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_17]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_18]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_13]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_14]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_15]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_16]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_17]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_18]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(cse_var_19 + 8)]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(cse_var_19 + 9)]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_6]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_7]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_8]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_9]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_10]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_11]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_6]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_7]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_8]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_9]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_10]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_11]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_12]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_7]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_8]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_9]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_10]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_11]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_12]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(cse_var_19 + 17)]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+ }
}
}
}
}
- compute[((blockIdx.x*392) + threadIdx.x)] = max((conv2d_nchw_1[0] + bias[((blockIdx.x*8) + floordiv(threadIdx.x, 49))]), 0f32)
- compute[(((blockIdx.x*392) + threadIdx.x) + 196)] = max((conv2d_nchw_1[1] + bias[(((blockIdx.x*8) + floordiv(threadIdx.x, 49)) + 4)]), 0f32)
+ for (i3.inner: int32, 0, 7) {
+ compute[((((floordiv(blockIdx.x, 7)*1568) + (threadIdx.x*49)) + (floormod(blockIdx.x, 7)*7)) + i3.inner)] = max((conv2d_nchw_1[i3.inner] + bias[((floordiv(blockIdx.x, 7)*32) + threadIdx.x)]), 0f32)
+ }
}
}
</pre></div>
@@ -587,7 +643,7 @@ cooperative fetching, unrolling and operator fusion.</p>
<span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.321 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.322 ms
</pre></div>
</div>
</div>
@@ -618,18 +674,18 @@ conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_
conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
-conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=4)
-conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=2)
+conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=32)
+conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
-conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7)
+conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
-conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
+conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=7)
conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
-conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
+conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
-conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=16)
+conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=8)
conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=3)
@@ -639,13 +695,13 @@ compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=1)
-compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=4)
-compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=2)
+compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=32)
+compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
-compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7)
+compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
-compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
-compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
+compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
+compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -663,14 +719,14 @@ s[compute].bind(compute_i0_o_o_i_i1_o_o_i_fused_i2_o_o_i_fused_i3_o_o_i_fused, t
compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused = s[compute].fuse(compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i)
s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread_axis("threadIdx.x"))
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
-kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=3)
+kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=196)
+kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=32)
s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=196)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=32)
s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 64)
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
@@ -690,55 +746,2561 @@ CUDA source code:
#define int64_t long long
#define uint64_t unsigned long long
#endif
-extern "C" __global__ void __launch_bounds__(196) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
- float conv2d_nchw[2];
- __shared__ float pad_temp_shared[2016];
- __shared__ float kernel_shared[768];
+extern "C" __global__ void __launch_bounds__(32) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+ float conv2d_nchw[7];
+ __shared__ float pad_temp_shared[144];
+ __shared__ float kernel_shared[1536];
conv2d_nchw[0] = 0.000000e+00f;
conv2d_nchw[1] = 0.000000e+00f;
- for (int rc_outer_outer = 0; rc_outer_outer < 16; ++rc_outer_outer) {
+ conv2d_nchw[2] = 0.000000e+00f;
+ conv2d_nchw[3] = 0.000000e+00f;
+ conv2d_nchw[4] = 0.000000e+00f;
+ conv2d_nchw[5] = 0.000000e+00f;
+ conv2d_nchw[6] = 0.000000e+00f;
+ for (int rc_outer_outer = 0; rc_outer_outer < 32; ++rc_outer_outer) {
for (int ry_outer_outer = 0; ry_outer_outer < 3; ++ry_outer_outer) {
__syncthreads();
- pad_temp_shared[((int)threadIdx.x)] = (((((1 <= (((((int)threadIdx.x) % 63) / 9) + ry_outer_outer)) && ((((((int)threadIdx.x) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + ((((int)threadIdx.x) / 9) * 7)) + (ry_outer_outer * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 196)] = (((((1 <= ((((((int)threadIdx.x) + 7) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 7) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 196) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 392)] = (((((1 <= ((((((int)threadIdx.x) + 14) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 14) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 5) % 9))) && (((((int)threadIdx.x) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 392) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 588)] = (((((1 <= ((((((int)threadIdx.x) + 21) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 21) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 3) % 9))) && (((((int)threadIdx.x) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 588) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 3) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 784)] = (((((1 <= ((((((int)threadIdx.x) + 28) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 28) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 1) % 9))) && (((((int)threadIdx.x) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 784) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 980)] = (((((1 <= ((((((int)threadIdx.x) + 35) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 35) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 980) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 1176)] = (((((1 <= ((((((int)threadIdx.x) + 42) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 42) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1176) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 1372)] = (((((1 <= ((((((int)threadIdx.x) + 49) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 49) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1372) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 1568)] = (((((1 <= ((((((int)threadIdx.x) + 56) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 56) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 2) % 9))) && (((((int)threadIdx.x) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1568) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 1764)] = (((((1 <= (((((int)threadIdx.x) % 63) / 9) + ry_outer_outer)) && ((((((int)threadIdx.x) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + ((((int)threadIdx.x) / 9) * 7)) + (ry_outer_outer * 7)) + (((int)threadIdx.x) % 9)) + 1364)] : 0.000000e+00f);
- if (((int)threadIdx.x) < 56) {
- pad_temp_shared[(((int)threadIdx.x) + 1960)] = (((((1 <= ((((((int)threadIdx.x) + 7) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 7) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1960) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
- }
- kernel_shared[(((int)threadIdx.x) * 3)] = kernel[(((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) >> 5) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) & 31) * 9)) + (ry_outer_outer * 3))];
- kernel_shared[((((int)threadIdx.x) * 3) + 1)] = kernel[((((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) >> 5) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) & 31) * 9)) + (ry_outer_outer * 3)) + 1)];
- kernel_shared[((((int)threadIdx.x) * 3) + 2)] = kernel[((((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) >> 5) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) & 31) * 9)) + (ry_outer_outer * 3)) + 2)];
- if (((int)threadIdx.x) < 60) {
- kernel_shared[((((int)threadIdx.x) * 3) + 588)] = kernel[(((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 4) & 31) * 9)) + (ry_outer_outer * 3))];
- kernel_shared[((((int)threadIdx.x) * 3) + 589)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 4) & 31) * 9)) + (ry_outer_outer * 3)) + 1)];
- kernel_shared[((((int)threadIdx.x) * 3) + 590)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 4) & 31) * 9)) + (ry_outer_outer * 3)) + 2)];
+ pad_temp_shared[((int)threadIdx.x)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + ((((int)threadIdx.x) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 32)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 5) % 9))) && (((((int)threadIdx.x) + 5) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 32) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 64)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 1) % 9))) && (((((int)threadIdx.x) + 1) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 64) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 96)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 96) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
+ if (((int)threadIdx.x) < 16) {
+ pad_temp_shared[(((int)threadIdx.x) + 128)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 2) % 9))) && (((((int)threadIdx.x) + 2) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 128) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
}
+ int4 _1;
+ int4 _2;
+ int4 _3;
+ int4 _4 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)));
+ int4 _5;
+ int4 _6;
+ int4 _7;
+ int4 _8 = make_int4(((((int)threadIdx.x) * 4))+(1*0), ((((int)threadIdx.x) * 4))+(1*1), ((((int)threadIdx.x) * 4))+(1*2), ((((int)threadIdx.x) * 4))+(1*3));
+ int4 _9 = make_int4(3, 3, 3, 3);
+ _7.x = (_8.x%_9.x);
+ _7.y = (_8.y%_9.y);
+ _7.z = (_8.z%_9.z);
+ _7.w = (_8.w%_9.w);
+ int4 _10;
+ int4 _11 = make_int4(((((int)threadIdx.x) * 4))+(1*0), ((((int)threadIdx.x) * 4))+(1*1), ((((int)threadIdx.x) * 4))+(1*2), ((((int)threadIdx.x) * 4))+(1*3));
+ int4 _12 = make_int4(3, 3, 3, 3);
+ _10.x = (_11.x/_12.x);
+ _10.y = (_11.y/_12.y);
+ _10.z = (_11.z/_12.z);
+ _10.w = (_11.w/_12.w);
+ int4 _13;
+ ushort4 _14;
+ ushort4 _15;
+ ushort4 _16;
+ int4 _17 = make_int4(3, 3, 3, 3);
+ int4 _18 = make_int4(0, 0, 0, 0);
+ _16.x = (_17.x>=_18.x);
+ _16.y = (_17.y>=_18.y);
+ _16.z = (_17.z>=_18.z);
+ _16.w = (_17.w>=_18.w);
+ ushort4 _19;
+ int4 _20 = make_int4(0, 0, 0, 0);
+ _19.x = (_7.x>=_20.x);
+ _19.y = (_7.y>=_20.y);
+ _19.z = (_7.z>=_20.z);
+ _19.w = (_7.w>=_20.w);
+ _15.x = (_16.x&&_19.x);
+ _15.y = (_16.y&&_19.y);
+ _15.z = (_16.z&&_19.z);
+ _15.w = (_16.w&&_19.w);
+ ushort4 _21;
+ ushort4 _22;
+ int4 _23 = make_int4(3, 3, 3, 3);
+ int4 _24 = make_int4(0, 0, 0, 0);
+ _22.x = (_23.x<_24.x);
+ _22.y = (_23.y<_24.y);
+ _22.z = (_23.z<_24.z);
+ _22.w = (_23.w<_24.w);
+ ushort4 _25;
+ int4 _26 = make_int4(0, 0, 0, 0);
+ _25.x = (_7.x<=_26.x);
+ _25.y = (_7.y<=_26.y);
+ _25.z = (_7.z<=_26.z);
+ _25.w = (_7.w<=_26.w);
+ _21.x = (_22.x&&_25.x);
+ _21.y = (_22.y&&_25.y);
+ _21.z = (_22.z&&_25.z);
+ _21.w = (_22.w&&_25.w);
+ _14.x = (_15.x||_21.x);
+ _14.y = (_15.y||_21.y);
+ _14.z = (_15.z||_21.z);
+ _14.w = (_15.w||_21.w);
+ int4 _27;
+ int4 _28 = make_int4(1, 1, 1, 1);
+ _27.x = (_10.x-_28.x);
+ _27.y = (_10.y-_28.y);
+ _27.z = (_10.z-_28.z);
+ _27.w = (_10.w-_28.w);
+ _13.x = (bool(_14.x)?_10.x:_27.x);
+ _13.y = (bool(_14.y)?_10.y:_27.y);
+ _13.z = (bool(_14.z)?_10.z:_27.z);
+ _13.w = (bool(_14.w)?_10.w:_27.w);
+ int4 _29 = make_int4(16, 16, 16, 16);
+ _6.x = (_13.x%_29.x);
+ _6.y = (_13.y%_29.y);
+ _6.z = (_13.z%_29.z);
+ _6.w = (_13.w%_29.w);
+ int4 _30;
+ ushort4 _31;
+ ushort4 _32;
+ ushort4 _33;
+ int4 _34 = make_int4(16, 16, 16, 16);
+ int4 _35 = make_int4(0, 0, 0, 0);
+ _33.x = (_34.x>=_35.x);
+ _33.y = (_34.y>=_35.y);
+ _33.z = (_34.z>=_35.z);
+ _33.w = (_34.w>=_35.w);
+ ushort4 _36;
+ int4 _37 = make_int4(0, 0, 0, 0);
+ _36.x = (_6.x>=_37.x);
+ _36.y = (_6.y>=_37.y);
+ _36.z = (_6.z>=_37.z);
+ _36.w = (_6.w>=_37.w);
+ _32.x = (_33.x&&_36.x);
+ _32.y = (_33.y&&_36.y);
+ _32.z = (_33.z&&_36.z);
+ _32.w = (_33.w&&_36.w);
+ ushort4 _38;
+ ushort4 _39;
+ int4 _40 = make_int4(16, 16, 16, 16);
+ int4 _41 = make_int4(0, 0, 0, 0);
+ _39.x = (_40.x<_41.x);
+ _39.y = (_40.y<_41.y);
+ _39.z = (_40.z<_41.z);
+ _39.w = (_40.w<_41.w);
+ ushort4 _42;
+ int4 _43 = make_int4(0, 0, 0, 0);
+ _42.x = (_6.x<=_43.x);
+ _42.y = (_6.y<=_43.y);
+ _42.z = (_6.z<=_43.z);
+ _42.w = (_6.w<=_43.w);
+ _38.x = (_39.x&&_42.x);
+ _38.y = (_39.y&&_42.y);
+ _38.z = (_39.z&&_42.z);
+ _38.w = (_39.w&&_42.w);
+ _31.x = (_32.x||_38.x);
+ _31.y = (_32.y||_38.y);
+ _31.z = (_32.z||_38.z);
+ _31.w = (_32.w||_38.w);
+ int4 _44;
+ int4 _45 = make_int4(16, 16, 16, 16);
+ _44.x = (_6.x+_45.x);
+ _44.y = (_6.y+_45.y);
+ _44.z = (_6.z+_45.z);
+ _44.w = (_6.w+_45.w);
+ _30.x = (bool(_31.x)?_6.x:_44.x);
+ _30.y = (bool(_31.y)?_6.y:_44.y);
+ _30.z = (bool(_31.z)?_6.z:_44.z);
+ _30.w = (bool(_31.w)?_6.w:_44.w);
+ int4 _46 = make_int4(9, 9, 9, 9);
+ _5.x = (_30.x*_46.x);
+ _5.y = (_30.y*_46.y);
+ _5.z = (_30.z*_46.z);
+ _5.w = (_30.w*_46.w);
+ _3.x = (_4.x+_5.x);
+ _3.y = (_4.y+_5.y);
+ _3.z = (_4.z+_5.z);
+ _3.w = (_4.w+_5.w);
+ int4 _47 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _2.x = (_3.x+_47.x);
+ _2.y = (_3.y+_47.y);
+ _2.z = (_3.z+_47.z);
+ _2.w = (_3.w+_47.w);
+ int4 _48;
+ int4 _49 = make_int4((((int)threadIdx.x))+(1*0), (((int)threadIdx.x))+(1*1), (((int)threadIdx.x))+(1*2), (((int)threadIdx.x))+(1*3));
+ int4 _50 = make_int4(3, 3, 3, 3);
+ _48.x = (_49.x%_50.x);
+ _48.y = (_49.y%_50.y);
+ _48.z = (_49.z%_50.z);
+ _48.w = (_49.w%_50.w);
+ int4 _51;
+ ushort4 _52;
+ ushort4 _53;
+ ushort4 _54;
+ int4 _55 = make_int4(3, 3, 3, 3);
+ int4 _56 = make_int4(0, 0, 0, 0);
+ _54.x = (_55.x>=_56.x);
+ _54.y = (_55.y>=_56.y);
+ _54.z = (_55.z>=_56.z);
+ _54.w = (_55.w>=_56.w);
+ ushort4 _57;
+ int4 _58 = make_int4(0, 0, 0, 0);
+ _57.x = (_48.x>=_58.x);
+ _57.y = (_48.y>=_58.y);
+ _57.z = (_48.z>=_58.z);
+ _57.w = (_48.w>=_58.w);
+ _53.x = (_54.x&&_57.x);
+ _53.y = (_54.y&&_57.y);
+ _53.z = (_54.z&&_57.z);
+ _53.w = (_54.w&&_57.w);
+ ushort4 _59;
+ ushort4 _60;
+ int4 _61 = make_int4(3, 3, 3, 3);
+ int4 _62 = make_int4(0, 0, 0, 0);
+ _60.x = (_61.x<_62.x);
+ _60.y = (_61.y<_62.y);
+ _60.z = (_61.z<_62.z);
+ _60.w = (_61.w<_62.w);
+ ushort4 _63;
+ int4 _64 = make_int4(0, 0, 0, 0);
+ _63.x = (_48.x<=_64.x);
+ _63.y = (_48.y<=_64.y);
+ _63.z = (_48.z<=_64.z);
+ _63.w = (_48.w<=_64.w);
+ _59.x = (_60.x&&_63.x);
+ _59.y = (_60.y&&_63.y);
+ _59.z = (_60.z&&_63.z);
+ _59.w = (_60.w&&_63.w);
+ _52.x = (_53.x||_59.x);
+ _52.y = (_53.y||_59.y);
+ _52.z = (_53.z||_59.z);
+ _52.w = (_53.w||_59.w);
+ int4 _65;
+ int4 _66 = make_int4(3, 3, 3, 3);
+ _65.x = (_48.x+_66.x);
+ _65.y = (_48.y+_66.y);
+ _65.z = (_48.z+_66.z);
+ _65.w = (_48.w+_66.w);
+ _51.x = (bool(_52.x)?_48.x:_65.x);
+ _51.y = (bool(_52.y)?_48.y:_65.y);
+ _51.z = (bool(_52.z)?_48.z:_65.z);
+ _51.w = (bool(_52.w)?_48.w:_65.w);
+ _1.x = (_2.x+_51.x);
+ _1.y = (_2.y+_51.y);
+ _1.z = (_2.z+_51.z);
+ _1.w = (_2.w+_51.w);
+ *(float4*)(kernel_shared + (((int)threadIdx.x) * 4)) = make_float4(kernel[_1.x],kernel[_1.y],kernel[_1.z],kernel[_1.w]);
+ int4 _67;
+ int4 _68;
+ int4 _69;
+ int4 _70 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 128) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 128) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 128) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 128) / 48) * 4608)) + (rc_outer_outer * 144)));
+ int4 _71;
+ int4 _72;
+ int4 _73;
+ int4 _74 = make_int4((((((int)threadIdx.x) * 4) + 128))+(1*0), (((((int)threadIdx.x) * 4) + 128))+(1*1), (((((int)threadIdx.x) * 4) + 128))+(1*2), (((((int)threadIdx.x) * 4) + 128))+(1*3));
+ int4 _75 = make_int4(3, 3, 3, 3);
+ _73.x = (_74.x%_75.x);
+ _73.y = (_74.y%_75.y);
+ _73.z = (_74.z%_75.z);
+ _73.w = (_74.w%_75.w);
+ int4 _76;
+ int4 _77 = make_int4((((((int)threadIdx.x) * 4) + 128))+(1*0), (((((int)threadIdx.x) * 4) + 128))+(1*1), (((((int)threadIdx.x) * 4) + 128))+(1*2), (((((int)threadIdx.x) * 4) + 128))+(1*3));
+ int4 _78 = make_int4(3, 3, 3, 3);
+ _76.x = (_77.x/_78.x);
+ _76.y = (_77.y/_78.y);
+ _76.z = (_77.z/_78.z);
+ _76.w = (_77.w/_78.w);
+ int4 _79;
+ ushort4 _80;
+ ushort4 _81;
+ ushort4 _82;
+ int4 _83 = make_int4(3, 3, 3, 3);
+ int4 _84 = make_int4(0, 0, 0, 0);
+ _82.x = (_83.x>=_84.x);
+ _82.y = (_83.y>=_84.y);
+ _82.z = (_83.z>=_84.z);
+ _82.w = (_83.w>=_84.w);
+ ushort4 _85;
+ int4 _86 = make_int4(0, 0, 0, 0);
+ _85.x = (_73.x>=_86.x);
+ _85.y = (_73.y>=_86.y);
+ _85.z = (_73.z>=_86.z);
+ _85.w = (_73.w>=_86.w);
+ _81.x = (_82.x&&_85.x);
+ _81.y = (_82.y&&_85.y);
+ _81.z = (_82.z&&_85.z);
+ _81.w = (_82.w&&_85.w);
+ ushort4 _87;
+ ushort4 _88;
+ int4 _89 = make_int4(3, 3, 3, 3);
+ int4 _90 = make_int4(0, 0, 0, 0);
+ _88.x = (_89.x<_90.x);
+ _88.y = (_89.y<_90.y);
+ _88.z = (_89.z<_90.z);
+ _88.w = (_89.w<_90.w);
+ ushort4 _91;
+ int4 _92 = make_int4(0, 0, 0, 0);
+ _91.x = (_73.x<=_92.x);
+ _91.y = (_73.y<=_92.y);
+ _91.z = (_73.z<=_92.z);
+ _91.w = (_73.w<=_92.w);
+ _87.x = (_88.x&&_91.x);
+ _87.y = (_88.y&&_91.y);
+ _87.z = (_88.z&&_91.z);
+ _87.w = (_88.w&&_91.w);
+ _80.x = (_81.x||_87.x);
+ _80.y = (_81.y||_87.y);
+ _80.z = (_81.z||_87.z);
+ _80.w = (_81.w||_87.w);
+ int4 _93;
+ int4 _94 = make_int4(1, 1, 1, 1);
+ _93.x = (_76.x-_94.x);
+ _93.y = (_76.y-_94.y);
+ _93.z = (_76.z-_94.z);
+ _93.w = (_76.w-_94.w);
+ _79.x = (bool(_80.x)?_76.x:_93.x);
+ _79.y = (bool(_80.y)?_76.y:_93.y);
+ _79.z = (bool(_80.z)?_76.z:_93.z);
+ _79.w = (bool(_80.w)?_76.w:_93.w);
+ int4 _95 = make_int4(16, 16, 16, 16);
+ _72.x = (_79.x%_95.x);
+ _72.y = (_79.y%_95.y);
+ _72.z = (_79.z%_95.z);
+ _72.w = (_79.w%_95.w);
+ int4 _96;
+ ushort4 _97;
+ ushort4 _98;
+ ushort4 _99;
+ int4 _100 = make_int4(16, 16, 16, 16);
+ int4 _101 = make_int4(0, 0, 0, 0);
+ _99.x = (_100.x>=_101.x);
+ _99.y = (_100.y>=_101.y);
+ _99.z = (_100.z>=_101.z);
+ _99.w = (_100.w>=_101.w);
+ ushort4 _102;
+ int4 _103 = make_int4(0, 0, 0, 0);
+ _102.x = (_72.x>=_103.x);
+ _102.y = (_72.y>=_103.y);
+ _102.z = (_72.z>=_103.z);
+ _102.w = (_72.w>=_103.w);
+ _98.x = (_99.x&&_102.x);
+ _98.y = (_99.y&&_102.y);
+ _98.z = (_99.z&&_102.z);
+ _98.w = (_99.w&&_102.w);
+ ushort4 _104;
+ ushort4 _105;
+ int4 _106 = make_int4(16, 16, 16, 16);
+ int4 _107 = make_int4(0, 0, 0, 0);
+ _105.x = (_106.x<_107.x);
+ _105.y = (_106.y<_107.y);
+ _105.z = (_106.z<_107.z);
+ _105.w = (_106.w<_107.w);
+ ushort4 _108;
+ int4 _109 = make_int4(0, 0, 0, 0);
+ _108.x = (_72.x<=_109.x);
+ _108.y = (_72.y<=_109.y);
+ _108.z = (_72.z<=_109.z);
+ _108.w = (_72.w<=_109.w);
+ _104.x = (_105.x&&_108.x);
+ _104.y = (_105.y&&_108.y);
+ _104.z = (_105.z&&_108.z);
+ _104.w = (_105.w&&_108.w);
+ _97.x = (_98.x||_104.x);
+ _97.y = (_98.y||_104.y);
+ _97.z = (_98.z||_104.z);
+ _97.w = (_98.w||_104.w);
+ int4 _110;
+ int4 _111 = make_int4(16, 16, 16, 16);
+ _110.x = (_72.x+_111.x);
+ _110.y = (_72.y+_111.y);
+ _110.z = (_72.z+_111.z);
+ _110.w = (_72.w+_111.w);
+ _96.x = (bool(_97.x)?_72.x:_110.x);
+ _96.y = (bool(_97.y)?_72.y:_110.y);
+ _96.z = (bool(_97.z)?_72.z:_110.z);
+ _96.w = (bool(_97.w)?_72.w:_110.w);
+ int4 _112 = make_int4(9, 9, 9, 9);
+ _71.x = (_96.x*_112.x);
+ _71.y = (_96.y*_112.y);
+ _71.z = (_96.z*_112.z);
+ _71.w = (_96.w*_112.w);
+ _69.x = (_70.x+_71.x);
+ _69.y = (_70.y+_71.y);
+ _69.z = (_70.z+_71.z);
+ _69.w = (_70.w+_71.w);
+ int4 _113 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _68.x = (_69.x+_113.x);
+ _68.y = (_69.y+_113.y);
+ _68.z = (_69.z+_113.z);
+ _68.w = (_69.w+_113.w);
+ int4 _114;
+ int4 _115 = make_int4(((((int)threadIdx.x) + 32))+(1*0), ((((int)threadIdx.x) + 32))+(1*1), ((((int)threadIdx.x) + 32))+(1*2), ((((int)threadIdx.x) + 32))+(1*3));
+ int4 _116 = make_int4(3, 3, 3, 3);
+ _114.x = (_115.x%_116.x);
+ _114.y = (_115.y%_116.y);
+ _114.z = (_115.z%_116.z);
+ _114.w = (_115.w%_116.w);
+ int4 _117;
+ ushort4 _118;
+ ushort4 _119;
+ ushort4 _120;
+ int4 _121 = make_int4(3, 3, 3, 3);
+ int4 _122 = make_int4(0, 0, 0, 0);
+ _120.x = (_121.x>=_122.x);
+ _120.y = (_121.y>=_122.y);
+ _120.z = (_121.z>=_122.z);
+ _120.w = (_121.w>=_122.w);
+ ushort4 _123;
+ int4 _124 = make_int4(0, 0, 0, 0);
+ _123.x = (_114.x>=_124.x);
+ _123.y = (_114.y>=_124.y);
+ _123.z = (_114.z>=_124.z);
+ _123.w = (_114.w>=_124.w);
+ _119.x = (_120.x&&_123.x);
+ _119.y = (_120.y&&_123.y);
+ _119.z = (_120.z&&_123.z);
+ _119.w = (_120.w&&_123.w);
+ ushort4 _125;
+ ushort4 _126;
+ int4 _127 = make_int4(3, 3, 3, 3);
+ int4 _128 = make_int4(0, 0, 0, 0);
+ _126.x = (_127.x<_128.x);
+ _126.y = (_127.y<_128.y);
+ _126.z = (_127.z<_128.z);
+ _126.w = (_127.w<_128.w);
+ ushort4 _129;
+ int4 _130 = make_int4(0, 0, 0, 0);
+ _129.x = (_114.x<=_130.x);
+ _129.y = (_114.y<=_130.y);
+ _129.z = (_114.z<=_130.z);
+ _129.w = (_114.w<=_130.w);
+ _125.x = (_126.x&&_129.x);
+ _125.y = (_126.y&&_129.y);
+ _125.z = (_126.z&&_129.z);
+ _125.w = (_126.w&&_129.w);
+ _118.x = (_119.x||_125.x);
+ _118.y = (_119.y||_125.y);
+ _118.z = (_119.z||_125.z);
+ _118.w = (_119.w||_125.w);
+ int4 _131;
+ int4 _132 = make_int4(3, 3, 3, 3);
+ _131.x = (_114.x+_132.x);
+ _131.y = (_114.y+_132.y);
+ _131.z = (_114.z+_132.z);
+ _131.w = (_114.w+_132.w);
+ _117.x = (bool(_118.x)?_114.x:_131.x);
+ _117.y = (bool(_118.y)?_114.y:_131.y);
+ _117.z = (bool(_118.z)?_114.z:_131.z);
+ _117.w = (bool(_118.w)?_114.w:_131.w);
+ _67.x = (_68.x+_117.x);
+ _67.y = (_68.y+_117.y);
+ _67.z = (_68.z+_117.z);
+ _67.w = (_68.w+_117.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 128)) = make_float4(kernel[_67.x],kernel[_67.y],kernel[_67.z],kernel[_67.w]);
+ int4 _133;
+ int4 _134;
+ int4 _135;
+ int4 _136 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 256) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 256) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 256) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 256) / 48) * 4608)) + (rc_outer_outer * 144)));
+ int4 _137;
+ int4 _138;
+ int4 _139;
+ int4 _140 = make_int4((((((int)threadIdx.x) * 4) + 256))+(1*0), (((((int)threadIdx.x) * 4) + 256))+(1*1), (((((int)threadIdx.x) * 4) + 256))+(1*2), (((((int)threadIdx.x) * 4) + 256))+(1*3));
+ int4 _141 = make_int4(3, 3, 3, 3);
+ _139.x = (_140.x%_141.x);
+ _139.y = (_140.y%_141.y);
+ _139.z = (_140.z%_141.z);
+ _139.w = (_140.w%_141.w);
+ int4 _142;
+ int4 _143 = make_int4((((((int)threadIdx.x) * 4) + 256))+(1*0), (((((int)threadIdx.x) * 4) + 256))+(1*1), (((((int)threadIdx.x) * 4) + 256))+(1*2), (((((int)threadIdx.x) * 4) + 256))+(1*3));
+ int4 _144 = make_int4(3, 3, 3, 3);
+ _142.x = (_143.x/_144.x);
+ _142.y = (_143.y/_144.y);
+ _142.z = (_143.z/_144.z);
+ _142.w = (_143.w/_144.w);
+ int4 _145;
+ ushort4 _146;
+ ushort4 _147;
+ ushort4 _148;
+ int4 _149 = make_int4(3, 3, 3, 3);
+ int4 _150 = make_int4(0, 0, 0, 0);
+ _148.x = (_149.x>=_150.x);
+ _148.y = (_149.y>=_150.y);
+ _148.z = (_149.z>=_150.z);
+ _148.w = (_149.w>=_150.w);
+ ushort4 _151;
+ int4 _152 = make_int4(0, 0, 0, 0);
+ _151.x = (_139.x>=_152.x);
+ _151.y = (_139.y>=_152.y);
+ _151.z = (_139.z>=_152.z);
+ _151.w = (_139.w>=_152.w);
+ _147.x = (_148.x&&_151.x);
+ _147.y = (_148.y&&_151.y);
+ _147.z = (_148.z&&_151.z);
+ _147.w = (_148.w&&_151.w);
+ ushort4 _153;
+ ushort4 _154;
+ int4 _155 = make_int4(3, 3, 3, 3);
+ int4 _156 = make_int4(0, 0, 0, 0);
+ _154.x = (_155.x<_156.x);
+ _154.y = (_155.y<_156.y);
+ _154.z = (_155.z<_156.z);
+ _154.w = (_155.w<_156.w);
+ ushort4 _157;
+ int4 _158 = make_int4(0, 0, 0, 0);
+ _157.x = (_139.x<=_158.x);
+ _157.y = (_139.y<=_158.y);
+ _157.z = (_139.z<=_158.z);
+ _157.w = (_139.w<=_158.w);
+ _153.x = (_154.x&&_157.x);
+ _153.y = (_154.y&&_157.y);
+ _153.z = (_154.z&&_157.z);
+ _153.w = (_154.w&&_157.w);
+ _146.x = (_147.x||_153.x);
+ _146.y = (_147.y||_153.y);
+ _146.z = (_147.z||_153.z);
+ _146.w = (_147.w||_153.w);
+ int4 _159;
+ int4 _160 = make_int4(1, 1, 1, 1);
+ _159.x = (_142.x-_160.x);
+ _159.y = (_142.y-_160.y);
+ _159.z = (_142.z-_160.z);
+ _159.w = (_142.w-_160.w);
+ _145.x = (bool(_146.x)?_142.x:_159.x);
+ _145.y = (bool(_146.y)?_142.y:_159.y);
+ _145.z = (bool(_146.z)?_142.z:_159.z);
+ _145.w = (bool(_146.w)?_142.w:_159.w);
+ int4 _161 = make_int4(16, 16, 16, 16);
+ _138.x = (_145.x%_161.x);
+ _138.y = (_145.y%_161.y);
+ _138.z = (_145.z%_161.z);
+ _138.w = (_145.w%_161.w);
+ int4 _162;
+ ushort4 _163;
+ ushort4 _164;
+ ushort4 _165;
+ int4 _166 = make_int4(16, 16, 16, 16);
+ int4 _167 = make_int4(0, 0, 0, 0);
+ _165.x = (_166.x>=_167.x);
+ _165.y = (_166.y>=_167.y);
+ _165.z = (_166.z>=_167.z);
+ _165.w = (_166.w>=_167.w);
+ ushort4 _168;
+ int4 _169 = make_int4(0, 0, 0, 0);
+ _168.x = (_138.x>=_169.x);
+ _168.y = (_138.y>=_169.y);
+ _168.z = (_138.z>=_169.z);
+ _168.w = (_138.w>=_169.w);
+ _164.x = (_165.x&&_168.x);
+ _164.y = (_165.y&&_168.y);
+ _164.z = (_165.z&&_168.z);
+ _164.w = (_165.w&&_168.w);
+ ushort4 _170;
+ ushort4 _171;
+ int4 _172 = make_int4(16, 16, 16, 16);
+ int4 _173 = make_int4(0, 0, 0, 0);
+ _171.x = (_172.x<_173.x);
+ _171.y = (_172.y<_173.y);
+ _171.z = (_172.z<_173.z);
+ _171.w = (_172.w<_173.w);
+ ushort4 _174;
+ int4 _175 = make_int4(0, 0, 0, 0);
+ _174.x = (_138.x<=_175.x);
+ _174.y = (_138.y<=_175.y);
+ _174.z = (_138.z<=_175.z);
+ _174.w = (_138.w<=_175.w);
+ _170.x = (_171.x&&_174.x);
+ _170.y = (_171.y&&_174.y);
+ _170.z = (_171.z&&_174.z);
+ _170.w = (_171.w&&_174.w);
+ _163.x = (_164.x||_170.x);
+ _163.y = (_164.y||_170.y);
+ _163.z = (_164.z||_170.z);
+ _163.w = (_164.w||_170.w);
+ int4 _176;
+ int4 _177 = make_int4(16, 16, 16, 16);
+ _176.x = (_138.x+_177.x);
+ _176.y = (_138.y+_177.y);
+ _176.z = (_138.z+_177.z);
+ _176.w = (_138.w+_177.w);
+ _162.x = (bool(_163.x)?_138.x:_176.x);
+ _162.y = (bool(_163.y)?_138.y:_176.y);
+ _162.z = (bool(_163.z)?_138.z:_176.z);
+ _162.w = (bool(_163.w)?_138.w:_176.w);
+ int4 _178 = make_int4(9, 9, 9, 9);
+ _137.x = (_162.x*_178.x);
+ _137.y = (_162.y*_178.y);
+ _137.z = (_162.z*_178.z);
+ _137.w = (_162.w*_178.w);
+ _135.x = (_136.x+_137.x);
+ _135.y = (_136.y+_137.y);
+ _135.z = (_136.z+_137.z);
+ _135.w = (_136.w+_137.w);
+ int4 _179 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _134.x = (_135.x+_179.x);
+ _134.y = (_135.y+_179.y);
+ _134.z = (_135.z+_179.z);
+ _134.w = (_135.w+_179.w);
+ int4 _180;
+ int4 _181 = make_int4(((((int)threadIdx.x) + 64))+(1*0), ((((int)threadIdx.x) + 64))+(1*1), ((((int)threadIdx.x) + 64))+(1*2), ((((int)threadIdx.x) + 64))+(1*3));
+ int4 _182 = make_int4(3, 3, 3, 3);
+ _180.x = (_181.x%_182.x);
+ _180.y = (_181.y%_182.y);
+ _180.z = (_181.z%_182.z);
+ _180.w = (_181.w%_182.w);
+ int4 _183;
+ ushort4 _184;
+ ushort4 _185;
+ ushort4 _186;
+ int4 _187 = make_int4(3, 3, 3, 3);
+ int4 _188 = make_int4(0, 0, 0, 0);
+ _186.x = (_187.x>=_188.x);
+ _186.y = (_187.y>=_188.y);
+ _186.z = (_187.z>=_188.z);
+ _186.w = (_187.w>=_188.w);
+ ushort4 _189;
+ int4 _190 = make_int4(0, 0, 0, 0);
+ _189.x = (_180.x>=_190.x);
+ _189.y = (_180.y>=_190.y);
+ _189.z = (_180.z>=_190.z);
+ _189.w = (_180.w>=_190.w);
+ _185.x = (_186.x&&_189.x);
+ _185.y = (_186.y&&_189.y);
+ _185.z = (_186.z&&_189.z);
+ _185.w = (_186.w&&_189.w);
+ ushort4 _191;
+ ushort4 _192;
+ int4 _193 = make_int4(3, 3, 3, 3);
+ int4 _194 = make_int4(0, 0, 0, 0);
+ _192.x = (_193.x<_194.x);
+ _192.y = (_193.y<_194.y);
+ _192.z = (_193.z<_194.z);
+ _192.w = (_193.w<_194.w);
+ ushort4 _195;
+ int4 _196 = make_int4(0, 0, 0, 0);
+ _195.x = (_180.x<=_196.x);
+ _195.y = (_180.y<=_196.y);
+ _195.z = (_180.z<=_196.z);
+ _195.w = (_180.w<=_196.w);
+ _191.x = (_192.x&&_195.x);
+ _191.y = (_192.y&&_195.y);
+ _191.z = (_192.z&&_195.z);
+ _191.w = (_192.w&&_195.w);
+ _184.x = (_185.x||_191.x);
+ _184.y = (_185.y||_191.y);
+ _184.z = (_185.z||_191.z);
+ _184.w = (_185.w||_191.w);
+ int4 _197;
+ int4 _198 = make_int4(3, 3, 3, 3);
+ _197.x = (_180.x+_198.x);
+ _197.y = (_180.y+_198.y);
+ _197.z = (_180.z+_198.z);
+ _197.w = (_180.w+_198.w);
+ _183.x = (bool(_184.x)?_180.x:_197.x);
+ _183.y = (bool(_184.y)?_180.y:_197.y);
+ _183.z = (bool(_184.z)?_180.z:_197.z);
+ _183.w = (bool(_184.w)?_180.w:_197.w);
+ _133.x = (_134.x+_183.x);
+ _133.y = (_134.y+_183.y);
+ _133.z = (_134.z+_183.z);
+ _133.w = (_134.w+_183.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 256)) = make_float4(kernel[_133.x],kernel[_133.y],kernel[_133.z],kernel[_133.w]);
+ int4 _199;
+ int4 _200;
+ int4 _201;
+ int4 _202 = make_int4((((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 36864), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 36864), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 36864), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 36864));
+ int4 _203;
+ int4 _204;
+ int4 _205;
+ int4 _206 = make_int4((((((int)threadIdx.x) * 4) + 384))+(1*0), (((((int)threadIdx.x) * 4) + 384))+(1*1), (((((int)threadIdx.x) * 4) + 384))+(1*2), (((((int)threadIdx.x) * 4) + 384))+(1*3));
+ int4 _207 = make_int4(3, 3, 3, 3);
+ _205.x = (_206.x%_207.x);
+ _205.y = (_206.y%_207.y);
+ _205.z = (_206.z%_207.z);
+ _205.w = (_206.w%_207.w);
+ int4 _208;
+ int4 _209 = make_int4((((((int)threadIdx.x) * 4) + 384))+(1*0), (((((int)threadIdx.x) * 4) + 384))+(1*1), (((((int)threadIdx.x) * 4) + 384))+(1*2), (((((int)threadIdx.x) * 4) + 384))+(1*3));
+ int4 _210 = make_int4(3, 3, 3, 3);
+ _208.x = (_209.x/_210.x);
+ _208.y = (_209.y/_210.y);
+ _208.z = (_209.z/_210.z);
+ _208.w = (_209.w/_210.w);
+ int4 _211;
+ ushort4 _212;
+ ushort4 _213;
+ ushort4 _214;
+ int4 _215 = make_int4(3, 3, 3, 3);
+ int4 _216 = make_int4(0, 0, 0, 0);
+ _214.x = (_215.x>=_216.x);
+ _214.y = (_215.y>=_216.y);
+ _214.z = (_215.z>=_216.z);
+ _214.w = (_215.w>=_216.w);
+ ushort4 _217;
+ int4 _218 = make_int4(0, 0, 0, 0);
+ _217.x = (_205.x>=_218.x);
+ _217.y = (_205.y>=_218.y);
+ _217.z = (_205.z>=_218.z);
+ _217.w = (_205.w>=_218.w);
+ _213.x = (_214.x&&_217.x);
+ _213.y = (_214.y&&_217.y);
+ _213.z = (_214.z&&_217.z);
+ _213.w = (_214.w&&_217.w);
+ ushort4 _219;
+ ushort4 _220;
+ int4 _221 = make_int4(3, 3, 3, 3);
+ int4 _222 = make_int4(0, 0, 0, 0);
+ _220.x = (_221.x<_222.x);
+ _220.y = (_221.y<_222.y);
+ _220.z = (_221.z<_222.z);
+ _220.w = (_221.w<_222.w);
+ ushort4 _223;
+ int4 _224 = make_int4(0, 0, 0, 0);
+ _223.x = (_205.x<=_224.x);
+ _223.y = (_205.y<=_224.y);
+ _223.z = (_205.z<=_224.z);
+ _223.w = (_205.w<=_224.w);
+ _219.x = (_220.x&&_223.x);
+ _219.y = (_220.y&&_223.y);
+ _219.z = (_220.z&&_223.z);
+ _219.w = (_220.w&&_223.w);
+ _212.x = (_213.x||_219.x);
+ _212.y = (_213.y||_219.y);
+ _212.z = (_213.z||_219.z);
+ _212.w = (_213.w||_219.w);
+ int4 _225;
+ int4 _226 = make_int4(1, 1, 1, 1);
+ _225.x = (_208.x-_226.x);
+ _225.y = (_208.y-_226.y);
+ _225.z = (_208.z-_226.z);
+ _225.w = (_208.w-_226.w);
+ _211.x = (bool(_212.x)?_208.x:_225.x);
+ _211.y = (bool(_212.y)?_208.y:_225.y);
+ _211.z = (bool(_212.z)?_208.z:_225.z);
+ _211.w = (bool(_212.w)?_208.w:_225.w);
+ int4 _227 = make_int4(16, 16, 16, 16);
+ _204.x = (_211.x%_227.x);
+ _204.y = (_211.y%_227.y);
+ _204.z = (_211.z%_227.z);
+ _204.w = (_211.w%_227.w);
+ int4 _228;
+ ushort4 _229;
+ ushort4 _230;
+ ushort4 _231;
+ int4 _232 = make_int4(16, 16, 16, 16);
+ int4 _233 = make_int4(0, 0, 0, 0);
+ _231.x = (_232.x>=_233.x);
+ _231.y = (_232.y>=_233.y);
+ _231.z = (_232.z>=_233.z);
+ _231.w = (_232.w>=_233.w);
+ ushort4 _234;
+ int4 _235 = make_int4(0, 0, 0, 0);
+ _234.x = (_204.x>=_235.x);
+ _234.y = (_204.y>=_235.y);
+ _234.z = (_204.z>=_235.z);
+ _234.w = (_204.w>=_235.w);
+ _230.x = (_231.x&&_234.x);
+ _230.y = (_231.y&&_234.y);
+ _230.z = (_231.z&&_234.z);
+ _230.w = (_231.w&&_234.w);
+ ushort4 _236;
+ ushort4 _237;
+ int4 _238 = make_int4(16, 16, 16, 16);
+ int4 _239 = make_int4(0, 0, 0, 0);
+ _237.x = (_238.x<_239.x);
+ _237.y = (_238.y<_239.y);
+ _237.z = (_238.z<_239.z);
+ _237.w = (_238.w<_239.w);
+ ushort4 _240;
+ int4 _241 = make_int4(0, 0, 0, 0);
+ _240.x = (_204.x<=_241.x);
+ _240.y = (_204.y<=_241.y);
+ _240.z = (_204.z<=_241.z);
+ _240.w = (_204.w<=_241.w);
+ _236.x = (_237.x&&_240.x);
+ _236.y = (_237.y&&_240.y);
+ _236.z = (_237.z&&_240.z);
+ _236.w = (_237.w&&_240.w);
+ _229.x = (_230.x||_236.x);
+ _229.y = (_230.y||_236.y);
+ _229.z = (_230.z||_236.z);
+ _229.w = (_230.w||_236.w);
+ int4 _242;
+ int4 _243 = make_int4(16, 16, 16, 16);
+ _242.x = (_204.x+_243.x);
+ _242.y = (_204.y+_243.y);
+ _242.z = (_204.z+_243.z);
+ _242.w = (_204.w+_243.w);
+ _228.x = (bool(_229.x)?_204.x:_242.x);
+ _228.y = (bool(_229.y)?_204.y:_242.y);
+ _228.z = (bool(_229.z)?_204.z:_242.z);
+ _228.w = (bool(_229.w)?_204.w:_242.w);
+ int4 _244 = make_int4(9, 9, 9, 9);
+ _203.x = (_228.x*_244.x);
+ _203.y = (_228.y*_244.y);
+ _203.z = (_228.z*_244.z);
+ _203.w = (_228.w*_244.w);
+ _201.x = (_202.x+_203.x);
+ _201.y = (_202.y+_203.y);
+ _201.z = (_202.z+_203.z);
+ _201.w = (_202.w+_203.w);
+ int4 _245 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _200.x = (_201.x+_245.x);
+ _200.y = (_201.y+_245.y);
+ _200.z = (_201.z+_245.z);
+ _200.w = (_201.w+_245.w);
+ int4 _246;
+ int4 _247 = make_int4(((((int)threadIdx.x) + 96))+(1*0), ((((int)threadIdx.x) + 96))+(1*1), ((((int)threadIdx.x) + 96))+(1*2), ((((int)threadIdx.x) + 96))+(1*3));
+ int4 _248 = make_int4(3, 3, 3, 3);
+ _246.x = (_247.x%_248.x);
+ _246.y = (_247.y%_248.y);
+ _246.z = (_247.z%_248.z);
+ _246.w = (_247.w%_248.w);
+ int4 _249;
+ ushort4 _250;
+ ushort4 _251;
+ ushort4 _252;
+ int4 _253 = make_int4(3, 3, 3, 3);
+ int4 _254 = make_int4(0, 0, 0, 0);
+ _252.x = (_253.x>=_254.x);
+ _252.y = (_253.y>=_254.y);
+ _252.z = (_253.z>=_254.z);
+ _252.w = (_253.w>=_254.w);
+ ushort4 _255;
+ int4 _256 = make_int4(0, 0, 0, 0);
+ _255.x = (_246.x>=_256.x);
+ _255.y = (_246.y>=_256.y);
+ _255.z = (_246.z>=_256.z);
+ _255.w = (_246.w>=_256.w);
+ _251.x = (_252.x&&_255.x);
+ _251.y = (_252.y&&_255.y);
+ _251.z = (_252.z&&_255.z);
+ _251.w = (_252.w&&_255.w);
+ ushort4 _257;
+ ushort4 _258;
+ int4 _259 = make_int4(3, 3, 3, 3);
+ int4 _260 = make_int4(0, 0, 0, 0);
+ _258.x = (_259.x<_260.x);
+ _258.y = (_259.y<_260.y);
+ _258.z = (_259.z<_260.z);
+ _258.w = (_259.w<_260.w);
+ ushort4 _261;
+ int4 _262 = make_int4(0, 0, 0, 0);
+ _261.x = (_246.x<=_262.x);
+ _261.y = (_246.y<=_262.y);
+ _261.z = (_246.z<=_262.z);
+ _261.w = (_246.w<=_262.w);
+ _257.x = (_258.x&&_261.x);
+ _257.y = (_258.y&&_261.y);
+ _257.z = (_258.z&&_261.z);
+ _257.w = (_258.w&&_261.w);
+ _250.x = (_251.x||_257.x);
+ _250.y = (_251.y||_257.y);
+ _250.z = (_251.z||_257.z);
+ _250.w = (_251.w||_257.w);
+ int4 _263;
+ int4 _264 = make_int4(3, 3, 3, 3);
+ _263.x = (_246.x+_264.x);
+ _263.y = (_246.y+_264.y);
+ _263.z = (_246.z+_264.z);
+ _263.w = (_246.w+_264.w);
+ _249.x = (bool(_250.x)?_246.x:_263.x);
+ _249.y = (bool(_250.y)?_246.y:_263.y);
+ _249.z = (bool(_250.z)?_246.z:_263.z);
+ _249.w = (bool(_250.w)?_246.w:_263.w);
+ _199.x = (_200.x+_249.x);
+ _199.y = (_200.y+_249.y);
+ _199.z = (_200.z+_249.z);
+ _199.w = (_200.w+_249.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 384)) = make_float4(kernel[_199.x],kernel[_199.y],kernel[_199.z],kernel[_199.w]);
+ int4 _265;
+ int4 _266;
+ int4 _267;
+ int4 _268 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 512) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 512) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 512) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 512) / 48) * 4608)) + (rc_outer_outer * 144)));
+ int4 _269;
+ int4 _270;
+ int4 _271;
+ int4 _272 = make_int4((((((int)threadIdx.x) * 4) + 512))+(1*0), (((((int)threadIdx.x) * 4) + 512))+(1*1), (((((int)threadIdx.x) * 4) + 512))+(1*2), (((((int)threadIdx.x) * 4) + 512))+(1*3));
+ int4 _273 = make_int4(3, 3, 3, 3);
+ _271.x = (_272.x%_273.x);
+ _271.y = (_272.y%_273.y);
+ _271.z = (_272.z%_273.z);
+ _271.w = (_272.w%_273.w);
+ int4 _274;
+ int4 _275 = make_int4((((((int)threadIdx.x) * 4) + 512))+(1*0), (((((int)threadIdx.x) * 4) + 512))+(1*1), (((((int)threadIdx.x) * 4) + 512))+(1*2), (((((int)threadIdx.x) * 4) + 512))+(1*3));
+ int4 _276 = make_int4(3, 3, 3, 3);
+ _274.x = (_275.x/_276.x);
+ _274.y = (_275.y/_276.y);
+ _274.z = (_275.z/_276.z);
+ _274.w = (_275.w/_276.w);
+ int4 _277;
+ ushort4 _278;
+ ushort4 _279;
+ ushort4 _280;
+ int4 _281 = make_int4(3, 3, 3, 3);
+ int4 _282 = make_int4(0, 0, 0, 0);
+ _280.x = (_281.x>=_282.x);
+ _280.y = (_281.y>=_282.y);
+ _280.z = (_281.z>=_282.z);
+ _280.w = (_281.w>=_282.w);
+ ushort4 _283;
+ int4 _284 = make_int4(0, 0, 0, 0);
+ _283.x = (_271.x>=_284.x);
+ _283.y = (_271.y>=_284.y);
+ _283.z = (_271.z>=_284.z);
+ _283.w = (_271.w>=_284.w);
+ _279.x = (_280.x&&_283.x);
+ _279.y = (_280.y&&_283.y);
+ _279.z = (_280.z&&_283.z);
+ _279.w = (_280.w&&_283.w);
+ ushort4 _285;
+ ushort4 _286;
+ int4 _287 = make_int4(3, 3, 3, 3);
+ int4 _288 = make_int4(0, 0, 0, 0);
+ _286.x = (_287.x<_288.x);
+ _286.y = (_287.y<_288.y);
+ _286.z = (_287.z<_288.z);
+ _286.w = (_287.w<_288.w);
+ ushort4 _289;
+ int4 _290 = make_int4(0, 0, 0, 0);
+ _289.x = (_271.x<=_290.x);
+ _289.y = (_271.y<=_290.y);
+ _289.z = (_271.z<=_290.z);
+ _289.w = (_271.w<=_290.w);
+ _285.x = (_286.x&&_289.x);
+ _285.y = (_286.y&&_289.y);
+ _285.z = (_286.z&&_289.z);
+ _285.w = (_286.w&&_289.w);
+ _278.x = (_279.x||_285.x);
+ _278.y = (_279.y||_285.y);
+ _278.z = (_279.z||_285.z);
+ _278.w = (_279.w||_285.w);
+ int4 _291;
+ int4 _292 = make_int4(1, 1, 1, 1);
+ _291.x = (_274.x-_292.x);
+ _291.y = (_274.y-_292.y);
+ _291.z = (_274.z-_292.z);
+ _291.w = (_274.w-_292.w);
+ _277.x = (bool(_278.x)?_274.x:_291.x);
+ _277.y = (bool(_278.y)?_274.y:_291.y);
+ _277.z = (bool(_278.z)?_274.z:_291.z);
+ _277.w = (bool(_278.w)?_274.w:_291.w);
+ int4 _293 = make_int4(16, 16, 16, 16);
+ _270.x = (_277.x%_293.x);
+ _270.y = (_277.y%_293.y);
+ _270.z = (_277.z%_293.z);
+ _270.w = (_277.w%_293.w);
+ int4 _294;
+ ushort4 _295;
+ ushort4 _296;
+ ushort4 _297;
+ int4 _298 = make_int4(16, 16, 16, 16);
+ int4 _299 = make_int4(0, 0, 0, 0);
+ _297.x = (_298.x>=_299.x);
+ _297.y = (_298.y>=_299.y);
+ _297.z = (_298.z>=_299.z);
+ _297.w = (_298.w>=_299.w);
+ ushort4 _300;
+ int4 _301 = make_int4(0, 0, 0, 0);
+ _300.x = (_270.x>=_301.x);
+ _300.y = (_270.y>=_301.y);
+ _300.z = (_270.z>=_301.z);
+ _300.w = (_270.w>=_301.w);
+ _296.x = (_297.x&&_300.x);
+ _296.y = (_297.y&&_300.y);
+ _296.z = (_297.z&&_300.z);
+ _296.w = (_297.w&&_300.w);
+ ushort4 _302;
+ ushort4 _303;
+ int4 _304 = make_int4(16, 16, 16, 16);
+ int4 _305 = make_int4(0, 0, 0, 0);
+ _303.x = (_304.x<_305.x);
+ _303.y = (_304.y<_305.y);
+ _303.z = (_304.z<_305.z);
+ _303.w = (_304.w<_305.w);
+ ushort4 _306;
+ int4 _307 = make_int4(0, 0, 0, 0);
+ _306.x = (_270.x<=_307.x);
+ _306.y = (_270.y<=_307.y);
+ _306.z = (_270.z<=_307.z);
+ _306.w = (_270.w<=_307.w);
+ _302.x = (_303.x&&_306.x);
+ _302.y = (_303.y&&_306.y);
+ _302.z = (_303.z&&_306.z);
+ _302.w = (_303.w&&_306.w);
+ _295.x = (_296.x||_302.x);
+ _295.y = (_296.y||_302.y);
+ _295.z = (_296.z||_302.z);
+ _295.w = (_296.w||_302.w);
+ int4 _308;
+ int4 _309 = make_int4(16, 16, 16, 16);
+ _308.x = (_270.x+_309.x);
+ _308.y = (_270.y+_309.y);
+ _308.z = (_270.z+_309.z);
+ _308.w = (_270.w+_309.w);
+ _294.x = (bool(_295.x)?_270.x:_308.x);
+ _294.y = (bool(_295.y)?_270.y:_308.y);
+ _294.z = (bool(_295.z)?_270.z:_308.z);
+ _294.w = (bool(_295.w)?_270.w:_308.w);
+ int4 _310 = make_int4(9, 9, 9, 9);
+ _269.x = (_294.x*_310.x);
+ _269.y = (_294.y*_310.y);
+ _269.z = (_294.z*_310.z);
+ _269.w = (_294.w*_310.w);
+ _267.x = (_268.x+_269.x);
+ _267.y = (_268.y+_269.y);
+ _267.z = (_268.z+_269.z);
+ _267.w = (_268.w+_269.w);
+ int4 _311 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _266.x = (_267.x+_311.x);
+ _266.y = (_267.y+_311.y);
+ _266.z = (_267.z+_311.z);
+ _266.w = (_267.w+_311.w);
+ int4 _312;
+ int4 _313 = make_int4(((((int)threadIdx.x) + 128))+(1*0), ((((int)threadIdx.x) + 128))+(1*1), ((((int)threadIdx.x) + 128))+(1*2), ((((int)threadIdx.x) + 128))+(1*3));
+ int4 _314 = make_int4(3, 3, 3, 3);
+ _312.x = (_313.x%_314.x);
+ _312.y = (_313.y%_314.y);
+ _312.z = (_313.z%_314.z);
+ _312.w = (_313.w%_314.w);
+ int4 _315;
+ ushort4 _316;
+ ushort4 _317;
+ ushort4 _318;
+ int4 _319 = make_int4(3, 3, 3, 3);
+ int4 _320 = make_int4(0, 0, 0, 0);
+ _318.x = (_319.x>=_320.x);
+ _318.y = (_319.y>=_320.y);
+ _318.z = (_319.z>=_320.z);
+ _318.w = (_319.w>=_320.w);
+ ushort4 _321;
+ int4 _322 = make_int4(0, 0, 0, 0);
+ _321.x = (_312.x>=_322.x);
+ _321.y = (_312.y>=_322.y);
+ _321.z = (_312.z>=_322.z);
+ _321.w = (_312.w>=_322.w);
+ _317.x = (_318.x&&_321.x);
+ _317.y = (_318.y&&_321.y);
+ _317.z = (_318.z&&_321.z);
+ _317.w = (_318.w&&_321.w);
+ ushort4 _323;
+ ushort4 _324;
+ int4 _325 = make_int4(3, 3, 3, 3);
+ int4 _326 = make_int4(0, 0, 0, 0);
+ _324.x = (_325.x<_326.x);
+ _324.y = (_325.y<_326.y);
+ _324.z = (_325.z<_326.z);
+ _324.w = (_325.w<_326.w);
+ ushort4 _327;
+ int4 _328 = make_int4(0, 0, 0, 0);
+ _327.x = (_312.x<=_328.x);
+ _327.y = (_312.y<=_328.y);
+ _327.z = (_312.z<=_328.z);
+ _327.w = (_312.w<=_328.w);
+ _323.x = (_324.x&&_327.x);
+ _323.y = (_324.y&&_327.y);
+ _323.z = (_324.z&&_327.z);
+ _323.w = (_324.w&&_327.w);
+ _316.x = (_317.x||_323.x);
+ _316.y = (_317.y||_323.y);
+ _316.z = (_317.z||_323.z);
+ _316.w = (_317.w||_323.w);
+ int4 _329;
+ int4 _330 = make_int4(3, 3, 3, 3);
+ _329.x = (_312.x+_330.x);
+ _329.y = (_312.y+_330.y);
+ _329.z = (_312.z+_330.z);
+ _329.w = (_312.w+_330.w);
+ _315.x = (bool(_316.x)?_312.x:_329.x);
+ _315.y = (bool(_316.y)?_312.y:_329.y);
+ _315.z = (bool(_316.z)?_312.z:_329.z);
+ _315.w = (bool(_316.w)?_312.w:_329.w);
+ _265.x = (_266.x+_315.x);
+ _265.y = (_266.y+_315.y);
+ _265.z = (_266.z+_315.z);
+ _265.w = (_266.w+_315.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 512)) = make_float4(kernel[_265.x],kernel[_265.y],kernel[_265.z],kernel[_265.w]);
+ int4 _331;
+ int4 _332;
+ int4 _333;
+ int4 _334 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 640) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 640) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 640) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 640) / 48) * 4608)) + (rc_outer_outer * 144)));
+ int4 _335;
+ int4 _336;
+ int4 _337;
+ int4 _338 = make_int4((((((int)threadIdx.x) * 4) + 640))+(1*0), (((((int)threadIdx.x) * 4) + 640))+(1*1), (((((int)threadIdx.x) * 4) + 640))+(1*2), (((((int)threadIdx.x) * 4) + 640))+(1*3));
+ int4 _339 = make_int4(3, 3, 3, 3);
+ _337.x = (_338.x%_339.x);
+ _337.y = (_338.y%_339.y);
+ _337.z = (_338.z%_339.z);
+ _337.w = (_338.w%_339.w);
+ int4 _340;
+ int4 _341 = make_int4((((((int)threadIdx.x) * 4) + 640))+(1*0), (((((int)threadIdx.x) * 4) + 640))+(1*1), (((((int)threadIdx.x) * 4) + 640))+(1*2), (((((int)threadIdx.x) * 4) + 640))+(1*3));
+ int4 _342 = make_int4(3, 3, 3, 3);
+ _340.x = (_341.x/_342.x);
+ _340.y = (_341.y/_342.y);
+ _340.z = (_341.z/_342.z);
+ _340.w = (_341.w/_342.w);
+ int4 _343;
+ ushort4 _344;
+ ushort4 _345;
+ ushort4 _346;
+ int4 _347 = make_int4(3, 3, 3, 3);
+ int4 _348 = make_int4(0, 0, 0, 0);
+ _346.x = (_347.x>=_348.x);
+ _346.y = (_347.y>=_348.y);
+ _346.z = (_347.z>=_348.z);
+ _346.w = (_347.w>=_348.w);
+ ushort4 _349;
+ int4 _350 = make_int4(0, 0, 0, 0);
+ _349.x = (_337.x>=_350.x);
+ _349.y = (_337.y>=_350.y);
+ _349.z = (_337.z>=_350.z);
+ _349.w = (_337.w>=_350.w);
+ _345.x = (_346.x&&_349.x);
+ _345.y = (_346.y&&_349.y);
+ _345.z = (_346.z&&_349.z);
+ _345.w = (_346.w&&_349.w);
+ ushort4 _351;
+ ushort4 _352;
+ int4 _353 = make_int4(3, 3, 3, 3);
+ int4 _354 = make_int4(0, 0, 0, 0);
+ _352.x = (_353.x<_354.x);
+ _352.y = (_353.y<_354.y);
+ _352.z = (_353.z<_354.z);
+ _352.w = (_353.w<_354.w);
+ ushort4 _355;
+ int4 _356 = make_int4(0, 0, 0, 0);
+ _355.x = (_337.x<=_356.x);
+ _355.y = (_337.y<=_356.y);
+ _355.z = (_337.z<=_356.z);
+ _355.w = (_337.w<=_356.w);
+ _351.x = (_352.x&&_355.x);
+ _351.y = (_352.y&&_355.y);
+ _351.z = (_352.z&&_355.z);
+ _351.w = (_352.w&&_355.w);
+ _344.x = (_345.x||_351.x);
+ _344.y = (_345.y||_351.y);
+ _344.z = (_345.z||_351.z);
+ _344.w = (_345.w||_351.w);
+ int4 _357;
+ int4 _358 = make_int4(1, 1, 1, 1);
+ _357.x = (_340.x-_358.x);
+ _357.y = (_340.y-_358.y);
+ _357.z = (_340.z-_358.z);
+ _357.w = (_340.w-_358.w);
+ _343.x = (bool(_344.x)?_340.x:_357.x);
+ _343.y = (bool(_344.y)?_340.y:_357.y);
+ _343.z = (bool(_344.z)?_340.z:_357.z);
+ _343.w = (bool(_344.w)?_340.w:_357.w);
+ int4 _359 = make_int4(16, 16, 16, 16);
+ _336.x = (_343.x%_359.x);
+ _336.y = (_343.y%_359.y);
+ _336.z = (_343.z%_359.z);
+ _336.w = (_343.w%_359.w);
+ int4 _360;
+ ushort4 _361;
+ ushort4 _362;
+ ushort4 _363;
+ int4 _364 = make_int4(16, 16, 16, 16);
+ int4 _365 = make_int4(0, 0, 0, 0);
+ _363.x = (_364.x>=_365.x);
+ _363.y = (_364.y>=_365.y);
+ _363.z = (_364.z>=_365.z);
+ _363.w = (_364.w>=_365.w);
+ ushort4 _366;
+ int4 _367 = make_int4(0, 0, 0, 0);
+ _366.x = (_336.x>=_367.x);
+ _366.y = (_336.y>=_367.y);
+ _366.z = (_336.z>=_367.z);
+ _366.w = (_336.w>=_367.w);
+ _362.x = (_363.x&&_366.x);
+ _362.y = (_363.y&&_366.y);
+ _362.z = (_363.z&&_366.z);
+ _362.w = (_363.w&&_366.w);
+ ushort4 _368;
+ ushort4 _369;
+ int4 _370 = make_int4(16, 16, 16, 16);
+ int4 _371 = make_int4(0, 0, 0, 0);
+ _369.x = (_370.x<_371.x);
+ _369.y = (_370.y<_371.y);
+ _369.z = (_370.z<_371.z);
+ _369.w = (_370.w<_371.w);
+ ushort4 _372;
+ int4 _373 = make_int4(0, 0, 0, 0);
+ _372.x = (_336.x<=_373.x);
+ _372.y = (_336.y<=_373.y);
+ _372.z = (_336.z<=_373.z);
+ _372.w = (_336.w<=_373.w);
+ _368.x = (_369.x&&_372.x);
+ _368.y = (_369.y&&_372.y);
+ _368.z = (_369.z&&_372.z);
+ _368.w = (_369.w&&_372.w);
+ _361.x = (_362.x||_368.x);
+ _361.y = (_362.y||_368.y);
+ _361.z = (_362.z||_368.z);
+ _361.w = (_362.w||_368.w);
+ int4 _374;
+ int4 _375 = make_int4(16, 16, 16, 16);
+ _374.x = (_336.x+_375.x);
+ _374.y = (_336.y+_375.y);
+ _374.z = (_336.z+_375.z);
+ _374.w = (_336.w+_375.w);
+ _360.x = (bool(_361.x)?_336.x:_374.x);
+ _360.y = (bool(_361.y)?_336.y:_374.y);
+ _360.z = (bool(_361.z)?_336.z:_374.z);
+ _360.w = (bool(_361.w)?_336.w:_374.w);
+ int4 _376 = make_int4(9, 9, 9, 9);
+ _335.x = (_360.x*_376.x);
+ _335.y = (_360.y*_376.y);
+ _335.z = (_360.z*_376.z);
+ _335.w = (_360.w*_376.w);
+ _333.x = (_334.x+_335.x);
+ _333.y = (_334.y+_335.y);
+ _333.z = (_334.z+_335.z);
+ _333.w = (_334.w+_335.w);
+ int4 _377 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _332.x = (_333.x+_377.x);
+ _332.y = (_333.y+_377.y);
+ _332.z = (_333.z+_377.z);
+ _332.w = (_333.w+_377.w);
+ int4 _378;
+ int4 _379 = make_int4(((((int)threadIdx.x) + 160))+(1*0), ((((int)threadIdx.x) + 160))+(1*1), ((((int)threadIdx.x) + 160))+(1*2), ((((int)threadIdx.x) + 160))+(1*3));
+ int4 _380 = make_int4(3, 3, 3, 3);
+ _378.x = (_379.x%_380.x);
+ _378.y = (_379.y%_380.y);
+ _378.z = (_379.z%_380.z);
+ _378.w = (_379.w%_380.w);
+ int4 _381;
+ ushort4 _382;
+ ushort4 _383;
+ ushort4 _384;
+ int4 _385 = make_int4(3, 3, 3, 3);
+ int4 _386 = make_int4(0, 0, 0, 0);
+ _384.x = (_385.x>=_386.x);
+ _384.y = (_385.y>=_386.y);
+ _384.z = (_385.z>=_386.z);
+ _384.w = (_385.w>=_386.w);
+ ushort4 _387;
+ int4 _388 = make_int4(0, 0, 0, 0);
+ _387.x = (_378.x>=_388.x);
+ _387.y = (_378.y>=_388.y);
+ _387.z = (_378.z>=_388.z);
+ _387.w = (_378.w>=_388.w);
+ _383.x = (_384.x&&_387.x);
+ _383.y = (_384.y&&_387.y);
+ _383.z = (_384.z&&_387.z);
+ _383.w = (_384.w&&_387.w);
+ ushort4 _389;
+ ushort4 _390;
+ int4 _391 = make_int4(3, 3, 3, 3);
+ int4 _392 = make_int4(0, 0, 0, 0);
+ _390.x = (_391.x<_392.x);
+ _390.y = (_391.y<_392.y);
+ _390.z = (_391.z<_392.z);
+ _390.w = (_391.w<_392.w);
+ ushort4 _393;
+ int4 _394 = make_int4(0, 0, 0, 0);
+ _393.x = (_378.x<=_394.x);
+ _393.y = (_378.y<=_394.y);
+ _393.z = (_378.z<=_394.z);
+ _393.w = (_378.w<=_394.w);
+ _389.x = (_390.x&&_393.x);
+ _389.y = (_390.y&&_393.y);
+ _389.z = (_390.z&&_393.z);
+ _389.w = (_390.w&&_393.w);
+ _382.x = (_383.x||_389.x);
+ _382.y = (_383.y||_389.y);
+ _382.z = (_383.z||_389.z);
+ _382.w = (_383.w||_389.w);
+ int4 _395;
+ int4 _396 = make_int4(3, 3, 3, 3);
+ _395.x = (_378.x+_396.x);
+ _395.y = (_378.y+_396.y);
+ _395.z = (_378.z+_396.z);
+ _395.w = (_378.w+_396.w);
+ _381.x = (bool(_382.x)?_378.x:_395.x);
+ _381.y = (bool(_382.y)?_378.y:_395.y);
+ _381.z = (bool(_382.z)?_378.z:_395.z);
+ _381.w = (bool(_382.w)?_378.w:_395.w);
+ _331.x = (_332.x+_381.x);
+ _331.y = (_332.y+_381.y);
+ _331.z = (_332.z+_381.z);
+ _331.w = (_332.w+_381.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 640)) = make_float4(kernel[_331.x],kernel[_331.y],kernel[_331.z],kernel[_331.w]);
+ int4 _397;
+ int4 _398;
+ int4 _399;
+ int4 _400 = make_int4((((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 73728), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 73728), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 73728), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 73728));
+ int4 _401;
+ int4 _402;
+ int4 _403;
+ int4 _404 = make_int4((((((int)threadIdx.x) * 4) + 768))+(1*0), (((((int)threadIdx.x) * 4) + 768))+(1*1), (((((int)threadIdx.x) * 4) + 768))+(1*2), (((((int)threadIdx.x) * 4) + 768))+(1*3));
+ int4 _405 = make_int4(3, 3, 3, 3);
+ _403.x = (_404.x%_405.x);
+ _403.y = (_404.y%_405.y);
+ _403.z = (_404.z%_405.z);
+ _403.w = (_404.w%_405.w);
+ int4 _406;
+ int4 _407 = make_int4((((((int)threadIdx.x) * 4) + 768))+(1*0), (((((int)threadIdx.x) * 4) + 768))+(1*1), (((((int)threadIdx.x) * 4) + 768))+(1*2), (((((int)threadIdx.x) * 4) + 768))+(1*3));
+ int4 _408 = make_int4(3, 3, 3, 3);
+ _406.x = (_407.x/_408.x);
+ _406.y = (_407.y/_408.y);
+ _406.z = (_407.z/_408.z);
+ _406.w = (_407.w/_408.w);
+ int4 _409;
+ ushort4 _410;
+ ushort4 _411;
+ ushort4 _412;
+ int4 _413 = make_int4(3, 3, 3, 3);
+ int4 _414 = make_int4(0, 0, 0, 0);
+ _412.x = (_413.x>=_414.x);
+ _412.y = (_413.y>=_414.y);
+ _412.z = (_413.z>=_414.z);
+ _412.w = (_413.w>=_414.w);
+ ushort4 _415;
+ int4 _416 = make_int4(0, 0, 0, 0);
+ _415.x = (_403.x>=_416.x);
+ _415.y = (_403.y>=_416.y);
+ _415.z = (_403.z>=_416.z);
+ _415.w = (_403.w>=_416.w);
+ _411.x = (_412.x&&_415.x);
+ _411.y = (_412.y&&_415.y);
+ _411.z = (_412.z&&_415.z);
+ _411.w = (_412.w&&_415.w);
+ ushort4 _417;
+ ushort4 _418;
+ int4 _419 = make_int4(3, 3, 3, 3);
+ int4 _420 = make_int4(0, 0, 0, 0);
+ _418.x = (_419.x<_420.x);
+ _418.y = (_419.y<_420.y);
+ _418.z = (_419.z<_420.z);
+ _418.w = (_419.w<_420.w);
+ ushort4 _421;
+ int4 _422 = make_int4(0, 0, 0, 0);
+ _421.x = (_403.x<=_422.x);
+ _421.y = (_403.y<=_422.y);
+ _421.z = (_403.z<=_422.z);
+ _421.w = (_403.w<=_422.w);
+ _417.x = (_418.x&&_421.x);
+ _417.y = (_418.y&&_421.y);
+ _417.z = (_418.z&&_421.z);
+ _417.w = (_418.w&&_421.w);
+ _410.x = (_411.x||_417.x);
+ _410.y = (_411.y||_417.y);
+ _410.z = (_411.z||_417.z);
+ _410.w = (_411.w||_417.w);
+ int4 _423;
+ int4 _424 = make_int4(1, 1, 1, 1);
+ _423.x = (_406.x-_424.x);
+ _423.y = (_406.y-_424.y);
+ _423.z = (_406.z-_424.z);
+ _423.w = (_406.w-_424.w);
+ _409.x = (bool(_410.x)?_406.x:_423.x);
+ _409.y = (bool(_410.y)?_406.y:_423.y);
+ _409.z = (bool(_410.z)?_406.z:_423.z);
+ _409.w = (bool(_410.w)?_406.w:_423.w);
+ int4 _425 = make_int4(16, 16, 16, 16);
+ _402.x = (_409.x%_425.x);
+ _402.y = (_409.y%_425.y);
+ _402.z = (_409.z%_425.z);
+ _402.w = (_409.w%_425.w);
+ int4 _426;
+ ushort4 _427;
+ ushort4 _428;
+ ushort4 _429;
+ int4 _430 = make_int4(16, 16, 16, 16);
+ int4 _431 = make_int4(0, 0, 0, 0);
+ _429.x = (_430.x>=_431.x);
+ _429.y = (_430.y>=_431.y);
+ _429.z = (_430.z>=_431.z);
+ _429.w = (_430.w>=_431.w);
+ ushort4 _432;
+ int4 _433 = make_int4(0, 0, 0, 0);
+ _432.x = (_402.x>=_433.x);
+ _432.y = (_402.y>=_433.y);
+ _432.z = (_402.z>=_433.z);
+ _432.w = (_402.w>=_433.w);
+ _428.x = (_429.x&&_432.x);
+ _428.y = (_429.y&&_432.y);
+ _428.z = (_429.z&&_432.z);
+ _428.w = (_429.w&&_432.w);
+ ushort4 _434;
+ ushort4 _435;
+ int4 _436 = make_int4(16, 16, 16, 16);
+ int4 _437 = make_int4(0, 0, 0, 0);
+ _435.x = (_436.x<_437.x);
+ _435.y = (_436.y<_437.y);
+ _435.z = (_436.z<_437.z);
+ _435.w = (_436.w<_437.w);
+ ushort4 _438;
+ int4 _439 = make_int4(0, 0, 0, 0);
+ _438.x = (_402.x<=_439.x);
+ _438.y = (_402.y<=_439.y);
+ _438.z = (_402.z<=_439.z);
+ _438.w = (_402.w<=_439.w);
+ _434.x = (_435.x&&_438.x);
+ _434.y = (_435.y&&_438.y);
+ _434.z = (_435.z&&_438.z);
+ _434.w = (_435.w&&_438.w);
+ _427.x = (_428.x||_434.x);
+ _427.y = (_428.y||_434.y);
+ _427.z = (_428.z||_434.z);
+ _427.w = (_428.w||_434.w);
+ int4 _440;
+ int4 _441 = make_int4(16, 16, 16, 16);
+ _440.x = (_402.x+_441.x);
+ _440.y = (_402.y+_441.y);
+ _440.z = (_402.z+_441.z);
+ _440.w = (_402.w+_441.w);
+ _426.x = (bool(_427.x)?_402.x:_440.x);
+ _426.y = (bool(_427.y)?_402.y:_440.y);
+ _426.z = (bool(_427.z)?_402.z:_440.z);
+ _426.w = (bool(_427.w)?_402.w:_440.w);
+ int4 _442 = make_int4(9, 9, 9, 9);
+ _401.x = (_426.x*_442.x);
+ _401.y = (_426.y*_442.y);
+ _401.z = (_426.z*_442.z);
+ _401.w = (_426.w*_442.w);
+ _399.x = (_400.x+_401.x);
+ _399.y = (_400.y+_401.y);
+ _399.z = (_400.z+_401.z);
+ _399.w = (_400.w+_401.w);
+ int4 _443 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _398.x = (_399.x+_443.x);
+ _398.y = (_399.y+_443.y);
+ _398.z = (_399.z+_443.z);
+ _398.w = (_399.w+_443.w);
+ int4 _444;
+ int4 _445 = make_int4(((((int)threadIdx.x) + 192))+(1*0), ((((int)threadIdx.x) + 192))+(1*1), ((((int)threadIdx.x) + 192))+(1*2), ((((int)threadIdx.x) + 192))+(1*3));
+ int4 _446 = make_int4(3, 3, 3, 3);
+ _444.x = (_445.x%_446.x);
+ _444.y = (_445.y%_446.y);
+ _444.z = (_445.z%_446.z);
+ _444.w = (_445.w%_446.w);
+ int4 _447;
+ ushort4 _448;
+ ushort4 _449;
+ ushort4 _450;
+ int4 _451 = make_int4(3, 3, 3, 3);
+ int4 _452 = make_int4(0, 0, 0, 0);
+ _450.x = (_451.x>=_452.x);
+ _450.y = (_451.y>=_452.y);
+ _450.z = (_451.z>=_452.z);
+ _450.w = (_451.w>=_452.w);
+ ushort4 _453;
+ int4 _454 = make_int4(0, 0, 0, 0);
+ _453.x = (_444.x>=_454.x);
+ _453.y = (_444.y>=_454.y);
+ _453.z = (_444.z>=_454.z);
+ _453.w = (_444.w>=_454.w);
+ _449.x = (_450.x&&_453.x);
+ _449.y = (_450.y&&_453.y);
+ _449.z = (_450.z&&_453.z);
+ _449.w = (_450.w&&_453.w);
+ ushort4 _455;
+ ushort4 _456;
+ int4 _457 = make_int4(3, 3, 3, 3);
+ int4 _458 = make_int4(0, 0, 0, 0);
+ _456.x = (_457.x<_458.x);
+ _456.y = (_457.y<_458.y);
+ _456.z = (_457.z<_458.z);
+ _456.w = (_457.w<_458.w);
+ ushort4 _459;
+ int4 _460 = make_int4(0, 0, 0, 0);
+ _459.x = (_444.x<=_460.x);
+ _459.y = (_444.y<=_460.y);
+ _459.z = (_444.z<=_460.z);
+ _459.w = (_444.w<=_460.w);
+ _455.x = (_456.x&&_459.x);
+ _455.y = (_456.y&&_459.y);
+ _455.z = (_456.z&&_459.z);
+ _455.w = (_456.w&&_459.w);
+ _448.x = (_449.x||_455.x);
+ _448.y = (_449.y||_455.y);
+ _448.z = (_449.z||_455.z);
+ _448.w = (_449.w||_455.w);
+ int4 _461;
+ int4 _462 = make_int4(3, 3, 3, 3);
+ _461.x = (_444.x+_462.x);
+ _461.y = (_444.y+_462.y);
+ _461.z = (_444.z+_462.z);
+ _461.w = (_444.w+_462.w);
+ _447.x = (bool(_448.x)?_444.x:_461.x);
+ _447.y = (bool(_448.y)?_444.y:_461.y);
+ _447.z = (bool(_448.z)?_444.z:_461.z);
+ _447.w = (bool(_448.w)?_444.w:_461.w);
+ _397.x = (_398.x+_447.x);
+ _397.y = (_398.y+_447.y);
+ _397.z = (_398.z+_447.z);
+ _397.w = (_398.w+_447.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 768)) = make_float4(kernel[_397.x],kernel[_397.y],kernel[_397.z],kernel[_397.w]);
+ int4 _463;
+ int4 _464;
+ int4 _465;
+ int4 _466 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 896) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 896) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 896) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 896) / 48) * 4608)) + (rc_outer_outer * 144)));
+ int4 _467;
+ int4 _468;
+ int4 _469;
+ int4 _470 = make_int4((((((int)threadIdx.x) * 4) + 896))+(1*0), (((((int)threadIdx.x) * 4) + 896))+(1*1), (((((int)threadIdx.x) * 4) + 896))+(1*2), (((((int)threadIdx.x) * 4) + 896))+(1*3));
+ int4 _471 = make_int4(3, 3, 3, 3);
+ _469.x = (_470.x%_471.x);
+ _469.y = (_470.y%_471.y);
+ _469.z = (_470.z%_471.z);
+ _469.w = (_470.w%_471.w);
+ int4 _472;
+ int4 _473 = make_int4((((((int)threadIdx.x) * 4) + 896))+(1*0), (((((int)threadIdx.x) * 4) + 896))+(1*1), (((((int)threadIdx.x) * 4) + 896))+(1*2), (((((int)threadIdx.x) * 4) + 896))+(1*3));
+ int4 _474 = make_int4(3, 3, 3, 3);
+ _472.x = (_473.x/_474.x);
+ _472.y = (_473.y/_474.y);
+ _472.z = (_473.z/_474.z);
+ _472.w = (_473.w/_474.w);
+ int4 _475;
+ ushort4 _476;
+ ushort4 _477;
+ ushort4 _478;
+ int4 _479 = make_int4(3, 3, 3, 3);
+ int4 _480 = make_int4(0, 0, 0, 0);
+ _478.x = (_479.x>=_480.x);
+ _478.y = (_479.y>=_480.y);
+ _478.z = (_479.z>=_480.z);
+ _478.w = (_479.w>=_480.w);
+ ushort4 _481;
+ int4 _482 = make_int4(0, 0, 0, 0);
+ _481.x = (_469.x>=_482.x);
+ _481.y = (_469.y>=_482.y);
+ _481.z = (_469.z>=_482.z);
+ _481.w = (_469.w>=_482.w);
+ _477.x = (_478.x&&_481.x);
+ _477.y = (_478.y&&_481.y);
+ _477.z = (_478.z&&_481.z);
+ _477.w = (_478.w&&_481.w);
+ ushort4 _483;
+ ushort4 _484;
+ int4 _485 = make_int4(3, 3, 3, 3);
+ int4 _486 = make_int4(0, 0, 0, 0);
+ _484.x = (_485.x<_486.x);
+ _484.y = (_485.y<_486.y);
+ _484.z = (_485.z<_486.z);
+ _484.w = (_485.w<_486.w);
+ ushort4 _487;
+ int4 _488 = make_int4(0, 0, 0, 0);
+ _487.x = (_469.x<=_488.x);
+ _487.y = (_469.y<=_488.y);
+ _487.z = (_469.z<=_488.z);
+ _487.w = (_469.w<=_488.w);
+ _483.x = (_484.x&&_487.x);
+ _483.y = (_484.y&&_487.y);
+ _483.z = (_484.z&&_487.z);
+ _483.w = (_484.w&&_487.w);
+ _476.x = (_477.x||_483.x);
+ _476.y = (_477.y||_483.y);
+ _476.z = (_477.z||_483.z);
+ _476.w = (_477.w||_483.w);
+ int4 _489;
+ int4 _490 = make_int4(1, 1, 1, 1);
+ _489.x = (_472.x-_490.x);
+ _489.y = (_472.y-_490.y);
+ _489.z = (_472.z-_490.z);
+ _489.w = (_472.w-_490.w);
+ _475.x = (bool(_476.x)?_472.x:_489.x);
+ _475.y = (bool(_476.y)?_472.y:_489.y);
+ _475.z = (bool(_476.z)?_472.z:_489.z);
+ _475.w = (bool(_476.w)?_472.w:_489.w);
+ int4 _491 = make_int4(16, 16, 16, 16);
+ _468.x = (_475.x%_491.x);
+ _468.y = (_475.y%_491.y);
+ _468.z = (_475.z%_491.z);
+ _468.w = (_475.w%_491.w);
+ int4 _492;
+ ushort4 _493;
+ ushort4 _494;
+ ushort4 _495;
+ int4 _496 = make_int4(16, 16, 16, 16);
+ int4 _497 = make_int4(0, 0, 0, 0);
+ _495.x = (_496.x>=_497.x);
+ _495.y = (_496.y>=_497.y);
+ _495.z = (_496.z>=_497.z);
+ _495.w = (_496.w>=_497.w);
+ ushort4 _498;
+ int4 _499 = make_int4(0, 0, 0, 0);
+ _498.x = (_468.x>=_499.x);
+ _498.y = (_468.y>=_499.y);
+ _498.z = (_468.z>=_499.z);
+ _498.w = (_468.w>=_499.w);
+ _494.x = (_495.x&&_498.x);
+ _494.y = (_495.y&&_498.y);
+ _494.z = (_495.z&&_498.z);
+ _494.w = (_495.w&&_498.w);
+ ushort4 _500;
+ ushort4 _501;
+ int4 _502 = make_int4(16, 16, 16, 16);
+ int4 _503 = make_int4(0, 0, 0, 0);
+ _501.x = (_502.x<_503.x);
+ _501.y = (_502.y<_503.y);
+ _501.z = (_502.z<_503.z);
+ _501.w = (_502.w<_503.w);
+ ushort4 _504;
+ int4 _505 = make_int4(0, 0, 0, 0);
+ _504.x = (_468.x<=_505.x);
+ _504.y = (_468.y<=_505.y);
+ _504.z = (_468.z<=_505.z);
+ _504.w = (_468.w<=_505.w);
+ _500.x = (_501.x&&_504.x);
+ _500.y = (_501.y&&_504.y);
+ _500.z = (_501.z&&_504.z);
+ _500.w = (_501.w&&_504.w);
+ _493.x = (_494.x||_500.x);
+ _493.y = (_494.y||_500.y);
+ _493.z = (_494.z||_500.z);
+ _493.w = (_494.w||_500.w);
+ int4 _506;
+ int4 _507 = make_int4(16, 16, 16, 16);
+ _506.x = (_468.x+_507.x);
+ _506.y = (_468.y+_507.y);
+ _506.z = (_468.z+_507.z);
+ _506.w = (_468.w+_507.w);
+ _492.x = (bool(_493.x)?_468.x:_506.x);
+ _492.y = (bool(_493.y)?_468.y:_506.y);
+ _492.z = (bool(_493.z)?_468.z:_506.z);
+ _492.w = (bool(_493.w)?_468.w:_506.w);
+ int4 _508 = make_int4(9, 9, 9, 9);
+ _467.x = (_492.x*_508.x);
+ _467.y = (_492.y*_508.y);
+ _467.z = (_492.z*_508.z);
+ _467.w = (_492.w*_508.w);
+ _465.x = (_466.x+_467.x);
+ _465.y = (_466.y+_467.y);
+ _465.z = (_466.z+_467.z);
+ _465.w = (_466.w+_467.w);
+ int4 _509 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _464.x = (_465.x+_509.x);
+ _464.y = (_465.y+_509.y);
+ _464.z = (_465.z+_509.z);
+ _464.w = (_465.w+_509.w);
+ int4 _510;
+ int4 _511 = make_int4(((((int)threadIdx.x) + 224))+(1*0), ((((int)threadIdx.x) + 224))+(1*1), ((((int)threadIdx.x) + 224))+(1*2), ((((int)threadIdx.x) + 224))+(1*3));
+ int4 _512 = make_int4(3, 3, 3, 3);
+ _510.x = (_511.x%_512.x);
+ _510.y = (_511.y%_512.y);
+ _510.z = (_511.z%_512.z);
+ _510.w = (_511.w%_512.w);
+ int4 _513;
+ ushort4 _514;
+ ushort4 _515;
+ ushort4 _516;
+ int4 _517 = make_int4(3, 3, 3, 3);
+ int4 _518 = make_int4(0, 0, 0, 0);
+ _516.x = (_517.x>=_518.x);
+ _516.y = (_517.y>=_518.y);
+ _516.z = (_517.z>=_518.z);
+ _516.w = (_517.w>=_518.w);
+ ushort4 _519;
+ int4 _520 = make_int4(0, 0, 0, 0);
+ _519.x = (_510.x>=_520.x);
+ _519.y = (_510.y>=_520.y);
+ _519.z = (_510.z>=_520.z);
+ _519.w = (_510.w>=_520.w);
+ _515.x = (_516.x&&_519.x);
+ _515.y = (_516.y&&_519.y);
+ _515.z = (_516.z&&_519.z);
+ _515.w = (_516.w&&_519.w);
+ ushort4 _521;
+ ushort4 _522;
+ int4 _523 = make_int4(3, 3, 3, 3);
+ int4 _524 = make_int4(0, 0, 0, 0);
+ _522.x = (_523.x<_524.x);
+ _522.y = (_523.y<_524.y);
+ _522.z = (_523.z<_524.z);
+ _522.w = (_523.w<_524.w);
+ ushort4 _525;
+ int4 _526 = make_int4(0, 0, 0, 0);
+ _525.x = (_510.x<=_526.x);
+ _525.y = (_510.y<=_526.y);
+ _525.z = (_510.z<=_526.z);
+ _525.w = (_510.w<=_526.w);
+ _521.x = (_522.x&&_525.x);
+ _521.y = (_522.y&&_525.y);
+ _521.z = (_522.z&&_525.z);
+ _521.w = (_522.w&&_525.w);
+ _514.x = (_515.x||_521.x);
+ _514.y = (_515.y||_521.y);
+ _514.z = (_515.z||_521.z);
+ _514.w = (_515.w||_521.w);
+ int4 _527;
+ int4 _528 = make_int4(3, 3, 3, 3);
+ _527.x = (_510.x+_528.x);
+ _527.y = (_510.y+_528.y);
+ _527.z = (_510.z+_528.z);
+ _527.w = (_510.w+_528.w);
+ _513.x = (bool(_514.x)?_510.x:_527.x);
+ _513.y = (bool(_514.y)?_510.y:_527.y);
+ _513.z = (bool(_514.z)?_510.z:_527.z);
+ _513.w = (bool(_514.w)?_510.w:_527.w);
+ _463.x = (_464.x+_513.x);
+ _463.y = (_464.y+_513.y);
+ _463.z = (_464.z+_513.z);
+ _463.w = (_464.w+_513.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 896)) = make_float4(kernel[_463.x],kernel[_463.y],kernel[_463.z],kernel[_463.w]);
+ int4 _529;
+ int4 _530;
+ int4 _531;
+ int4 _532 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1024) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1024) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1024) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1024) / 48) * 4608)) + (rc_outer_outer [...]
+ int4 _533;
+ int4 _534;
+ int4 _535;
+ int4 _536 = make_int4((((((int)threadIdx.x) * 4) + 1024))+(1*0), (((((int)threadIdx.x) * 4) + 1024))+(1*1), (((((int)threadIdx.x) * 4) + 1024))+(1*2), (((((int)threadIdx.x) * 4) + 1024))+(1*3));
+ int4 _537 = make_int4(3, 3, 3, 3);
+ _535.x = (_536.x%_537.x);
+ _535.y = (_536.y%_537.y);
+ _535.z = (_536.z%_537.z);
+ _535.w = (_536.w%_537.w);
+ int4 _538;
+ int4 _539 = make_int4((((((int)threadIdx.x) * 4) + 1024))+(1*0), (((((int)threadIdx.x) * 4) + 1024))+(1*1), (((((int)threadIdx.x) * 4) + 1024))+(1*2), (((((int)threadIdx.x) * 4) + 1024))+(1*3));
+ int4 _540 = make_int4(3, 3, 3, 3);
+ _538.x = (_539.x/_540.x);
+ _538.y = (_539.y/_540.y);
+ _538.z = (_539.z/_540.z);
+ _538.w = (_539.w/_540.w);
+ int4 _541;
+ ushort4 _542;
+ ushort4 _543;
+ ushort4 _544;
+ int4 _545 = make_int4(3, 3, 3, 3);
+ int4 _546 = make_int4(0, 0, 0, 0);
+ _544.x = (_545.x>=_546.x);
+ _544.y = (_545.y>=_546.y);
+ _544.z = (_545.z>=_546.z);
+ _544.w = (_545.w>=_546.w);
+ ushort4 _547;
+ int4 _548 = make_int4(0, 0, 0, 0);
+ _547.x = (_535.x>=_548.x);
+ _547.y = (_535.y>=_548.y);
+ _547.z = (_535.z>=_548.z);
+ _547.w = (_535.w>=_548.w);
+ _543.x = (_544.x&&_547.x);
+ _543.y = (_544.y&&_547.y);
+ _543.z = (_544.z&&_547.z);
+ _543.w = (_544.w&&_547.w);
+ ushort4 _549;
+ ushort4 _550;
+ int4 _551 = make_int4(3, 3, 3, 3);
+ int4 _552 = make_int4(0, 0, 0, 0);
+ _550.x = (_551.x<_552.x);
+ _550.y = (_551.y<_552.y);
+ _550.z = (_551.z<_552.z);
+ _550.w = (_551.w<_552.w);
+ ushort4 _553;
+ int4 _554 = make_int4(0, 0, 0, 0);
+ _553.x = (_535.x<=_554.x);
+ _553.y = (_535.y<=_554.y);
+ _553.z = (_535.z<=_554.z);
+ _553.w = (_535.w<=_554.w);
+ _549.x = (_550.x&&_553.x);
+ _549.y = (_550.y&&_553.y);
+ _549.z = (_550.z&&_553.z);
+ _549.w = (_550.w&&_553.w);
+ _542.x = (_543.x||_549.x);
+ _542.y = (_543.y||_549.y);
+ _542.z = (_543.z||_549.z);
+ _542.w = (_543.w||_549.w);
+ int4 _555;
+ int4 _556 = make_int4(1, 1, 1, 1);
+ _555.x = (_538.x-_556.x);
+ _555.y = (_538.y-_556.y);
+ _555.z = (_538.z-_556.z);
+ _555.w = (_538.w-_556.w);
+ _541.x = (bool(_542.x)?_538.x:_555.x);
+ _541.y = (bool(_542.y)?_538.y:_555.y);
+ _541.z = (bool(_542.z)?_538.z:_555.z);
+ _541.w = (bool(_542.w)?_538.w:_555.w);
+ int4 _557 = make_int4(16, 16, 16, 16);
+ _534.x = (_541.x%_557.x);
+ _534.y = (_541.y%_557.y);
+ _534.z = (_541.z%_557.z);
+ _534.w = (_541.w%_557.w);
+ int4 _558;
+ ushort4 _559;
+ ushort4 _560;
+ ushort4 _561;
+ int4 _562 = make_int4(16, 16, 16, 16);
+ int4 _563 = make_int4(0, 0, 0, 0);
+ _561.x = (_562.x>=_563.x);
+ _561.y = (_562.y>=_563.y);
+ _561.z = (_562.z>=_563.z);
+ _561.w = (_562.w>=_563.w);
+ ushort4 _564;
+ int4 _565 = make_int4(0, 0, 0, 0);
+ _564.x = (_534.x>=_565.x);
+ _564.y = (_534.y>=_565.y);
+ _564.z = (_534.z>=_565.z);
+ _564.w = (_534.w>=_565.w);
+ _560.x = (_561.x&&_564.x);
+ _560.y = (_561.y&&_564.y);
+ _560.z = (_561.z&&_564.z);
+ _560.w = (_561.w&&_564.w);
+ ushort4 _566;
+ ushort4 _567;
+ int4 _568 = make_int4(16, 16, 16, 16);
+ int4 _569 = make_int4(0, 0, 0, 0);
+ _567.x = (_568.x<_569.x);
+ _567.y = (_568.y<_569.y);
+ _567.z = (_568.z<_569.z);
+ _567.w = (_568.w<_569.w);
+ ushort4 _570;
+ int4 _571 = make_int4(0, 0, 0, 0);
+ _570.x = (_534.x<=_571.x);
+ _570.y = (_534.y<=_571.y);
+ _570.z = (_534.z<=_571.z);
+ _570.w = (_534.w<=_571.w);
+ _566.x = (_567.x&&_570.x);
+ _566.y = (_567.y&&_570.y);
+ _566.z = (_567.z&&_570.z);
+ _566.w = (_567.w&&_570.w);
+ _559.x = (_560.x||_566.x);
+ _559.y = (_560.y||_566.y);
+ _559.z = (_560.z||_566.z);
+ _559.w = (_560.w||_566.w);
+ int4 _572;
+ int4 _573 = make_int4(16, 16, 16, 16);
+ _572.x = (_534.x+_573.x);
+ _572.y = (_534.y+_573.y);
+ _572.z = (_534.z+_573.z);
+ _572.w = (_534.w+_573.w);
+ _558.x = (bool(_559.x)?_534.x:_572.x);
+ _558.y = (bool(_559.y)?_534.y:_572.y);
+ _558.z = (bool(_559.z)?_534.z:_572.z);
+ _558.w = (bool(_559.w)?_534.w:_572.w);
+ int4 _574 = make_int4(9, 9, 9, 9);
+ _533.x = (_558.x*_574.x);
+ _533.y = (_558.y*_574.y);
+ _533.z = (_558.z*_574.z);
+ _533.w = (_558.w*_574.w);
+ _531.x = (_532.x+_533.x);
+ _531.y = (_532.y+_533.y);
+ _531.z = (_532.z+_533.z);
+ _531.w = (_532.w+_533.w);
+ int4 _575 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _530.x = (_531.x+_575.x);
+ _530.y = (_531.y+_575.y);
+ _530.z = (_531.z+_575.z);
+ _530.w = (_531.w+_575.w);
+ int4 _576;
+ int4 _577 = make_int4(((((int)threadIdx.x) + 256))+(1*0), ((((int)threadIdx.x) + 256))+(1*1), ((((int)threadIdx.x) + 256))+(1*2), ((((int)threadIdx.x) + 256))+(1*3));
+ int4 _578 = make_int4(3, 3, 3, 3);
+ _576.x = (_577.x%_578.x);
+ _576.y = (_577.y%_578.y);
+ _576.z = (_577.z%_578.z);
+ _576.w = (_577.w%_578.w);
+ int4 _579;
+ ushort4 _580;
+ ushort4 _581;
+ ushort4 _582;
+ int4 _583 = make_int4(3, 3, 3, 3);
+ int4 _584 = make_int4(0, 0, 0, 0);
+ _582.x = (_583.x>=_584.x);
+ _582.y = (_583.y>=_584.y);
+ _582.z = (_583.z>=_584.z);
+ _582.w = (_583.w>=_584.w);
+ ushort4 _585;
+ int4 _586 = make_int4(0, 0, 0, 0);
+ _585.x = (_576.x>=_586.x);
+ _585.y = (_576.y>=_586.y);
+ _585.z = (_576.z>=_586.z);
+ _585.w = (_576.w>=_586.w);
+ _581.x = (_582.x&&_585.x);
+ _581.y = (_582.y&&_585.y);
+ _581.z = (_582.z&&_585.z);
+ _581.w = (_582.w&&_585.w);
+ ushort4 _587;
+ ushort4 _588;
+ int4 _589 = make_int4(3, 3, 3, 3);
+ int4 _590 = make_int4(0, 0, 0, 0);
+ _588.x = (_589.x<_590.x);
+ _588.y = (_589.y<_590.y);
+ _588.z = (_589.z<_590.z);
+ _588.w = (_589.w<_590.w);
+ ushort4 _591;
+ int4 _592 = make_int4(0, 0, 0, 0);
+ _591.x = (_576.x<=_592.x);
+ _591.y = (_576.y<=_592.y);
+ _591.z = (_576.z<=_592.z);
+ _591.w = (_576.w<=_592.w);
+ _587.x = (_588.x&&_591.x);
+ _587.y = (_588.y&&_591.y);
+ _587.z = (_588.z&&_591.z);
+ _587.w = (_588.w&&_591.w);
+ _580.x = (_581.x||_587.x);
+ _580.y = (_581.y||_587.y);
+ _580.z = (_581.z||_587.z);
+ _580.w = (_581.w||_587.w);
+ int4 _593;
+ int4 _594 = make_int4(3, 3, 3, 3);
+ _593.x = (_576.x+_594.x);
+ _593.y = (_576.y+_594.y);
+ _593.z = (_576.z+_594.z);
+ _593.w = (_576.w+_594.w);
+ _579.x = (bool(_580.x)?_576.x:_593.x);
+ _579.y = (bool(_580.y)?_576.y:_593.y);
+ _579.z = (bool(_580.z)?_576.z:_593.z);
+ _579.w = (bool(_580.w)?_576.w:_593.w);
+ _529.x = (_530.x+_579.x);
+ _529.y = (_530.y+_579.y);
+ _529.z = (_530.z+_579.z);
+ _529.w = (_530.w+_579.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1024)) = make_float4(kernel[_529.x],kernel[_529.y],kernel[_529.z],kernel[_529.w]);
+ int4 _595;
+ int4 _596;
+ int4 _597;
+ int4 _598 = make_int4((((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 110592), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 110592), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 110592), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 110592));
+ int4 _599;
+ int4 _600;
+ int4 _601;
+ int4 _602 = make_int4((((((int)threadIdx.x) * 4) + 1152))+(1*0), (((((int)threadIdx.x) * 4) + 1152))+(1*1), (((((int)threadIdx.x) * 4) + 1152))+(1*2), (((((int)threadIdx.x) * 4) + 1152))+(1*3));
+ int4 _603 = make_int4(3, 3, 3, 3);
+ _601.x = (_602.x%_603.x);
+ _601.y = (_602.y%_603.y);
+ _601.z = (_602.z%_603.z);
+ _601.w = (_602.w%_603.w);
+ int4 _604;
+ int4 _605 = make_int4((((((int)threadIdx.x) * 4) + 1152))+(1*0), (((((int)threadIdx.x) * 4) + 1152))+(1*1), (((((int)threadIdx.x) * 4) + 1152))+(1*2), (((((int)threadIdx.x) * 4) + 1152))+(1*3));
+ int4 _606 = make_int4(3, 3, 3, 3);
+ _604.x = (_605.x/_606.x);
+ _604.y = (_605.y/_606.y);
+ _604.z = (_605.z/_606.z);
+ _604.w = (_605.w/_606.w);
+ int4 _607;
+ ushort4 _608;
+ ushort4 _609;
+ ushort4 _610;
+ int4 _611 = make_int4(3, 3, 3, 3);
+ int4 _612 = make_int4(0, 0, 0, 0);
+ _610.x = (_611.x>=_612.x);
+ _610.y = (_611.y>=_612.y);
+ _610.z = (_611.z>=_612.z);
+ _610.w = (_611.w>=_612.w);
+ ushort4 _613;
+ int4 _614 = make_int4(0, 0, 0, 0);
+ _613.x = (_601.x>=_614.x);
+ _613.y = (_601.y>=_614.y);
+ _613.z = (_601.z>=_614.z);
+ _613.w = (_601.w>=_614.w);
+ _609.x = (_610.x&&_613.x);
+ _609.y = (_610.y&&_613.y);
+ _609.z = (_610.z&&_613.z);
+ _609.w = (_610.w&&_613.w);
+ ushort4 _615;
+ ushort4 _616;
+ int4 _617 = make_int4(3, 3, 3, 3);
+ int4 _618 = make_int4(0, 0, 0, 0);
+ _616.x = (_617.x<_618.x);
+ _616.y = (_617.y<_618.y);
+ _616.z = (_617.z<_618.z);
+ _616.w = (_617.w<_618.w);
+ ushort4 _619;
+ int4 _620 = make_int4(0, 0, 0, 0);
+ _619.x = (_601.x<=_620.x);
+ _619.y = (_601.y<=_620.y);
+ _619.z = (_601.z<=_620.z);
+ _619.w = (_601.w<=_620.w);
+ _615.x = (_616.x&&_619.x);
+ _615.y = (_616.y&&_619.y);
+ _615.z = (_616.z&&_619.z);
+ _615.w = (_616.w&&_619.w);
+ _608.x = (_609.x||_615.x);
+ _608.y = (_609.y||_615.y);
+ _608.z = (_609.z||_615.z);
+ _608.w = (_609.w||_615.w);
+ int4 _621;
+ int4 _622 = make_int4(1, 1, 1, 1);
+ _621.x = (_604.x-_622.x);
+ _621.y = (_604.y-_622.y);
+ _621.z = (_604.z-_622.z);
+ _621.w = (_604.w-_622.w);
+ _607.x = (bool(_608.x)?_604.x:_621.x);
+ _607.y = (bool(_608.y)?_604.y:_621.y);
+ _607.z = (bool(_608.z)?_604.z:_621.z);
+ _607.w = (bool(_608.w)?_604.w:_621.w);
+ int4 _623 = make_int4(16, 16, 16, 16);
+ _600.x = (_607.x%_623.x);
+ _600.y = (_607.y%_623.y);
+ _600.z = (_607.z%_623.z);
+ _600.w = (_607.w%_623.w);
+ int4 _624;
+ ushort4 _625;
+ ushort4 _626;
+ ushort4 _627;
+ int4 _628 = make_int4(16, 16, 16, 16);
+ int4 _629 = make_int4(0, 0, 0, 0);
+ _627.x = (_628.x>=_629.x);
+ _627.y = (_628.y>=_629.y);
+ _627.z = (_628.z>=_629.z);
+ _627.w = (_628.w>=_629.w);
+ ushort4 _630;
+ int4 _631 = make_int4(0, 0, 0, 0);
+ _630.x = (_600.x>=_631.x);
+ _630.y = (_600.y>=_631.y);
+ _630.z = (_600.z>=_631.z);
+ _630.w = (_600.w>=_631.w);
+ _626.x = (_627.x&&_630.x);
+ _626.y = (_627.y&&_630.y);
+ _626.z = (_627.z&&_630.z);
+ _626.w = (_627.w&&_630.w);
+ ushort4 _632;
+ ushort4 _633;
+ int4 _634 = make_int4(16, 16, 16, 16);
+ int4 _635 = make_int4(0, 0, 0, 0);
+ _633.x = (_634.x<_635.x);
+ _633.y = (_634.y<_635.y);
+ _633.z = (_634.z<_635.z);
+ _633.w = (_634.w<_635.w);
+ ushort4 _636;
+ int4 _637 = make_int4(0, 0, 0, 0);
+ _636.x = (_600.x<=_637.x);
+ _636.y = (_600.y<=_637.y);
+ _636.z = (_600.z<=_637.z);
+ _636.w = (_600.w<=_637.w);
+ _632.x = (_633.x&&_636.x);
+ _632.y = (_633.y&&_636.y);
+ _632.z = (_633.z&&_636.z);
+ _632.w = (_633.w&&_636.w);
+ _625.x = (_626.x||_632.x);
+ _625.y = (_626.y||_632.y);
+ _625.z = (_626.z||_632.z);
+ _625.w = (_626.w||_632.w);
+ int4 _638;
+ int4 _639 = make_int4(16, 16, 16, 16);
+ _638.x = (_600.x+_639.x);
+ _638.y = (_600.y+_639.y);
+ _638.z = (_600.z+_639.z);
+ _638.w = (_600.w+_639.w);
+ _624.x = (bool(_625.x)?_600.x:_638.x);
+ _624.y = (bool(_625.y)?_600.y:_638.y);
+ _624.z = (bool(_625.z)?_600.z:_638.z);
+ _624.w = (bool(_625.w)?_600.w:_638.w);
+ int4 _640 = make_int4(9, 9, 9, 9);
+ _599.x = (_624.x*_640.x);
+ _599.y = (_624.y*_640.y);
+ _599.z = (_624.z*_640.z);
+ _599.w = (_624.w*_640.w);
+ _597.x = (_598.x+_599.x);
+ _597.y = (_598.y+_599.y);
+ _597.z = (_598.z+_599.z);
+ _597.w = (_598.w+_599.w);
+ int4 _641 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _596.x = (_597.x+_641.x);
+ _596.y = (_597.y+_641.y);
+ _596.z = (_597.z+_641.z);
+ _596.w = (_597.w+_641.w);
+ int4 _642;
+ int4 _643 = make_int4(((((int)threadIdx.x) + 288))+(1*0), ((((int)threadIdx.x) + 288))+(1*1), ((((int)threadIdx.x) + 288))+(1*2), ((((int)threadIdx.x) + 288))+(1*3));
+ int4 _644 = make_int4(3, 3, 3, 3);
+ _642.x = (_643.x%_644.x);
+ _642.y = (_643.y%_644.y);
+ _642.z = (_643.z%_644.z);
+ _642.w = (_643.w%_644.w);
+ int4 _645;
+ ushort4 _646;
+ ushort4 _647;
+ ushort4 _648;
+ int4 _649 = make_int4(3, 3, 3, 3);
+ int4 _650 = make_int4(0, 0, 0, 0);
+ _648.x = (_649.x>=_650.x);
+ _648.y = (_649.y>=_650.y);
+ _648.z = (_649.z>=_650.z);
+ _648.w = (_649.w>=_650.w);
+ ushort4 _651;
+ int4 _652 = make_int4(0, 0, 0, 0);
+ _651.x = (_642.x>=_652.x);
+ _651.y = (_642.y>=_652.y);
+ _651.z = (_642.z>=_652.z);
+ _651.w = (_642.w>=_652.w);
+ _647.x = (_648.x&&_651.x);
+ _647.y = (_648.y&&_651.y);
+ _647.z = (_648.z&&_651.z);
+ _647.w = (_648.w&&_651.w);
+ ushort4 _653;
+ ushort4 _654;
+ int4 _655 = make_int4(3, 3, 3, 3);
+ int4 _656 = make_int4(0, 0, 0, 0);
+ _654.x = (_655.x<_656.x);
+ _654.y = (_655.y<_656.y);
+ _654.z = (_655.z<_656.z);
+ _654.w = (_655.w<_656.w);
+ ushort4 _657;
+ int4 _658 = make_int4(0, 0, 0, 0);
+ _657.x = (_642.x<=_658.x);
+ _657.y = (_642.y<=_658.y);
+ _657.z = (_642.z<=_658.z);
+ _657.w = (_642.w<=_658.w);
+ _653.x = (_654.x&&_657.x);
+ _653.y = (_654.y&&_657.y);
+ _653.z = (_654.z&&_657.z);
+ _653.w = (_654.w&&_657.w);
+ _646.x = (_647.x||_653.x);
+ _646.y = (_647.y||_653.y);
+ _646.z = (_647.z||_653.z);
+ _646.w = (_647.w||_653.w);
+ int4 _659;
+ int4 _660 = make_int4(3, 3, 3, 3);
+ _659.x = (_642.x+_660.x);
+ _659.y = (_642.y+_660.y);
+ _659.z = (_642.z+_660.z);
+ _659.w = (_642.w+_660.w);
+ _645.x = (bool(_646.x)?_642.x:_659.x);
+ _645.y = (bool(_646.y)?_642.y:_659.y);
+ _645.z = (bool(_646.z)?_642.z:_659.z);
+ _645.w = (bool(_646.w)?_642.w:_659.w);
+ _595.x = (_596.x+_645.x);
+ _595.y = (_596.y+_645.y);
+ _595.z = (_596.z+_645.z);
+ _595.w = (_596.w+_645.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1152)) = make_float4(kernel[_595.x],kernel[_595.y],kernel[_595.z],kernel[_595.w]);
+ int4 _661;
+ int4 _662;
+ int4 _663;
+ int4 _664 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1280) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1280) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1280) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1280) / 48) * 4608)) + (rc_outer_outer [...]
+ int4 _665;
+ int4 _666;
+ int4 _667;
+ int4 _668 = make_int4((((((int)threadIdx.x) * 4) + 1280))+(1*0), (((((int)threadIdx.x) * 4) + 1280))+(1*1), (((((int)threadIdx.x) * 4) + 1280))+(1*2), (((((int)threadIdx.x) * 4) + 1280))+(1*3));
+ int4 _669 = make_int4(3, 3, 3, 3);
+ _667.x = (_668.x%_669.x);
+ _667.y = (_668.y%_669.y);
+ _667.z = (_668.z%_669.z);
+ _667.w = (_668.w%_669.w);
+ int4 _670;
+ int4 _671 = make_int4((((((int)threadIdx.x) * 4) + 1280))+(1*0), (((((int)threadIdx.x) * 4) + 1280))+(1*1), (((((int)threadIdx.x) * 4) + 1280))+(1*2), (((((int)threadIdx.x) * 4) + 1280))+(1*3));
+ int4 _672 = make_int4(3, 3, 3, 3);
+ _670.x = (_671.x/_672.x);
+ _670.y = (_671.y/_672.y);
+ _670.z = (_671.z/_672.z);
+ _670.w = (_671.w/_672.w);
+ int4 _673;
+ ushort4 _674;
+ ushort4 _675;
+ ushort4 _676;
+ int4 _677 = make_int4(3, 3, 3, 3);
+ int4 _678 = make_int4(0, 0, 0, 0);
+ _676.x = (_677.x>=_678.x);
+ _676.y = (_677.y>=_678.y);
+ _676.z = (_677.z>=_678.z);
+ _676.w = (_677.w>=_678.w);
+ ushort4 _679;
+ int4 _680 = make_int4(0, 0, 0, 0);
+ _679.x = (_667.x>=_680.x);
+ _679.y = (_667.y>=_680.y);
+ _679.z = (_667.z>=_680.z);
+ _679.w = (_667.w>=_680.w);
+ _675.x = (_676.x&&_679.x);
+ _675.y = (_676.y&&_679.y);
+ _675.z = (_676.z&&_679.z);
+ _675.w = (_676.w&&_679.w);
+ ushort4 _681;
+ ushort4 _682;
+ int4 _683 = make_int4(3, 3, 3, 3);
+ int4 _684 = make_int4(0, 0, 0, 0);
+ _682.x = (_683.x<_684.x);
+ _682.y = (_683.y<_684.y);
+ _682.z = (_683.z<_684.z);
+ _682.w = (_683.w<_684.w);
+ ushort4 _685;
+ int4 _686 = make_int4(0, 0, 0, 0);
+ _685.x = (_667.x<=_686.x);
+ _685.y = (_667.y<=_686.y);
+ _685.z = (_667.z<=_686.z);
+ _685.w = (_667.w<=_686.w);
+ _681.x = (_682.x&&_685.x);
+ _681.y = (_682.y&&_685.y);
+ _681.z = (_682.z&&_685.z);
+ _681.w = (_682.w&&_685.w);
+ _674.x = (_675.x||_681.x);
+ _674.y = (_675.y||_681.y);
+ _674.z = (_675.z||_681.z);
+ _674.w = (_675.w||_681.w);
+ int4 _687;
+ int4 _688 = make_int4(1, 1, 1, 1);
+ _687.x = (_670.x-_688.x);
+ _687.y = (_670.y-_688.y);
+ _687.z = (_670.z-_688.z);
+ _687.w = (_670.w-_688.w);
+ _673.x = (bool(_674.x)?_670.x:_687.x);
+ _673.y = (bool(_674.y)?_670.y:_687.y);
+ _673.z = (bool(_674.z)?_670.z:_687.z);
+ _673.w = (bool(_674.w)?_670.w:_687.w);
+ int4 _689 = make_int4(16, 16, 16, 16);
+ _666.x = (_673.x%_689.x);
+ _666.y = (_673.y%_689.y);
+ _666.z = (_673.z%_689.z);
+ _666.w = (_673.w%_689.w);
+ int4 _690;
+ ushort4 _691;
+ ushort4 _692;
+ ushort4 _693;
+ int4 _694 = make_int4(16, 16, 16, 16);
+ int4 _695 = make_int4(0, 0, 0, 0);
+ _693.x = (_694.x>=_695.x);
+ _693.y = (_694.y>=_695.y);
+ _693.z = (_694.z>=_695.z);
+ _693.w = (_694.w>=_695.w);
+ ushort4 _696;
+ int4 _697 = make_int4(0, 0, 0, 0);
+ _696.x = (_666.x>=_697.x);
+ _696.y = (_666.y>=_697.y);
+ _696.z = (_666.z>=_697.z);
+ _696.w = (_666.w>=_697.w);
+ _692.x = (_693.x&&_696.x);
+ _692.y = (_693.y&&_696.y);
+ _692.z = (_693.z&&_696.z);
+ _692.w = (_693.w&&_696.w);
+ ushort4 _698;
+ ushort4 _699;
+ int4 _700 = make_int4(16, 16, 16, 16);
+ int4 _701 = make_int4(0, 0, 0, 0);
+ _699.x = (_700.x<_701.x);
+ _699.y = (_700.y<_701.y);
+ _699.z = (_700.z<_701.z);
+ _699.w = (_700.w<_701.w);
+ ushort4 _702;
+ int4 _703 = make_int4(0, 0, 0, 0);
+ _702.x = (_666.x<=_703.x);
+ _702.y = (_666.y<=_703.y);
+ _702.z = (_666.z<=_703.z);
+ _702.w = (_666.w<=_703.w);
+ _698.x = (_699.x&&_702.x);
+ _698.y = (_699.y&&_702.y);
+ _698.z = (_699.z&&_702.z);
+ _698.w = (_699.w&&_702.w);
+ _691.x = (_692.x||_698.x);
+ _691.y = (_692.y||_698.y);
+ _691.z = (_692.z||_698.z);
+ _691.w = (_692.w||_698.w);
+ int4 _704;
+ int4 _705 = make_int4(16, 16, 16, 16);
+ _704.x = (_666.x+_705.x);
+ _704.y = (_666.y+_705.y);
+ _704.z = (_666.z+_705.z);
+ _704.w = (_666.w+_705.w);
+ _690.x = (bool(_691.x)?_666.x:_704.x);
+ _690.y = (bool(_691.y)?_666.y:_704.y);
+ _690.z = (bool(_691.z)?_666.z:_704.z);
+ _690.w = (bool(_691.w)?_666.w:_704.w);
+ int4 _706 = make_int4(9, 9, 9, 9);
+ _665.x = (_690.x*_706.x);
+ _665.y = (_690.y*_706.y);
+ _665.z = (_690.z*_706.z);
+ _665.w = (_690.w*_706.w);
+ _663.x = (_664.x+_665.x);
+ _663.y = (_664.y+_665.y);
+ _663.z = (_664.z+_665.z);
+ _663.w = (_664.w+_665.w);
+ int4 _707 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _662.x = (_663.x+_707.x);
+ _662.y = (_663.y+_707.y);
+ _662.z = (_663.z+_707.z);
+ _662.w = (_663.w+_707.w);
+ int4 _708;
+ int4 _709 = make_int4(((((int)threadIdx.x) + 320))+(1*0), ((((int)threadIdx.x) + 320))+(1*1), ((((int)threadIdx.x) + 320))+(1*2), ((((int)threadIdx.x) + 320))+(1*3));
+ int4 _710 = make_int4(3, 3, 3, 3);
+ _708.x = (_709.x%_710.x);
+ _708.y = (_709.y%_710.y);
+ _708.z = (_709.z%_710.z);
+ _708.w = (_709.w%_710.w);
+ int4 _711;
+ ushort4 _712;
+ ushort4 _713;
+ ushort4 _714;
+ int4 _715 = make_int4(3, 3, 3, 3);
+ int4 _716 = make_int4(0, 0, 0, 0);
+ _714.x = (_715.x>=_716.x);
+ _714.y = (_715.y>=_716.y);
+ _714.z = (_715.z>=_716.z);
+ _714.w = (_715.w>=_716.w);
+ ushort4 _717;
+ int4 _718 = make_int4(0, 0, 0, 0);
+ _717.x = (_708.x>=_718.x);
+ _717.y = (_708.y>=_718.y);
+ _717.z = (_708.z>=_718.z);
+ _717.w = (_708.w>=_718.w);
+ _713.x = (_714.x&&_717.x);
+ _713.y = (_714.y&&_717.y);
+ _713.z = (_714.z&&_717.z);
+ _713.w = (_714.w&&_717.w);
+ ushort4 _719;
+ ushort4 _720;
+ int4 _721 = make_int4(3, 3, 3, 3);
+ int4 _722 = make_int4(0, 0, 0, 0);
+ _720.x = (_721.x<_722.x);
+ _720.y = (_721.y<_722.y);
+ _720.z = (_721.z<_722.z);
+ _720.w = (_721.w<_722.w);
+ ushort4 _723;
+ int4 _724 = make_int4(0, 0, 0, 0);
+ _723.x = (_708.x<=_724.x);
+ _723.y = (_708.y<=_724.y);
+ _723.z = (_708.z<=_724.z);
+ _723.w = (_708.w<=_724.w);
+ _719.x = (_720.x&&_723.x);
+ _719.y = (_720.y&&_723.y);
+ _719.z = (_720.z&&_723.z);
+ _719.w = (_720.w&&_723.w);
+ _712.x = (_713.x||_719.x);
+ _712.y = (_713.y||_719.y);
+ _712.z = (_713.z||_719.z);
+ _712.w = (_713.w||_719.w);
+ int4 _725;
+ int4 _726 = make_int4(3, 3, 3, 3);
+ _725.x = (_708.x+_726.x);
+ _725.y = (_708.y+_726.y);
+ _725.z = (_708.z+_726.z);
+ _725.w = (_708.w+_726.w);
+ _711.x = (bool(_712.x)?_708.x:_725.x);
+ _711.y = (bool(_712.y)?_708.y:_725.y);
+ _711.z = (bool(_712.z)?_708.z:_725.z);
+ _711.w = (bool(_712.w)?_708.w:_725.w);
+ _661.x = (_662.x+_711.x);
+ _661.y = (_662.y+_711.y);
+ _661.z = (_662.z+_711.z);
+ _661.w = (_662.w+_711.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1280)) = make_float4(kernel[_661.x],kernel[_661.y],kernel[_661.z],kernel[_661.w]);
+ int4 _727;
+ int4 _728;
+ int4 _729;
+ int4 _730 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1408) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1408) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1408) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1408) / 48) * 4608)) + (rc_outer_outer [...]
+ int4 _731;
+ int4 _732;
+ int4 _733;
+ int4 _734 = make_int4((((((int)threadIdx.x) * 4) + 1408))+(1*0), (((((int)threadIdx.x) * 4) + 1408))+(1*1), (((((int)threadIdx.x) * 4) + 1408))+(1*2), (((((int)threadIdx.x) * 4) + 1408))+(1*3));
+ int4 _735 = make_int4(3, 3, 3, 3);
+ _733.x = (_734.x%_735.x);
+ _733.y = (_734.y%_735.y);
+ _733.z = (_734.z%_735.z);
+ _733.w = (_734.w%_735.w);
+ int4 _736;
+ int4 _737 = make_int4((((((int)threadIdx.x) * 4) + 1408))+(1*0), (((((int)threadIdx.x) * 4) + 1408))+(1*1), (((((int)threadIdx.x) * 4) + 1408))+(1*2), (((((int)threadIdx.x) * 4) + 1408))+(1*3));
+ int4 _738 = make_int4(3, 3, 3, 3);
+ _736.x = (_737.x/_738.x);
+ _736.y = (_737.y/_738.y);
+ _736.z = (_737.z/_738.z);
+ _736.w = (_737.w/_738.w);
+ int4 _739;
+ ushort4 _740;
+ ushort4 _741;
+ ushort4 _742;
+ int4 _743 = make_int4(3, 3, 3, 3);
+ int4 _744 = make_int4(0, 0, 0, 0);
+ _742.x = (_743.x>=_744.x);
+ _742.y = (_743.y>=_744.y);
+ _742.z = (_743.z>=_744.z);
+ _742.w = (_743.w>=_744.w);
+ ushort4 _745;
+ int4 _746 = make_int4(0, 0, 0, 0);
+ _745.x = (_733.x>=_746.x);
+ _745.y = (_733.y>=_746.y);
+ _745.z = (_733.z>=_746.z);
+ _745.w = (_733.w>=_746.w);
+ _741.x = (_742.x&&_745.x);
+ _741.y = (_742.y&&_745.y);
+ _741.z = (_742.z&&_745.z);
+ _741.w = (_742.w&&_745.w);
+ ushort4 _747;
+ ushort4 _748;
+ int4 _749 = make_int4(3, 3, 3, 3);
+ int4 _750 = make_int4(0, 0, 0, 0);
+ _748.x = (_749.x<_750.x);
+ _748.y = (_749.y<_750.y);
+ _748.z = (_749.z<_750.z);
+ _748.w = (_749.w<_750.w);
+ ushort4 _751;
+ int4 _752 = make_int4(0, 0, 0, 0);
+ _751.x = (_733.x<=_752.x);
+ _751.y = (_733.y<=_752.y);
+ _751.z = (_733.z<=_752.z);
+ _751.w = (_733.w<=_752.w);
+ _747.x = (_748.x&&_751.x);
+ _747.y = (_748.y&&_751.y);
+ _747.z = (_748.z&&_751.z);
+ _747.w = (_748.w&&_751.w);
+ _740.x = (_741.x||_747.x);
+ _740.y = (_741.y||_747.y);
+ _740.z = (_741.z||_747.z);
+ _740.w = (_741.w||_747.w);
+ int4 _753;
+ int4 _754 = make_int4(1, 1, 1, 1);
+ _753.x = (_736.x-_754.x);
+ _753.y = (_736.y-_754.y);
+ _753.z = (_736.z-_754.z);
+ _753.w = (_736.w-_754.w);
+ _739.x = (bool(_740.x)?_736.x:_753.x);
+ _739.y = (bool(_740.y)?_736.y:_753.y);
+ _739.z = (bool(_740.z)?_736.z:_753.z);
+ _739.w = (bool(_740.w)?_736.w:_753.w);
+ int4 _755 = make_int4(16, 16, 16, 16);
+ _732.x = (_739.x%_755.x);
+ _732.y = (_739.y%_755.y);
+ _732.z = (_739.z%_755.z);
+ _732.w = (_739.w%_755.w);
+ int4 _756;
+ ushort4 _757;
+ ushort4 _758;
+ ushort4 _759;
+ int4 _760 = make_int4(16, 16, 16, 16);
+ int4 _761 = make_int4(0, 0, 0, 0);
+ _759.x = (_760.x>=_761.x);
+ _759.y = (_760.y>=_761.y);
+ _759.z = (_760.z>=_761.z);
+ _759.w = (_760.w>=_761.w);
+ ushort4 _762;
+ int4 _763 = make_int4(0, 0, 0, 0);
+ _762.x = (_732.x>=_763.x);
+ _762.y = (_732.y>=_763.y);
+ _762.z = (_732.z>=_763.z);
+ _762.w = (_732.w>=_763.w);
+ _758.x = (_759.x&&_762.x);
+ _758.y = (_759.y&&_762.y);
+ _758.z = (_759.z&&_762.z);
+ _758.w = (_759.w&&_762.w);
+ ushort4 _764;
+ ushort4 _765;
+ int4 _766 = make_int4(16, 16, 16, 16);
+ int4 _767 = make_int4(0, 0, 0, 0);
+ _765.x = (_766.x<_767.x);
+ _765.y = (_766.y<_767.y);
+ _765.z = (_766.z<_767.z);
+ _765.w = (_766.w<_767.w);
+ ushort4 _768;
+ int4 _769 = make_int4(0, 0, 0, 0);
+ _768.x = (_732.x<=_769.x);
+ _768.y = (_732.y<=_769.y);
+ _768.z = (_732.z<=_769.z);
+ _768.w = (_732.w<=_769.w);
+ _764.x = (_765.x&&_768.x);
+ _764.y = (_765.y&&_768.y);
+ _764.z = (_765.z&&_768.z);
+ _764.w = (_765.w&&_768.w);
+ _757.x = (_758.x||_764.x);
+ _757.y = (_758.y||_764.y);
+ _757.z = (_758.z||_764.z);
+ _757.w = (_758.w||_764.w);
+ int4 _770;
+ int4 _771 = make_int4(16, 16, 16, 16);
+ _770.x = (_732.x+_771.x);
+ _770.y = (_732.y+_771.y);
+ _770.z = (_732.z+_771.z);
+ _770.w = (_732.w+_771.w);
+ _756.x = (bool(_757.x)?_732.x:_770.x);
+ _756.y = (bool(_757.y)?_732.y:_770.y);
+ _756.z = (bool(_757.z)?_732.z:_770.z);
+ _756.w = (bool(_757.w)?_732.w:_770.w);
+ int4 _772 = make_int4(9, 9, 9, 9);
+ _731.x = (_756.x*_772.x);
+ _731.y = (_756.y*_772.y);
+ _731.z = (_756.z*_772.z);
+ _731.w = (_756.w*_772.w);
+ _729.x = (_730.x+_731.x);
+ _729.y = (_730.y+_731.y);
+ _729.z = (_730.z+_731.z);
+ _729.w = (_730.w+_731.w);
+ int4 _773 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+ _728.x = (_729.x+_773.x);
+ _728.y = (_729.y+_773.y);
+ _728.z = (_729.z+_773.z);
+ _728.w = (_729.w+_773.w);
+ int4 _774;
+ int4 _775 = make_int4(((((int)threadIdx.x) + 352))+(1*0), ((((int)threadIdx.x) + 352))+(1*1), ((((int)threadIdx.x) + 352))+(1*2), ((((int)threadIdx.x) + 352))+(1*3));
+ int4 _776 = make_int4(3, 3, 3, 3);
+ _774.x = (_775.x%_776.x);
+ _774.y = (_775.y%_776.y);
+ _774.z = (_775.z%_776.z);
+ _774.w = (_775.w%_776.w);
+ int4 _777;
+ ushort4 _778;
+ ushort4 _779;
+ ushort4 _780;
+ int4 _781 = make_int4(3, 3, 3, 3);
+ int4 _782 = make_int4(0, 0, 0, 0);
+ _780.x = (_781.x>=_782.x);
+ _780.y = (_781.y>=_782.y);
+ _780.z = (_781.z>=_782.z);
+ _780.w = (_781.w>=_782.w);
+ ushort4 _783;
+ int4 _784 = make_int4(0, 0, 0, 0);
+ _783.x = (_774.x>=_784.x);
+ _783.y = (_774.y>=_784.y);
+ _783.z = (_774.z>=_784.z);
+ _783.w = (_774.w>=_784.w);
+ _779.x = (_780.x&&_783.x);
+ _779.y = (_780.y&&_783.y);
+ _779.z = (_780.z&&_783.z);
+ _779.w = (_780.w&&_783.w);
+ ushort4 _785;
+ ushort4 _786;
+ int4 _787 = make_int4(3, 3, 3, 3);
+ int4 _788 = make_int4(0, 0, 0, 0);
+ _786.x = (_787.x<_788.x);
+ _786.y = (_787.y<_788.y);
+ _786.z = (_787.z<_788.z);
+ _786.w = (_787.w<_788.w);
+ ushort4 _789;
+ int4 _790 = make_int4(0, 0, 0, 0);
+ _789.x = (_774.x<=_790.x);
+ _789.y = (_774.y<=_790.y);
+ _789.z = (_774.z<=_790.z);
+ _789.w = (_774.w<=_790.w);
+ _785.x = (_786.x&&_789.x);
+ _785.y = (_786.y&&_789.y);
+ _785.z = (_786.z&&_789.z);
+ _785.w = (_786.w&&_789.w);
+ _778.x = (_779.x||_785.x);
+ _778.y = (_779.y||_785.y);
+ _778.z = (_779.z||_785.z);
+ _778.w = (_779.w||_785.w);
+ int4 _791;
+ int4 _792 = make_int4(3, 3, 3, 3);
+ _791.x = (_774.x+_792.x);
+ _791.y = (_774.y+_792.y);
+ _791.z = (_774.z+_792.z);
+ _791.w = (_774.w+_792.w);
+ _777.x = (bool(_778.x)?_774.x:_791.x);
+ _777.y = (bool(_778.y)?_774.y:_791.y);
+ _777.z = (bool(_778.z)?_774.z:_791.z);
+ _777.w = (bool(_778.w)?_774.w:_791.w);
+ _727.x = (_728.x+_777.x);
+ _727.y = (_728.y+_777.y);
+ _727.z = (_728.z+_777.z);
+ _727.w = (_728.w+_777.w);
+ *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1408)) = make_float4(kernel[_727.x],kernel[_727.y],kernel[_727.z],kernel[_727.w]);
__syncthreads();
- for (int rc_outer_inner = 0; rc_outer_inner < 16; ++rc_outer_inner) {
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7))] * kernel_shared[(((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6))]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7))] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 384)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 1)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 385)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 2)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 386)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 3)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 387)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 4)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 388)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 5)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 389)]));
+ for (int rc_outer_inner = 0; rc_outer_inner < 8; ++rc_outer_inner) {
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(rc_outer_inner * 18)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 1)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 2)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 3)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 4)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 5)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 6)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 1)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 2)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 3)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 4)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 5)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 6)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 7)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 2)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 3)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 4)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 5)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 6)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 7)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 8)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 9)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 10)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 11)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 12)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 13)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 14)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 15)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 10)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 11)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 12)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 13)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 14)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 15)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 16)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 11)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 12)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 13)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 14)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 15)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 16)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 17)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
}
}
}
- compute[((((int)blockIdx.x) * 392) + ((int)threadIdx.x))] = max((conv2d_nchw[0] + bias[((((int)blockIdx.x) * 8) + (((int)threadIdx.x) / 49))]), 0.000000e+00f);
- compute[(((((int)blockIdx.x) * 392) + ((int)threadIdx.x)) + 196)] = max((conv2d_nchw[1] + bias[(((((int)blockIdx.x) * 8) + (((int)threadIdx.x) / 49)) + 4)]), 0.000000e+00f);
+ for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
+ compute[(((((((int)blockIdx.x) / 7) * 1568) + (((int)threadIdx.x) * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[i3_inner] + bias[(((((int)blockIdx.x) / 7) * 32) + ((int)threadIdx.x))]), 0.000000e+00f);
+ }
}
</pre></div>
</div>
@@ -774,7 +3336,7 @@ In the example below we resume the status and do more 5 trials.</p>
Get devices for measurement successfully!
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 35.530 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 34.321 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/e3e540f3b477c0c52d8eb73e674e8ffd/tune_conv2d_layer_cuda.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_conv2d_layer_cuda.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
index 3232d60b6..98472f75a 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
@@ -901,7 +901,7 @@ so we can read the log file and load the best schedules.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 9.6964 9.7275 9.7331 9.6288 0.0479
+ 9.8793 9.8894 9.9270 9.8216 0.0436
</pre></div>
</div>
</div>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
index db7a6dc1d..fb304af72 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
@@ -920,7 +920,7 @@ so we can read the log file and load the best schedules.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 752.6694 752.9576 753.1499 751.9008 0.5491
+ 757.9333 757.6261 759.1621 757.0116 0.9044
</pre></div>
</div>
</div>
@@ -942,7 +942,7 @@ to learn how to use the RPC Tracker and RPC Server.
To use the RPC Tracker in auto-scheduler, replace the runner in <code class="code docutils literal notranslate"><span class="pre">TuningOptions</span></code>
with <a class="reference internal" href="../../reference/api/python/auto_scheduler.html#tvm.auto_scheduler.RPCRunner" title="tvm.auto_scheduler.RPCRunner"><code class="xref any py py-class docutils literal notranslate"><span class="pre">auto_scheduler.RPCRunner</span></code></a>.</p></li>
</ol>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 19.801 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 20.685 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-network-x86-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/e416b94ca1090b0897c0f6e0df95b911/tune_network_x86.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_network_x86.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
index cb6ecd47f..3a5b75b97 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
@@ -620,12 +620,12 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [65536], []),
compute: Buffer(compute_2: Pointer(float32), float32, [65536], [])}
buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute}
- preflattened_buffer_map = {placeholder_5: placeholder_15: Buffer(placeholder_10, float32, [128, 256], []), placeholder_7: placeholder_16: Buffer(placeholder_12, int32, [4916], []), placeholder_8: placeholder_17: Buffer(placeholder_13, int32, [33], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_9: placeholder_18: Buffer(placeholder_14, float32, [128, 512], []), placeholder_6: placeholder_19: Buffer(placeholder_11, float32, [4916, 16, 1], [])} {
+ preflattened_buffer_map = {placeholder_7: placeholder_15: Buffer(placeholder_12, int32, [4916], []), placeholder_6: placeholder_16: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_8: placeholder_17: Buffer(placeholder_13, int32, [33], []), placeholder_5: placeholder_18: Buffer(placeholder_10, float32, [128, 256], []), placeholder_9: placeholder_19: Buffer(placeholder_14, float32, [128, 512], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], [])} {
for (i0.outer.i1.outer.fused: int32, 0, 32) "parallel" {
allocate(compute_4: Pointer(global float32), float32, [2048]), storage_scope = global {
- for (nb_j.inner: int32, 0, 2) {
- for (i.inner.init: int32, 0, 64) {
- let cse_var_1: int32 = ((i.inner.init*32) + (nb_j.inner*16))
+ for (i.outer.inner: int32, 0, 4) {
+ for (i.inner.init: int32, 0, 32) {
+ let cse_var_1: int32 = ((i.outer.inner*512) + (i.inner.init*16))
{
compute_5: Buffer(compute_4, float32, [2048], [])[cse_var_1] = 0f32
compute_5[(cse_var_1 + 1)] = 0f32
@@ -645,51 +645,78 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
compute_5[(cse_var_1 + 15)] = 0f32
}
}
- for (elem_idx: int32, 0, let cse_var_2: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner) in (placeholder_3[(cse_var_2 + 1)] - placeholder_3[cse_var_2])) {
- for (i.inner: int32, 0, 64) {
- let cse_var_21: int32 = (elem_idx*16)
- let cse_var_20: int32 = ((i.inner*32) + (nb_j.inner*16))
- let cse_var_19: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)
- let cse_var_18: int32 = ((floordiv(i0.outer.i1.outer.fused, 16)*16384) + (i.inner*256))
- let cse_var_17: int32 = (cse_var_20 + 9)
- let cse_var_16: int32 = (cse_var_20 + 8)
- let cse_var_15: int32 = (cse_var_20 + 7)
- let cse_var_14: int32 = (cse_var_20 + 6)
- let cse_var_13: int32 = (cse_var_20 + 5)
- let cse_var_12: int32 = (cse_var_20 + 4)
- let cse_var_11: int32 = (cse_var_20 + 3)
- let cse_var_10: int32 = (cse_var_20 + 2)
- let cse_var_9: int32 = (cse_var_20 + 15)
- let cse_var_8: int32 = (cse_var_20 + 14)
- let cse_var_7: int32 = (cse_var_20 + 13)
- let cse_var_6: int32 = (cse_var_20 + 12)
- let cse_var_5: int32 = (cse_var_20 + 11)
- let cse_var_4: int32 = (cse_var_20 + 10)
- let cse_var_3: int32 = (cse_var_20 + 1)
- {
- compute_5[cse_var_20] = (compute_5[cse_var_20] + (placeholder_1[((placeholder_3[cse_var_19]*16) + cse_var_21)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 1)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 2)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 3)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 4)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 5)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 6)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 7)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 8)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 9)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 10)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 11)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 12)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 13)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 14)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
- compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 15)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
+ for (elem_idx: int32, 0, (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])) {
+ for (i.inner: int32, 0, 32) {
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_2: int32 = ((i.outer.inner*512) + (i.inner*16))
+ compute_5[cse_var_2] = (compute_5[cse_var_2] + (placeholder_1[((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16))]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_3: int32 = (((i.outer.inner*512) + (i.inner*16)) + 1)
+ compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 1)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_4: int32 = (((i.outer.inner*512) + (i.inner*16)) + 2)
+ compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 2)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_5: int32 = (((i.outer.inner*512) + (i.inner*16)) + 3)
+ compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 3)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_6: int32 = (((i.outer.inner*512) + (i.inner*16)) + 4)
+ compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 4)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_7: int32 = (((i.outer.inner*512) + (i.inner*16)) + 5)
+ compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 5)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_8: int32 = (((i.outer.inner*512) + (i.inner*16)) + 6)
+ compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 6)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_9: int32 = (((i.outer.inner*512) + (i.inner*16)) + 7)
+ compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 7)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_10: int32 = (((i.outer.inner*512) + (i.inner*16)) + 8)
+ compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 8)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_11: int32 = (((i.outer.inner*512) + (i.inner*16)) + 9)
+ compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 9)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_12: int32 = (((i.outer.inner*512) + (i.inner*16)) + 10)
+ compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 10)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_13: int32 = (((i.outer.inner*512) + (i.inner*16)) + 11)
+ compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 11)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_14: int32 = (((i.outer.inner*512) + (i.inner*16)) + 12)
+ compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 12)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_15: int32 = (((i.outer.inner*512) + (i.inner*16)) + 13)
+ compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 13)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_16: int32 = (((i.outer.inner*512) + (i.inner*16)) + 14)
+ compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 14)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_17: int32 = (((i.outer.inner*512) + (i.inner*16)) + 15)
+ compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 15)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
}
}
}
}
- for (i0.inner: int32, 0, 64) {
- let cse_var_22: int32 = (((floordiv(i0.outer.i1.outer.fused, 16)*32768) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32))
- compute[ramp(cse_var_22, 1, 32)] = max((compute_5[ramp((i0.inner*32), 1, 32)] + placeholder_4[ramp(cse_var_22, 1, 32)]), broadcast(0f32, 32))
+ for (i0.inner: int32, 0, 128) {
+ let cse_var_18: int32 = ((i0.inner*512) + (i0.outer.i1.outer.fused*16))
+ compute[ramp(cse_var_18, 1, 16)] = max((compute_5[ramp((i0.inner*16), 1, 16)] + placeholder_4[ramp(cse_var_18, 1, 16)]), broadcast(0f32, 16))
}
}
}
@@ -727,7 +754,7 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
<span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.855 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.730 ms
</pre></div>
</div>
<div class="admonition note">
diff --git a/docs/how_to/tune_with_autotvm/sg_execution_times.html b/docs/how_to/tune_with_autotvm/sg_execution_times.html
index 7f5f3fcf3..c40dad72e 100644
--- a/docs/how_to/tune_with_autotvm/sg_execution_times.html
+++ b/docs/how_to/tune_with_autotvm/sg_execution_times.html
@@ -322,7 +322,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-tune-with-autotvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:43.277</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
+<p><strong>00:43.532</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 84%" />
@@ -331,11 +331,11 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-conv2d-cuda-py"><span class="std std-ref">Tuning High Performance Convolution on NVIDIA GPUs</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_cuda.py</span></code>)</p></td>
-<td><p>00:43.248</p></td>
+<td><p>00:43.499</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_relay_x86.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-x86-py"><span class="std std-ref">Auto-tuning a Convolutional Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_x86.py</span></code>)</p></td>
-<td><p>00:00.015</p></td>
+<td><p>00:00.019</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_relay_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-cuda-py"><span class="std std-ref">Auto-tuning a Convolutional Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_cuda.py</span></code>)</p></td>
diff --git a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
index 9b50d4e52..bb763fa2c 100644
--- a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
+++ b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
@@ -1164,8 +1164,8 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 4, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2885496
-No: 6 GFLOPS: 110.83/110.83 result: MeasureResult(costs=(0.002088788229166667,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.8140833377838135, timestamp=1655930458.0250723) [('tile_f', [-1, 1, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3754080
-No: 7 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+No: 6 GFLOPS: 110.46/110.46 result: MeasureResult(costs=(0.00209571425,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.82222318649292, timestamp=1655930909.2240996) [('tile_f', [-1, 1, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3754080
+No: 7 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1288,7 +1288,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 16, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 256, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6225319
-No: 8 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+No: 8 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1411,7 +1411,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 1, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 64]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,943546
-No: 9 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+No: 9 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1534,7 +1534,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 16, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2868708
-No: 10 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+No: 10 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 142, in build
res = future.result()
File "/usr/lib/python3.7/concurrent/futures/_base.py", line 435, in result
@@ -1552,7 +1552,7 @@ No: 10 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
TimeoutError
[('tile_f', [-1, 32, 2, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4691833
-No: 11 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+No: 11 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1675,7 +1675,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 2, 64]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1042124
-No: 12 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+No: 12 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1798,7 +1798,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 1, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10013405
-No: 13 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+No: 13 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1921,7 +1921,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 8, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6732082
-No: 14 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+No: 14 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2044,7 +2044,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 4, 32]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7536735
-No: 15 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+No: 15 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2167,7 +2167,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 1, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 128, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,482121
-No: 16 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+No: 16 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2290,7 +2290,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 1, 16]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 32, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2824525
-No: 17 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+No: 17 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2413,7 +2413,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 64, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4559286
-No: 18 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+No: 18 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2536,7 +2536,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 32, 16]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 512]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9677544
-No: 19 GFLOPS: 0.00/110.83 result: Traceback (most recent call last):
+No: 19 GFLOPS: 0.00/110.46 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 738, in __call__
yield remote, remote.load_module(os.path.split(build_result.filename)[1])
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 702, in run_through_rpc
@@ -2624,7 +2624,7 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
15: _PyEval_EvalFrameDefault
14: 0x0000000000537c30
13: _PyObject_FastCallKeywords
- 12: 0x00007ffab91b6fa2
+ 12: 0x00007f97971cffa2
11: _ctypes_callproc
10: ffi_call
9: ffi_call_unix64
@@ -2689,7 +2689,7 @@ Traceback (most recent call last):
21: _PyFunction_FastCallKeywords
20: _PyEval_EvalFrameDefault
19: _PyFunction_FastCall [('tile_f', [-1, 8, 2, 16]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6390073
-No: 20 GFLOPS: 144.17/144.17 result: MeasureResult(costs=(0.00160570644,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4077048301696777, timestamp=1655930484.4690123) [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
+No: 20 GFLOPS: 144.77/144.77 result: MeasureResult(costs=(0.00159906648,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4326999187469482, timestamp=1655930935.759078) [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
</pre></div>
</div>
<p>Finally we can inspect the best config from log file, check correctness,
@@ -2730,7 +2730,7 @@ and measure running time.</p>
Best config:
[('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
Finish loading 20 records
-Time cost of this operator: 0.001957
+Time cost of this operator: 0.002022
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autotvm-tune-conv2d-cuda-py">
diff --git a/docs/how_to/work_with_microtvm/micro_autotune.html b/docs/how_to/work_with_microtvm/micro_autotune.html
index b6173a354..a1192dcba 100644
--- a/docs/how_to/work_with_microtvm/micro_autotune.html
+++ b/docs/how_to/work_with_microtvm/micro_autotune.html
@@ -578,10 +578,10 @@ the tuned operator.</p>
########## Build without Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs
--------- --- -------- ------- ----- ------ -------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 318.6 98.76 (1, 2, 10, 10, 3) 2 1
-tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.076 0.953 (1, 6, 10, 10) 1 1
-tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.924 0.286 (1, 1, 10, 10, 3) 1 1
-Total_time - 322.6 - - - -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 315.3 98.749 (1, 2, 10, 10, 3) 2 1
+tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.079 0.964 (1, 6, 10, 10) 1 1
+tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.916 0.287 (1, 1, 10, 10, 3) 1 1
+Total_time - 319.295 - - - -
</pre></div>
</div>
</div>
@@ -634,10 +634,10 @@ Total_time -
########## Build with Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs
--------- --- -------- ------- ----- ------ -------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 190.6 98.591 (1, 1, 10, 10, 6) 2 1
-tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.902 0.984 (1, 6, 10, 10) 1 1
-tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.823 0.426 (1, 3, 10, 10, 1) 1 1
-Total_time - 193.325 - - - -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 135.1 98.066 (1, 6, 10, 10, 1) 2 1
+tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.748 1.269 (1, 6, 10, 10) 1 1
+tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.916 0.665 (1, 1, 10, 10, 3) 1 1
+Total_time - 137.765 - - - -
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-autotune-py">
diff --git a/docs/how_to/work_with_microtvm/micro_train.html b/docs/how_to/work_with_microtvm/micro_train.html
index 71e46e5e4..4fe789120 100644
--- a/docs/how_to/work_with_microtvm/micro_train.html
+++ b/docs/how_to/work_with_microtvm/micro_train.html
@@ -510,7 +510,7 @@ take about <strong>2 minutes</strong> to download the Stanford Cars, while COCO
<a href="https://docs.python.org/3/library/shutil.html#shutil.move" title="shutil.move" class="sphx-glr-backref-module-shutil sphx-glr-backref-type-py-function"><span class="n">shutil</span><span class="o">.</span><span class="n">move</span></a><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><a href="https://docs.python.org/3/library/stdtypes.html#str" title="builtins.str" class="sphx-glr-backref-module-builtins sphx-glr-backref-typ [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>'/tmp/tmp5adly3xq/images/random'
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>'/tmp/tmptfwkoswh/images/random'
</pre></div>
</div>
</div>
@@ -570,8 +570,8 @@ objects to other stuff? We can display some examples from our datasets using <co
<span class="n">plt</span><span class="o">.</span><span class="n">axis</span><span class="p">(</span><span class="s2">"off"</span><span class="p">)</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmp5adly3xq/images/target contains 8144 images
-/tmp/tmp5adly3xq/images/random contains 5000 images
+<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmptfwkoswh/images/target contains 8144 images
+/tmp/tmptfwkoswh/images/random contains 5000 images
</pre></div>
</div>
</div>
@@ -683,13 +683,13 @@ the time on our validation set).</p>
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Epoch 1/3
-328/328 - 55s - loss: 0.2082 - accuracy: 0.9277 - val_loss: 0.1597 - val_accuracy: 0.9535
+328/328 - 55s - loss: 0.2163 - accuracy: 0.9270 - val_loss: 0.1386 - val_accuracy: 0.9528
Epoch 2/3
-328/328 - 52s - loss: 0.0971 - accuracy: 0.9631 - val_loss: 0.1233 - val_accuracy: 0.9630
+328/328 - 52s - loss: 0.0909 - accuracy: 0.9670 - val_loss: 0.1189 - val_accuracy: 0.9581
Epoch 3/3
-328/328 - 52s - loss: 0.0650 - accuracy: 0.9757 - val_loss: 0.1162 - val_accuracy: 0.9600
+328/328 - 52s - loss: 0.0696 - accuracy: 0.9729 - val_loss: 0.1105 - val_accuracy: 0.9641
-<keras.callbacks.History object at 0x7f7ed75d5e10>
+<keras.callbacks.History object at 0x7f489d506f10>
</pre></div>
</div>
</div>
@@ -951,7 +951,7 @@ as intended.</p>
<p>From here, we could modify the model to read live images from the camera - we have another
Arduino tutorial for how to do that <a class="reference external" href="https://github.com/guberti/tvm-arduino-demos/tree/master/examples/person_detection">on GitHub</a>. Alternatively, we could also
<a class="reference external" href="https://tvm.apache.org/docs/how_to/work_with_microtvm/micro_autotune.html">use TVM’s autotuning capabilities</a> to dramatically improve the model’s performance.</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 8 minutes 7.957 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 10 minutes 12.692 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-train-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/b52cec46baf4f78d6bcd94cbe269c8a6/micro_train.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">micro_train.py</span></code></a></p>
diff --git a/docs/how_to/work_with_microtvm/sg_execution_times.html b/docs/how_to/work_with_microtvm/sg_execution_times.html
index 4bffce807..6c977af94 100644
--- a/docs/how_to/work_with_microtvm/sg_execution_times.html
+++ b/docs/how_to/work_with_microtvm/sg_execution_times.html
@@ -322,7 +322,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-work-with-microtvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>08:52.919</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
+<p><strong>10:58.917</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -331,15 +331,15 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="micro_train.html#sphx-glr-how-to-work-with-microtvm-micro-train-py"><span class="std std-ref">Training Vision Models for microTVM on Arduino</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_train.py</span></code>)</p></td>
-<td><p>08:07.957</p></td>
+<td><p>10:12.692</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="micro_autotune.html#sphx-glr-how-to-work-with-microtvm-micro-autotune-py"><span class="std std-ref">Autotuning with microTVM</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_autotune.py</span></code>)</p></td>
-<td><p>00:41.550</p></td>
+<td><p>00:42.778</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="micro_tflite.html#sphx-glr-how-to-work-with-microtvm-micro-tflite-py"><span class="std std-ref">microTVM with TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_tflite.py</span></code>)</p></td>
-<td><p>00:03.412</p></td>
+<td><p>00:03.447</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="micro_ethosu.html#sphx-glr-how-to-work-with-microtvm-micro-ethosu-py"><span class="std std-ref">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_ethosu.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_relay/sg_execution_times.html b/docs/how_to/work_with_relay/sg_execution_times.html
index 2bb2fe3d9..17e787639 100644
--- a/docs/how_to/work_with_relay/sg_execution_times.html
+++ b/docs/how_to/work_with_relay/sg_execution_times.html
@@ -322,7 +322,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-work-with-relay-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:11.361</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
+<p><strong>00:11.381</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -331,11 +331,11 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="using_external_lib.html#sphx-glr-how-to-work-with-relay-using-external-lib-py"><span class="std std-ref">Using External Libraries in Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_external_lib.py</span></code>)</p></td>
-<td><p>00:09.853</p></td>
+<td><p>00:09.879</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="build_gcn.html#sphx-glr-how-to-work-with-relay-build-gcn-py"><span class="std std-ref">Building a Graph Convolutional Network</span></a> (<code class="docutils literal notranslate"><span class="pre">build_gcn.py</span></code>)</p></td>
-<td><p>00:01.502</p></td>
+<td><p>00:01.496</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="using_relay_viz.html#sphx-glr-how-to-work-with-relay-using-relay-viz-py"><span class="std std-ref">Use Relay Visualizer to Visualize Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_relay_viz.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_schedules/intrin_math.html b/docs/how_to/work_with_schedules/intrin_math.html
index 4d6c477b4..e5281259e 100644
--- a/docs/how_to/work_with_schedules/intrin_math.html
+++ b/docs/how_to/work_with_schedules/intrin_math.html
@@ -515,7 +515,7 @@ The following example customizes CUDA lowering rule for <code class="code docuti
<a href="../../reference/api/python/ir.html#tvm.ir.register_intrin_lowering" title="tvm.ir.register_intrin_lowering" class="sphx-glr-backref-module-tvm-ir sphx-glr-backref-type-py-function"><span class="n">register_intrin_lowering</span></a><span class="p">(</span><span class="s2">"tir.exp"</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">"cuda"</span><span class="p">,</span> <span class="n">f</span><span class="o">= [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span><function my_cuda_math_rule at 0x7f7e3fb81440>
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span><function my_cuda_math_rule at 0x7f4810e0c950>
</pre></div>
</div>
<p>Register the rule to TVM with override option to override existing rule.
diff --git a/docs/how_to/work_with_schedules/sg_execution_times.html b/docs/how_to/work_with_schedules/sg_execution_times.html
index cc0a25541..c8f17193d 100644
--- a/docs/how_to/work_with_schedules/sg_execution_times.html
+++ b/docs/how_to/work_with_schedules/sg_execution_times.html
@@ -322,7 +322,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-work-with-schedules-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:04.002</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
+<p><strong>00:03.981</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -331,19 +331,19 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="intrin_math.html#sphx-glr-how-to-work-with-schedules-intrin-math-py"><span class="std std-ref">Intrinsics and Math Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">intrin_math.py</span></code>)</p></td>
-<td><p>00:01.863</p></td>
+<td><p>00:01.854</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tensorize.html#sphx-glr-how-to-work-with-schedules-tensorize-py"><span class="std std-ref">Use Tensorize to Leverage Hardware Intrinsics</span></a> (<code class="docutils literal notranslate"><span class="pre">tensorize.py</span></code>)</p></td>
-<td><p>00:00.949</p></td>
+<td><p>00:00.930</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="reduction.html#sphx-glr-how-to-work-with-schedules-reduction-py"><span class="std std-ref">Reduction</span></a> (<code class="docutils literal notranslate"><span class="pre">reduction.py</span></code>)</p></td>
-<td><p>00:00.514</p></td>
+<td><p>00:00.522</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="scan.html#sphx-glr-how-to-work-with-schedules-scan-py"><span class="std std-ref">Scan and Recurrent Kernel</span></a> (<code class="docutils literal notranslate"><span class="pre">scan.py</span></code>)</p></td>
-<td><p>00:00.505</p></td>
+<td><p>00:00.503</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="extern_op.html#sphx-glr-how-to-work-with-schedules-extern-op-py"><span class="std std-ref">External Tensor Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">extern_op.py</span></code>)</p></td>
@@ -355,11 +355,11 @@
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tedd.html#sphx-glr-how-to-work-with-schedules-tedd-py"><span class="std std-ref">Use Tensor Expression Debug Display (TEDD) for Visualization</span></a> (<code class="docutils literal notranslate"><span class="pre">tedd.py</span></code>)</p></td>
-<td><p>00:00.026</p></td>
+<td><p>00:00.027</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tuple_inputs.html#sphx-glr-how-to-work-with-schedules-tuple-inputs-py"><span class="std std-ref">Compute and Reduce with Tuple Inputs</span></a> (<code class="docutils literal notranslate"><span class="pre">tuple_inputs.py</span></code>)</p></td>
-<td><p>00:00.013</p></td>
+<td><p>00:00.012</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/work_with_schedules/tensorize.html b/docs/how_to/work_with_schedules/tensorize.html
index 57a0c20b0..4ae758169 100644
--- a/docs/how_to/work_with_schedules/tensorize.html
+++ b/docs/how_to/work_with_schedules/tensorize.html
@@ -571,7 +571,7 @@ The importing needs to happen before the tensorized GEMV being executed.</p>
C: Buffer(C_2: Pointer(float32), float32, [524288], [])}
buffer_map = {A_1: A, B_1: B, C_1: C}
preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [1024, 64], []), B_1: B_3: Buffer(B_2, float32, [512, 64], []), C_1: C_3: Buffer(C_2, float32, [1024, 512], [])} {
- attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpa_jm11ei/input0.cc'\nsource_filename = \"/tmp/tmpa_jm11ei/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = allo [...]
+ attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpbs1xqx8t/input0.cc'\nsource_filename = \"/tmp/tmpbs1xqx8t/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = allo [...]
for (i, 0, 1024) {
for (j.outer: int32, 0, 32) {
@tir.call_extern("gemv_update", @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_2, ((i*512) + (j.outer*16)), 16, 2, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_2, (i*64), 64, 1, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_2, (j.outer*1024), 1024, 1, dtype=handle), 16, 64, 64, dtype=int32)
diff --git a/docs/reference/api/python/auto_scheduler.html b/docs/reference/api/python/auto_scheduler.html
index c1bb074da..2f779f220 100644
--- a/docs/reference/api/python/auto_scheduler.html
+++ b/docs/reference/api/python/auto_scheduler.html
@@ -1737,7 +1737,7 @@ Can be the a function or the function name.</p></li>
<dl class="py function">
<dt class="sig sig-object py" id="tvm.auto_scheduler.auto_schedule">
-<span class="sig-prename descclassname"><span class="pre">tvm.auto_scheduler.</span></span><span class="sig-name descname"><span class="pre">auto_schedule</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">task</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">search_policy</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em clas [...]
+<span class="sig-prename descclassname"><span class="pre">tvm.auto_scheduler.</span></span><span class="sig-name descname"><span class="pre">auto_schedule</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">task</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">search_policy</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em clas [...]
<dd><p>THIS API IS DEPRECATED.</p>
<p>Run auto scheduling search for a task.</p>
<dl class="field-list simple">
@@ -1774,7 +1774,7 @@ the initial naive schedule (state).</p>
<dl class="py class">
<dt class="sig sig-object py" id="tvm.auto_scheduler.SketchPolicy">
-<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">tvm.auto_scheduler.</span></span><span class="sig-name descname"><span class="pre">SketchPolicy</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">task</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">program_cost_model</span></span><span class="o"><span class="pre">=</span></span><span class="defau [...]
+<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">tvm.auto_scheduler.</span></span><span class="sig-name descname"><span class="pre">SketchPolicy</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">task</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">program_cost_model</span></span><span class="o"><span class="pre">=</span></span><span class="defau [...]
<dd><p>The search policy that searches in a hierarchical search space defined by sketches.
The policy randomly samples programs from the space defined by sketches and use evolutionary
search to fine-tune them.</p>
diff --git a/docs/reference/api/typedoc/classes/bytestreamreader.html b/docs/reference/api/typedoc/classes/bytestreamreader.html
index 17557ffc4..c5756b9bf 100644
--- a/docs/reference/api/typedoc/classes/bytestreamreader.html
+++ b/docs/reference/api/typedoc/classes/bytestreamreader.html
@@ -119,7 +119,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/rpc_server.ts#L43">rpc_server.ts:43</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/rpc_server.ts#L43">rpc_server.ts:43</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -141,7 +141,7 @@
<div class="tsd-signature tsd-kind-icon">bytes<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">Uint8Array</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/rpc_server.ts#L43">rpc_server.ts:43</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/rpc_server.ts#L43">rpc_server.ts:43</a></li>
</ul>
</aside>
</section>
@@ -151,7 +151,7 @@
<div class="tsd-signature tsd-kind-icon">offset<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">number</span><span class="tsd-signature-symbol"> = 0</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/rpc_server.ts#L42">rpc_server.ts:42</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/rpc_server.ts#L42">rpc_server.ts:42</a></li>
</ul>
</aside>
</section>
@@ -168,7 +168,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/rpc_server.ts#L63">rpc_server.ts:63</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/rpc_server.ts#L63">rpc_server.ts:63</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">Uint8Array</span></h4>
@@ -185,7 +185,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/rpc_server.ts#L49">rpc_server.ts:49</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/rpc_server.ts#L49">rpc_server.ts:49</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">number</span></h4>
@@ -202,7 +202,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/rpc_server.ts#L57">rpc_server.ts:57</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/rpc_server.ts#L57">rpc_server.ts:57</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">number</span></h4>
diff --git a/docs/reference/api/typedoc/classes/cachedcallstack.html b/docs/reference/api/typedoc/classes/cachedcallstack.html
index f0d6660dc..80e16e813 100644
--- a/docs/reference/api/typedoc/classes/cachedcallstack.html
+++ b/docs/reference/api/typedoc/classes/cachedcallstack.html
@@ -144,7 +144,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L223">memory.ts:223</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L223">memory.ts:223</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -172,7 +172,7 @@
<div class="tsd-signature tsd-kind-icon">temp<wbr>Args<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">Array</span><span class="tsd-signature-symbol"><</span><a href="../interfaces/disposable.html" class="tsd-signature-type">Disposable</a><span class="tsd-signature-symbol">></span><span class="tsd-signature-symbol"> = []</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L208">memory.ts:208</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L208">memory.ts:208</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -194,7 +194,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L312">memory.ts:312</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L312">memory.ts:312</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -226,7 +226,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L284">memory.ts:284</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L284">memory.ts:284</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -262,7 +262,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L388">memory.ts:388</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L388">memory.ts:388</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -300,7 +300,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L376">memory.ts:376</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L376">memory.ts:376</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -340,7 +340,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L267">memory.ts:267</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L267">memory.ts:267</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -373,7 +373,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L243">memory.ts:243</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L243">memory.ts:243</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">void</span></h4>
@@ -390,7 +390,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L321">memory.ts:321</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L321">memory.ts:321</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -422,7 +422,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L252">memory.ts:252</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L252">memory.ts:252</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -444,7 +444,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L359">memory.ts:359</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L359">memory.ts:359</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -470,7 +470,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L342">memory.ts:342</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L342">memory.ts:342</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -496,7 +496,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L350">memory.ts:350</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L350">memory.ts:350</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -522,7 +522,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L326">memory.ts:326</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L326">memory.ts:326</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -548,7 +548,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L363">memory.ts:363</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L363">memory.ts:363</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -574,7 +574,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L346">memory.ts:346</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L346">memory.ts:346</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -600,7 +600,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L334">memory.ts:334</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L334">memory.ts:334</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
diff --git a/docs/reference/api/typedoc/classes/dldatatype.html b/docs/reference/api/typedoc/classes/dldatatype.html
index 9b32b4196..358d9e3c6 100644
--- a/docs/reference/api/typedoc/classes/dldatatype.html
+++ b/docs/reference/api/typedoc/classes/dldatatype.html
@@ -119,7 +119,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L262">runtime.ts:262</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L262">runtime.ts:262</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -147,7 +147,7 @@
<div class="tsd-signature tsd-kind-icon">bits<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">number</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L260">runtime.ts:260</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L260">runtime.ts:260</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -162,7 +162,7 @@
<div class="tsd-signature tsd-kind-icon">code<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">number</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L258">runtime.ts:258</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L258">runtime.ts:258</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -177,7 +177,7 @@
<div class="tsd-signature tsd-kind-icon">lanes<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">number</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L262">runtime.ts:262</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L262">runtime.ts:262</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -199,7 +199,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L279">runtime.ts:279</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L279">runtime.ts:279</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">number</span></h4>
@@ -216,7 +216,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L270">runtime.ts:270</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L270">runtime.ts:270</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">string</span></h4>
diff --git a/docs/reference/api/typedoc/classes/dldevice.html b/docs/reference/api/typedoc/classes/dldevice.html
index 0047e0215..ea2c78bcd 100644
--- a/docs/reference/api/typedoc/classes/dldevice.html
+++ b/docs/reference/api/typedoc/classes/dldevice.html
@@ -118,7 +118,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L202">runtime.ts:202</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L202">runtime.ts:202</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -146,7 +146,7 @@
<div class="tsd-signature tsd-kind-icon">device<wbr>Id<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">number</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L200">runtime.ts:200</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L200">runtime.ts:200</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -161,7 +161,7 @@
<div class="tsd-signature tsd-kind-icon">device<wbr>Type<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">number</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L198">runtime.ts:198</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L198">runtime.ts:198</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -183,7 +183,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L223">runtime.ts:223</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L223">runtime.ts:223</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -205,7 +205,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L230">runtime.ts:230</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L230">runtime.ts:230</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">string</span></h4>
diff --git a/docs/reference/api/typedoc/classes/environment.html b/docs/reference/api/typedoc/classes/environment.html
index 48af489a1..6602a7e8b 100644
--- a/docs/reference/api/typedoc/classes/environment.html
+++ b/docs/reference/api/typedoc/classes/environment.html
@@ -125,7 +125,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/environment.ts#L86">environment.ts:86</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/environment.ts#L86">environment.ts:86</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -169,7 +169,7 @@
<aside class="tsd-sources">
<p>Implementation of <a href="../interfaces/libraryprovider.html">LibraryProvider</a>.<a href="../interfaces/libraryprovider.html#imports">imports</a></p>
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/environment.ts#L70">environment.ts:70</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/environment.ts#L70">environment.ts:70</a></li>
</ul>
</aside>
</section>
@@ -179,7 +179,7 @@
<div class="tsd-signature tsd-kind-icon">logger<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-symbol">(</span>msg<span class="tsd-signature-symbol">: </span><span class="tsd-signature-type">string</span><span class="tsd-signature-symbol">)</span><span class="tsd-signature-symbol"> => </span><span class="tsd-signature-type">void</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/environment.ts#L69">environment.ts:69</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/environment.ts#L69">environment.ts:69</a></li>
</ul>
</aside>
<div class="tsd-type-declaration">
@@ -210,7 +210,7 @@
<div class="tsd-signature tsd-kind-icon">packedCFunc<wbr>Table<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">Array</span><span class="tsd-signature-symbol"><</span><span class="tsd-signature-type">ctypes.FTVMWasmPackedCFunc</span><span class="tsd-signature-symbol"> | </span><span class="tsd-signature-type">undefined</span><span class="tsd-signature-symbol">></span><span class="tsd-signature-symbol"> = [undefined,]</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/environment.ts#L78">environment.ts:78</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/environment.ts#L78">environment.ts:78</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -228,7 +228,7 @@
<div class="tsd-signature tsd-kind-icon">packedCFunc<wbr>Table<wbr>Free<wbr>Id<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">Array</span><span class="tsd-signature-symbol"><</span><span class="tsd-signature-type">number</span><span class="tsd-signature-symbol">></span><span class="tsd-signature-symbol"> = []</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/environment.ts#L84">environment.ts:84</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/environment.ts#L84">environment.ts:84</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -250,7 +250,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/environment.ts#L105">environment.ts:105</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/environment.ts#L105">environment.ts:105</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
diff --git a/docs/reference/api/typedoc/classes/ffilibrary.html b/docs/reference/api/typedoc/classes/ffilibrary.html
index 877be18ef..fdf81936f 100644
--- a/docs/reference/api/typedoc/classes/ffilibrary.html
+++ b/docs/reference/api/typedoc/classes/ffilibrary.html
@@ -131,7 +131,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L49">runtime.ts:49</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L49">runtime.ts:49</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -156,7 +156,7 @@
<div class="tsd-signature tsd-kind-icon">exports<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">Record</span><span class="tsd-signature-symbol"><</span><span class="tsd-signature-type">string</span><span class="tsd-signature-symbol">, </span><span class="tsd-signature-type">Function</span><span class="tsd-signature-symbol">></span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L46">runtime.ts:46</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L46">runtime.ts:46</a></li>
</ul>
</aside>
</section>
@@ -166,7 +166,7 @@
<div class="tsd-signature tsd-kind-icon">memory<span class="tsd-signature-symbol">:</span> <a href="memory.html" class="tsd-signature-type">Memory</a></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L45">runtime.ts:45</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L45">runtime.ts:45</a></li>
</ul>
</aside>
</section>
@@ -176,7 +176,7 @@
<div class="tsd-signature tsd-kind-icon">wasm32<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">boolean</span></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L44">runtime.ts:44</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L44">runtime.ts:44</a></li>
</ul>
</aside>
</section>
@@ -186,7 +186,7 @@
<div class="tsd-signature tsd-kind-icon">webGPUContext<span class="tsd-signature-symbol">:</span> <a href="webgpucontext.html" class="tsd-signature-type">WebGPUContext</a></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L47">runtime.ts:47</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L47">runtime.ts:47</a></li>
</ul>
</aside>
</section>
@@ -203,7 +203,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L76">runtime.ts:76</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L76">runtime.ts:76</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -226,7 +226,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L66">runtime.ts:66</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L66">runtime.ts:66</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">void</span></h4>
@@ -243,7 +243,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L84">runtime.ts:84</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L84">runtime.ts:84</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <a href="cachedcallstack.html" class="tsd-signature-type">CachedCallStack</a></h4>
@@ -260,7 +260,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L95">runtime.ts:95</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L95">runtime.ts:95</a></li>
</ul>
</aside>
<h4 class="tsd-parameters-title">Parameters</h4>
@@ -283,7 +283,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L72">runtime.ts:72</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L72">runtime.ts:72</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">number</span></h4>
diff --git a/docs/reference/api/typedoc/classes/graphexecutor.html b/docs/reference/api/typedoc/classes/graphexecutor.html
index aae61b0c6..fbb7419e4 100644
--- a/docs/reference/api/typedoc/classes/graphexecutor.html
+++ b/docs/reference/api/typedoc/classes/graphexecutor.html
@@ -130,7 +130,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L583">runtime.ts:583</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L583">runtime.ts:583</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -162,7 +162,7 @@
<div class="tsd-signature tsd-kind-icon">module<span class="tsd-signature-symbol">:</span> <a href="module.html" class="tsd-signature-type">Module</a></div>
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L579">runtime.ts:579</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L579">runtime.ts:579</a></li>
</ul>
</aside>
</section>
@@ -179,7 +179,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L654">runtime.ts:654</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L654">runtime.ts:654</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -224,7 +224,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L597">runtime.ts:597</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L597">runtime.ts:597</a></li>
</ul>
</aside>
<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">void</span></h4>
@@ -241,7 +241,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L631">runtime.ts:631</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L631">runtime.ts:631</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -279,7 +279,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
<ul>
- <li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L644">runtime.ts:644</a></li>
+ <li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L644">runtime.ts:644</a></li>
</ul>
</aside>
<div class="tsd-comment tsd-typography">
@@ -310,7 +310,7 @@
<li class="tsd-description">
<aside class="tsd-sources">
... 2373 lines suppressed ...