You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by tq...@apache.org on 2022/06/22 21:28:37 UTC

[tvm-site] branch asf-site updated: deploying docs (apache/tvm@c334790bf88694db8d748d2299f50f2b04c46486)

This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/tvm-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
     new ad7a1837c deploying docs (apache/tvm@c334790bf88694db8d748d2299f50f2b04c46486)
ad7a1837c is described below

commit ad7a1837cc5a6272774c07c7647945d3a39329a0
Author: tvm-bot <95...@users.noreply.github.com>
AuthorDate: Wed Jun 22 21:28:32 2022 +0000

    deploying docs (apache/tvm@c334790bf88694db8d748d2299f50f2b04c46486)
---
 .../how_to/compile_models/from_mxnet.rst.txt       |    2 +-
 .../how_to/compile_models/from_oneflow.rst.txt     |    2 +-
 .../how_to/compile_models/from_paddle.rst.txt      |    2 +-
 .../how_to/compile_models/from_pytorch.rst.txt     |    2 +-
 .../how_to/compile_models/from_tensorflow.rst.txt  |    2 +-
 .../compile_models/sg_execution_times.rst.txt      |   22 +-
 .../deploy_models/deploy_model_on_android.rst.txt  |    2 +-
 .../deploy_object_detection_pytorch.rst.txt        |    4 +-
 .../deploy_models/deploy_prequantized.rst.txt      |    6 +-
 .../deploy_prequantized_tflite.rst.txt             |    4 +-
 .../how_to/deploy_models/deploy_quantized.rst.txt  |    2 +-
 .../deploy_models/deploy_ssd_gluoncv.rst.txt       |    4 +-
 .../deploy_models/sg_execution_times.rst.txt       |   16 +-
 .../extend_tvm/bring_your_own_datatypes.rst.txt    |    2 +-
 .../how_to/extend_tvm/sg_execution_times.rst.txt   |    8 +-
 .../how_to/extend_tvm/use_pass_instrument.rst.txt  |   16 +-
 .../optimize_operators/opt_conv_cuda.rst.txt       |    2 +-
 .../optimize_operators/opt_conv_tensorcore.rst.txt |    2 +-
 .../how_to/optimize_operators/opt_gemm.rst.txt     |   16 +-
 .../optimize_operators/sg_execution_times.rst.txt  |    8 +-
 .../sg_execution_times.rst.txt                     |   14 +-
 .../tune_conv2d_layer_cuda.rst.txt                 | 2790 +++++++++++++++++++-
 .../tune_network_cuda.rst.txt                      |    2 +-
 .../tune_network_x86.rst.txt                       |    4 +-
 .../tune_sparse_x86.rst.txt                        |  119 +-
 .../tune_with_autotvm/sg_execution_times.rst.txt   |    6 +-
 .../tune_with_autotvm/tune_conv2d_cuda.rst.txt     |   34 +-
 .../work_with_microtvm/micro_autotune.rst.txt      |   16 +-
 .../how_to/work_with_microtvm/micro_train.rst.txt  |   16 +-
 .../work_with_microtvm/sg_execution_times.rst.txt  |    8 +-
 .../work_with_relay/sg_execution_times.rst.txt     |    6 +-
 .../how_to/work_with_schedules/intrin_math.rst.txt |    2 +-
 .../work_with_schedules/sg_execution_times.rst.txt |   14 +-
 .../how_to/work_with_schedules/tensorize.rst.txt   |    2 +-
 .../tutorials/autotvm/sg_execution_times.rst.txt   |    4 +-
 .../frontend/deploy_classification.rst.txt         |    2 +-
 .../tutorials/frontend/deploy_detection.rst.txt    |    2 +-
 .../tutorials/frontend/sg_execution_times.rst.txt  |    6 +-
 .../tutorials/optimize/sg_execution_times.rst.txt  |    4 +-
 .../topic/vta/tutorials/sg_execution_times.rst.txt |    6 +-
 .../tutorial/auto_scheduler_matmul_x86.rst.txt     |   11 +-
 docs/_sources/tutorial/autotvm_matmul_x86.rst.txt  |   20 +-
 docs/_sources/tutorial/autotvm_relay_x86.rst.txt   |   54 +-
 .../tutorial/cross_compilation_and_rpc.rst.txt     |    2 +-
 docs/_sources/tutorial/intro_topi.rst.txt          |    2 +-
 docs/_sources/tutorial/sg_execution_times.rst.txt  |   22 +-
 .../tutorial/tensor_expr_get_started.rst.txt       |   44 +-
 docs/commit_hash                                   |    2 +-
 docs/how_to/compile_models/from_mxnet.html         |    2 +-
 docs/how_to/compile_models/from_oneflow.html       |  236 +-
 docs/how_to/compile_models/from_paddle.html        |    2 +-
 docs/how_to/compile_models/from_pytorch.html       |   24 +-
 docs/how_to/compile_models/from_tensorflow.html    |    2 +-
 docs/how_to/compile_models/sg_execution_times.html |   22 +-
 .../deploy_models/deploy_model_on_android.html     |    2 +-
 .../deploy_object_detection_pytorch.html           |   22 +-
 docs/how_to/deploy_models/deploy_prequantized.html |    7 +-
 .../deploy_models/deploy_prequantized_tflite.html  |    4 +-
 docs/how_to/deploy_models/deploy_quantized.html    |    2 +-
 docs/how_to/deploy_models/deploy_ssd_gluoncv.html  |   37 +-
 docs/how_to/deploy_models/sg_execution_times.html  |   16 +-
 .../extend_tvm/bring_your_own_datatypes.html       |    2 +-
 docs/how_to/extend_tvm/sg_execution_times.html     |    8 +-
 docs/how_to/extend_tvm/use_pass_instrument.html    |   16 +-
 docs/how_to/optimize_operators/opt_conv_cuda.html  |    2 +-
 .../optimize_operators/opt_conv_tensorcore.html    |    2 +-
 docs/how_to/optimize_operators/opt_gemm.html       |   16 +-
 .../optimize_operators/sg_execution_times.html     |    8 +-
 .../sg_execution_times.html                        |   14 +-
 .../tune_conv2d_layer_cuda.html                    | 2790 +++++++++++++++++++-
 .../tune_with_autoscheduler/tune_network_cuda.html |    2 +-
 .../tune_with_autoscheduler/tune_network_x86.html  |    4 +-
 .../tune_with_autoscheduler/tune_sparse_x86.html   |  119 +-
 .../tune_with_autotvm/sg_execution_times.html      |    6 +-
 .../how_to/tune_with_autotvm/tune_conv2d_cuda.html |   34 +-
 docs/how_to/work_with_microtvm/micro_autotune.html |   16 +-
 docs/how_to/work_with_microtvm/micro_train.html    |   16 +-
 .../work_with_microtvm/sg_execution_times.html     |    8 +-
 .../how_to/work_with_relay/sg_execution_times.html |    6 +-
 docs/how_to/work_with_schedules/intrin_math.html   |    2 +-
 .../work_with_schedules/sg_execution_times.html    |   14 +-
 docs/how_to/work_with_schedules/tensorize.html     |    2 +-
 docs/reference/api/python/auto_scheduler.html      |    4 +-
 .../api/typedoc/classes/bytestreamreader.html      |   12 +-
 .../api/typedoc/classes/cachedcallstack.html       |   34 +-
 docs/reference/api/typedoc/classes/dldatatype.html |   12 +-
 docs/reference/api/typedoc/classes/dldevice.html   |   10 +-
 .../reference/api/typedoc/classes/environment.html |   12 +-
 docs/reference/api/typedoc/classes/ffilibrary.html |   20 +-
 .../api/typedoc/classes/graphexecutor.html         |   16 +-
 docs/reference/api/typedoc/classes/instance.html   |   40 +-
 docs/reference/api/typedoc/classes/memory.html     |   34 +-
 docs/reference/api/typedoc/classes/module.html     |   10 +-
 docs/reference/api/typedoc/classes/ndarray.html    |   22 +-
 .../api/typedoc/classes/packedfunccell.html        |    6 +-
 docs/reference/api/typedoc/classes/rpcserver.html  |   14 +-
 docs/reference/api/typedoc/classes/scalar.html     |    6 +-
 .../api/typedoc/classes/webgpucontext.html         |   12 +-
 docs/reference/api/typedoc/enums/argtypecode.html  |   30 +-
 .../api/typedoc/enums/aynccallbackcode.html        |    4 +-
 .../api/typedoc/enums/dldatatypecode.html          |    8 +-
 .../api/typedoc/enums/rpcserverstate.html          |   12 +-
 docs/reference/api/typedoc/enums/sizeof.html       |   18 +-
 docs/reference/api/typedoc/index.html              |  112 +-
 .../api/typedoc/interfaces/disposable.html         |    2 +-
 .../api/typedoc/interfaces/functioninfo.html       |    6 +-
 .../api/typedoc/interfaces/libraryprovider.html    |    4 +-
 docs/searchindex.js                                |    2 +-
 .../vta/tutorials/autotvm/sg_execution_times.html  |    4 +-
 .../tutorials/frontend/deploy_classification.html  |    2 +-
 .../vta/tutorials/frontend/deploy_detection.html   |    2 +-
 .../vta/tutorials/frontend/sg_execution_times.html |    6 +-
 .../vta/tutorials/optimize/sg_execution_times.html |    4 +-
 docs/topic/vta/tutorials/sg_execution_times.html   |    6 +-
 docs/tutorial/auto_scheduler_matmul_x86.html       |    7 +-
 docs/tutorial/autotvm_matmul_x86.html              |   20 +-
 docs/tutorial/autotvm_relay_x86.html               |  258 +-
 docs/tutorial/cross_compilation_and_rpc.html       |    2 +-
 docs/tutorial/intro_topi.html                      |    2 +-
 docs/tutorial/sg_execution_times.html              |   26 +-
 docs/tutorial/tensor_expr_get_started.html         |   44 +-
 121 files changed, 6428 insertions(+), 1220 deletions(-)

diff --git a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
index f327c91ce..2ecd0219d 100644
--- a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
@@ -114,7 +114,7 @@ In this section, we download a pretrained imagenet model and classify an image.
 
  .. code-block:: none
 
-    Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip812ab175-3adc-45fc-a776-1bd65330f280 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+    Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip2cde5cc7-4677-4c09-92ea-a4a047eb10a2 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
     x (1, 3, 224, 224)
 
 
diff --git a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
index dc3846153..d09c0e5df 100644
--- a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
@@ -112,7 +112,7 @@ Load a pretrained OneFlow model and save model
  .. code-block:: none
 
     Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
-
      0%|          | 0.00/41.5M [00:00<?, ?B/s]
      0%|          | 16.0k/41.5M [00:00<07:57, 91.1kB/s]
      0%|          | 40.0k/41.5M [00:00<06:09, 117kB/s] 
      0%|          | 72.0k/41.5M [00:00<04:56, 147kB/s]
      0%|          | 96.0k/41.5M [00:00<05:05, 142kB/s]
      0%|          | 128k/41.5M [00:00<04:37, 156kB/s] 
      0%|          | 160k/41.5M [00:01<04:23, 165kB/s]
      0%|          | 192k/41.5M [00:01<04:14, 170kB/s]
      1%|          | 232k/41.5M [00:01<03:50, 188kB/s]
      1%|          | 264k/41.5M [00:01<03:52, 186kB/s]
      1%|          | 304k/41.5M [00:01<03:37, 198kB/s]
      1%|          | 352k/41.5M [00:01<03:15, 221kB/s]
      1%|          | 392k/41.5M [00:02<03:13, 222kB/s]
      1%|1         | 440k/41.5M [00:02<03:01, 237kB/s]
      1%|1         | 488k/41.5M [00:02<02:53, 248kB/s]
      1%|1         | 536k/41.5M [00:02<02:48, 255kB/s]
      1%|1         | 584k/41.5M [00:02<02:45, 260kB/s]
      2%|1         | 640k/41.5M [00:03<02:34, 277kB/s]
      
 2%|1         | 696k/41.5M [00:03<02:28, 289kB/s]
      2%|1         | 760k/41.5M [00:03<02:17, 311kB/s]
      2%|1         | 824k/41.5M [00:03<02:10, 326kB/s]
      2%|2         | 888k/41.5M [00:03<02:06, 337kB/s]
      2%|2         | 960k/41.5M [00:03<01:58, 358kB/s]
      2%|2         | 1.01M/41.5M [00:04<01:53, 373kB/s]
      3%|2         | 1.09M/41.5M [00:04<01:46, 397kB/s]
      3%|2         | 1.16M/41.5M [00:04<01:42, 413kB/s]
      3%|2         | 1.24M/41.5M [00:04<01:39, 425kB/s]
      3%|3         | 1.33M/41.5M [00:04<01:34, 447kB/s]
      3%|3         | 1.42M/41.5M [00:05<01:28, 476kB/s]
      4%|3         | 1.52M/41.5M [00:05<01:24, 496kB/s]
      4%|3         | 1.62M/41.5M [00:05<01:19, 524kB/s]
      4%|4         | 1.73M/41.5M [00:05<01:14, 557kB/s]
      4%|4         | 1.84M/41.5M [00:05<01:11, 580kB/s]
      5%|4         | 1.95M/41.5M [00:05<01:08, 610kB/s]
      5%|5         | 2.09M/41.5M [00:06<01:02, 658kB/s]
      5%|5         | 2.23M/41.5M [00:06<00:58, 705kB/s]
       6%|5         | 2.38M/41.5M [00:06<00:54, 751kB/s]
      6%|6         | 2.54M/41.5M [00:06<00:50, 811kB/s]
      7%|6         | 2.71M/41.5M [00:06<00:46, 866kB/s]
      7%|6         | 2.90M/41.5M [00:07<00:43, 932kB/s]
      7%|7         | 3.09M/41.5M [00:07<00:40, 992kB/s]
      8%|7         | 3.31M/41.5M [00:07<00:37, 1.07MB/s]
      9%|8         | 3.54M/41.5M [00:07<00:34, 1.15MB/s]
      9%|9         | 3.78M/41.5M [00:07<00:32, 1.22MB/s]
     10%|9         | 4.04M/41.5M [00:07<00:30, 1.30MB/s]
     10%|#         | 4.31M/41.5M [00:08<00:28, 1.39MB/s]
     11%|#1        | 4.61M/41.5M [00:08<00:25, 1.49MB/s]
     12%|#1        | 4.91M/41.5M [00:08<00:22, 1.74MB/s]
     13%|#2        | 5.24M/41.5M [00:08<00:19, 1.93MB/s]
     13%|#3        | 5.44M/41.5M [00:08<00:19, 1.95MB/s]
     14%|#3        | 5.63M/41.5M [00:08<00:22, 1.66MB/s]
     14%|#4        | 5.98M/41.5M [00:08<00:18, 1.99MB/s]
     15%|#5        | 6.37M/41.5M [00:09<00:16, 2.27MB/s]
     16%|#5        | 6.59M/41.5M 
 [00:09<00:16, 2.29MB/s]
     16%|#6        | 6.82M/41.5M [00:09<00:18, 1.93MB/s]
     17%|#7        | 7.23M/41.5M [00:09<00:15, 2.36MB/s]
     19%|#8        | 7.69M/41.5M [00:09<00:12, 2.91MB/s]
     19%|#9        | 7.99M/41.5M [00:09<00:12, 2.71MB/s]
     20%|#9        | 8.27M/41.5M [00:09<00:15, 2.32MB/s]
     21%|##1       | 8.72M/41.5M [00:10<00:12, 2.72MB/s]
     22%|##2       | 9.26M/41.5M [00:10<00:09, 3.40MB/s]
     23%|##3       | 9.62M/41.5M [00:10<00:10, 3.16MB/s]
     24%|##3       | 9.95M/41.5M [00:10<00:12, 2.71MB/s]
     25%|##5       | 10.5M/41.5M [00:10<00:11, 2.92MB/s]
     27%|##6       | 11.1M/41.5M [00:10<00:08, 3.61MB/s]
     28%|##8       | 11.8M/41.5M [00:10<00:07, 4.27MB/s]
     29%|##9       | 12.2M/41.5M [00:11<00:08, 3.82MB/s]
     30%|###       | 12.6M/41.5M [00:11<00:09, 3.28MB/s]
     32%|###1      | 13.2M/41.5M [00:11<00:08, 3.58MB/s]
     34%|###3      | 14.0M/41.5M [00:11<00:06, 4.40MB/s]
     36%|###5      | 14.8M/41.5M [00:11<00:05, 5.20MB/s]
    
  37%|###7      | 15.4M/41.5M [00:11<00:05, 4.65MB/s]
     38%|###8      | 15.8M/41.5M [00:11<00:06, 4.01MB/s]
     40%|###9      | 16.6M/41.5M [00:12<00:05, 4.86MB/s]
     42%|####2     | 17.5M/41.5M [00:12<00:04, 5.71MB/s]
     44%|####3     | 18.1M/41.5M [00:12<00:04, 5.20MB/s]
     45%|####4     | 18.6M/41.5M [00:12<00:05, 4.49MB/s]
     47%|####7     | 19.5M/41.5M [00:12<00:04, 5.68MB/s]
     50%|####9     | 20.6M/41.5M [00:12<00:03, 6.64MB/s]
     51%|#####1    | 21.2M/41.5M [00:12<00:03, 6.02MB/s]
     53%|#####2    | 21.9M/41.5M [00:13<00:03, 5.19MB/s]
     55%|#####5    | 23.0M/41.5M [00:13<00:02, 6.58MB/s]
     58%|#####8    | 24.1M/41.5M [00:13<00:02, 7.67MB/s]
     60%|######    | 24.9M/41.5M [00:13<00:02, 6.96MB/s]
     62%|######1   | 25.6M/41.5M [00:13<00:02, 6.02MB/s]
     65%|######4   | 26.9M/41.5M [00:13<00:02, 7.55MB/s]
     68%|######8   | 28.2M/41.5M [00:13<00:01, 8.86MB/s]
     70%|#######   | 29.2M/41.5M [00:13<00:01, 8.01MB/s]
     72%|#######2  | 30.0M/41.5M
  [00:14<00:01, 6.94MB/s]
     75%|#######5  | 31.3M/41.5M [00:14<00:01, 8.35MB/s]
     79%|#######8  | 32.6M/41.5M [00:14<00:00, 9.59MB/s]
     81%|########1 | 33.6M/41.5M [00:14<00:00, 8.55MB/s]
     83%|########3 | 34.5M/41.5M [00:14<00:00, 7.40MB/s]
     86%|########6 | 35.7M/41.5M [00:14<00:00, 8.45MB/s]
     89%|########9 | 37.1M/41.5M [00:14<00:00, 9.58MB/s]
     92%|#########1| 38.0M/41.5M [00:15<00:00, 8.60MB/s]
     94%|#########3| 38.9M/41.5M [00:15<00:00, 7.43MB/s]
     97%|#########6| 40.1M/41.5M [00:15<00:00, 8.47MB/s]
    100%|#########9| 41.5M/41.5M [00:15<00:00, 9.60MB/s]
    100%|##########| 41.5M/41.5M [00:15<00:00, 2.82MB/s]
+
      0%|          | 0.00/41.5M [00:00<?, ?B/s]
      0%|          | 16.0k/41.5M [00:00<08:17, 87.5kB/s]
      0%|          | 32.0k/41.5M [00:00<08:18, 87.2kB/s]
      0%|          | 48.0k/41.5M [00:00<08:18, 87.1kB/s]
      0%|          | 64.0k/41.5M [00:00<08:18, 87.1kB/s]
      0%|          | 80.0k/41.5M [00:00<08:19, 87.0kB/s]
      0%|          | 96.0k/41.5M [00:01<08:18, 87.0kB/s]
      0%|          | 112k/41.5M [00:01<08:18, 87.0kB/s] 
      0%|          | 128k/41.5M [00:01<08:18, 87.0kB/s]
      0%|          | 144k/41.5M [00:01<08:18, 87.0kB/s]
      0%|          | 168k/41.5M [00:01<07:11, 100kB/s] 
      0%|          | 184k/41.5M [00:02<07:29, 96.3kB/s]
      0%|          | 208k/41.5M [00:02<06:45, 107kB/s] 
      1%|          | 232k/41.5M [00:02<06:19, 114kB/s]
      1%|          | 256k/41.5M [00:02<06:03, 119kB/s]
      1%|          | 280k/41.5M [00:02<05:53, 122kB/s]
      1%|          | 304k/41.5M [00:03<05:45, 125kB/s]
      1%|          | 336k/41.5M [00:03<05:09, 140
 kB/s]
      1%|          | 368k/41.5M [00:03<04:47, 150kB/s]
      1%|          | 400k/41.5M [00:03<04:34, 157kB/s]
      1%|1         | 440k/41.5M [00:03<04:05, 175kB/s]
      1%|1         | 480k/41.5M [00:03<03:48, 188kB/s]
      1%|1         | 528k/41.5M [00:04<03:24, 210kB/s]
      1%|1         | 584k/41.5M [00:04<03:00, 238kB/s]
      2%|1         | 640k/41.5M [00:04<02:45, 258kB/s]
      2%|1         | 696k/41.5M [00:04<02:37, 272kB/s]
      2%|1         | 768k/41.5M [00:04<02:18, 308kB/s]
      2%|1         | 848k/41.5M [00:05<02:03, 346kB/s]
      2%|2         | 928k/41.5M [00:05<01:54, 373kB/s]
      2%|2         | 1.00M/41.5M [00:05<01:41, 418kB/s]
      3%|2         | 1.10M/41.5M [00:05<01:31, 461kB/s]
      3%|2         | 1.15M/41.5M [00:06<02:48, 250kB/s]
      3%|3         | 1.43M/41.5M [00:06<01:19, 529kB/s]
      4%|3         | 1.52M/41.5M [00:06<01:21, 516kB/s]
      4%|3         | 1.60M/41.5M [00:06<01:22, 507kB/s]
      4%|4         | 1.69M/41.5M [00:06<01:23, 500
 kB/s]
      4%|4         | 1.79M/41.5M [00:07<01:20, 517kB/s]
      5%|4         | 1.88M/41.5M [00:07<01:19, 519kB/s]
      5%|4         | 1.98M/41.5M [00:07<01:17, 533kB/s]
      5%|5         | 2.09M/41.5M [00:07<01:16, 542kB/s]
      5%|5         | 2.20M/41.5M [00:07<01:13, 562kB/s]
      6%|5         | 2.30M/41.5M [00:08<01:13, 563kB/s]
      6%|5         | 2.41M/41.5M [00:08<01:11, 577kB/s]
      6%|6         | 2.52M/41.5M [00:08<01:09, 586kB/s]
      6%|6         | 2.63M/41.5M [00:08<01:07, 606kB/s]
      7%|6         | 2.74M/41.5M [00:08<01:06, 607kB/s]
      7%|6         | 2.85M/41.5M [00:09<01:06, 608kB/s]
      7%|7         | 2.97M/41.5M [00:09<01:05, 621kB/s]
      7%|7         | 3.09M/41.5M [00:09<01:03, 631kB/s]
      8%|7         | 3.20M/41.5M [00:09<01:04, 624kB/s]
      8%|7         | 3.31M/41.5M [00:09<01:03, 633kB/s]
      8%|8         | 3.43M/41.5M [00:09<01:02, 639kB/s]
      9%|8         | 3.54M/41.5M [00:10<01:03, 630kB/s]
      9%|8         | 3.66M/41.5M [00:10
 <01:02, 637kB/s]
      9%|9         | 3.77M/41.5M [00:10<01:02, 628kB/s]
      9%|9         | 3.88M/41.5M [00:10<01:02, 636kB/s]
     10%|9         | 3.99M/41.5M [00:10<01:02, 628kB/s]
     10%|9         | 4.11M/41.5M [00:11<01:01, 635kB/s]
     10%|#         | 4.23M/41.5M [00:11<01:01, 641kB/s]
     10%|#         | 4.34M/41.5M [00:11<01:01, 631kB/s]
     11%|#         | 4.45M/41.5M [00:11<01:00, 638kB/s]
     11%|#1        | 4.57M/41.5M [00:11<01:00, 642kB/s]
     11%|#1        | 4.69M/41.5M [00:12<00:59, 645kB/s]
     12%|#1        | 4.80M/41.5M [00:12<00:59, 648kB/s]
     12%|#1        | 4.92M/41.5M [00:12<00:59, 649kB/s]
     12%|#2        | 5.04M/41.5M [00:12<00:58, 650kB/s]
     12%|#2        | 5.16M/41.5M [00:12<00:58, 651kB/s]
     13%|#2        | 5.28M/41.5M [00:12<00:57, 665kB/s]
     13%|#3        | 5.41M/41.5M [00:13<00:56, 674kB/s]
     13%|#3        | 5.53M/41.5M [00:13<00:55, 681kB/s]
     14%|#3        | 5.66M/41.5M [00:13<00:53, 698kB/s]
     14%|#3        | 5.80M/4
 1.5M [00:13<00:52, 711kB/s]
     14%|#4        | 5.94M/41.5M [00:13<00:50, 732kB/s]
     15%|#4        | 6.08M/41.5M [00:14<00:49, 748kB/s]
     15%|#4        | 6.22M/41.5M [00:14<00:48, 758kB/s]
     15%|#5        | 6.37M/41.5M [00:14<00:47, 779kB/s]
     16%|#5        | 6.52M/41.5M [00:14<00:45, 806kB/s]
     16%|#6        | 6.69M/41.5M [00:14<00:43, 839kB/s]
     17%|#6        | 6.85M/41.5M [00:15<00:42, 861kB/s]
     17%|#6        | 7.03M/41.5M [00:15<00:40, 903kB/s]
     17%|#7        | 7.21M/41.5M [00:15<00:38, 932kB/s]
     18%|#7        | 7.41M/41.5M [00:15<00:36, 979kB/s]
     18%|#8        | 7.60M/41.5M [00:15<00:35, 1.01MB/s]
     19%|#8        | 7.81M/41.5M [00:16<00:33, 1.06MB/s]
     19%|#9        | 8.02M/41.5M [00:16<00:32, 1.10MB/s]
     20%|#9        | 8.26M/41.5M [00:16<00:30, 1.16MB/s]
     20%|##        | 8.49M/41.5M [00:16<00:28, 1.20MB/s]
     21%|##1       | 8.74M/41.5M [00:16<00:27, 1.26MB/s]
     22%|##1       | 9.01M/41.5M [00:16<00:25, 1.33MB/s]
     22%|#
 #2       | 9.28M/41.5M [00:17<00:24, 1.38MB/s]
     23%|##3       | 9.57M/41.5M [00:17<00:23, 1.45MB/s]
     24%|##3       | 9.87M/41.5M [00:17<00:21, 1.51MB/s]
     25%|##4       | 10.2M/41.5M [00:17<00:20, 1.58MB/s]
     25%|##5       | 10.5M/41.5M [00:17<00:19, 1.67MB/s]
     26%|##6       | 10.9M/41.5M [00:18<00:18, 1.77MB/s]
     27%|##7       | 11.2M/41.5M [00:18<00:17, 1.85MB/s]
     28%|##8       | 11.6M/41.5M [00:18<00:16, 1.95MB/s]
     29%|##9       | 12.0M/41.5M [00:18<00:15, 2.06MB/s]
     30%|###       | 12.5M/41.5M [00:18<00:14, 2.17MB/s]
     31%|###1      | 12.9M/41.5M [00:19<00:13, 2.28MB/s]
     32%|###2      | 13.4M/41.5M [00:19<00:12, 2.39MB/s]
     34%|###3      | 13.9M/41.5M [00:19<00:11, 2.51MB/s]
     35%|###4      | 14.4M/41.5M [00:19<00:10, 2.62MB/s]
     36%|###6      | 15.0M/41.5M [00:19<00:10, 2.76MB/s]
     38%|###7      | 15.6M/41.5M [00:19<00:09, 2.90MB/s]
     39%|###8      | 16.2M/41.5M [00:20<00:08, 3.06MB/s]
     41%|####      | 16.8M/41.5M [00:2
 0<00:08, 3.21MB/s]
     42%|####2     | 17.5M/41.5M [00:20<00:07, 3.37MB/s]
     44%|####3     | 18.2M/41.5M [00:20<00:06, 3.53MB/s]
     46%|####5     | 18.9M/41.5M [00:20<00:06, 3.70MB/s]
     47%|####7     | 19.7M/41.5M [00:21<00:05, 3.87MB/s]
     49%|####9     | 20.5M/41.5M [00:21<00:04, 4.42MB/s]
     51%|#####1    | 21.2M/41.5M [00:21<00:04, 5.06MB/s]
     52%|#####2    | 21.7M/41.5M [00:21<00:04, 4.66MB/s]
     54%|#####3    | 22.2M/41.5M [00:21<00:05, 3.98MB/s]
     56%|#####5    | 23.0M/41.5M [00:21<00:04, 4.70MB/s]
     58%|#####7    | 23.9M/41.5M [00:21<00:03, 5.57MB/s]
     59%|#####8    | 24.5M/41.5M [00:22<00:03, 5.12MB/s]
     60%|######    | 25.0M/41.5M [00:22<00:03, 4.91MB/s]
     62%|######2   | 25.9M/41.5M [00:22<00:02, 6.01MB/s]
     64%|######3   | 26.5M/41.5M [00:22<00:02, 5.46MB/s]
     65%|######5   | 27.1M/41.5M [00:22<00:02, 5.30MB/s]
     68%|######7   | 28.1M/41.5M [00:22<00:02, 6.58MB/s]
     69%|######9   | 28.8M/41.5M [00:22<00:02, 5.94MB/s]
     71%|
 #######   | 29.4M/41.5M [00:22<00:02, 5.76MB/s]
     74%|#######3  | 30.5M/41.5M [00:23<00:01, 7.16MB/s]
     75%|#######5  | 31.3M/41.5M [00:23<00:01, 6.45MB/s]
     77%|#######7  | 32.0M/41.5M [00:23<00:01, 6.23MB/s]
     80%|#######9  | 33.1M/41.5M [00:23<00:01, 7.76MB/s]
     82%|########1 | 33.9M/41.5M [00:23<00:01, 6.96MB/s]
     84%|########3 | 34.7M/41.5M [00:23<00:01, 6.73MB/s]
     87%|########6 | 35.9M/41.5M [00:23<00:00, 8.30MB/s]
     89%|########8 | 36.8M/41.5M [00:23<00:00, 7.44MB/s]
     90%|######### | 37.5M/41.5M [00:24<00:00, 7.20MB/s]
     94%|#########3| 38.9M/41.5M [00:24<00:00, 8.84MB/s]
     96%|#########5| 39.8M/41.5M [00:24<00:00, 7.91MB/s]
     98%|#########7| 40.6M/41.5M [00:24<00:00, 6.69MB/s]
    100%|##########| 41.5M/41.5M [00:24<00:00, 1.78MB/s]
 
 
 
diff --git a/docs/_sources/how_to/compile_models/from_paddle.rst.txt b/docs/_sources/how_to/compile_models/from_paddle.rst.txt
index b8d915831..c3780c103 100644
--- a/docs/_sources/how_to/compile_models/from_paddle.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_paddle.rst.txt
@@ -235,7 +235,7 @@ Look up prediction top 1 index in 1000 class synset.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  7.006 seconds)
+   **Total running time of the script:** ( 1 minutes  7.747 seconds)
 
 
 .. _sphx_glr_download_how_to_compile_models_from_paddle.py:
diff --git a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
index 2e3b7c9d8..23edca199 100644
--- a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
@@ -93,7 +93,7 @@ Load a pretrained PyTorch model
  .. code-block:: none
 
     Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
-
      0%|          | 0.00/44.7M [00:00<?, ?B/s]
      6%|5         | 2.66M/44.7M [00:00<00:01, 27.8MB/s]
     12%|#2        | 5.52M/44.7M [00:00<00:01, 28.9MB/s]
     23%|##2       | 10.3M/44.7M [00:00<00:00, 38.3MB/s]
     35%|###4      | 15.5M/44.7M [00:00<00:00, 44.5MB/s]
     45%|####5     | 20.2M/44.7M [00:00<00:00, 46.4MB/s]
     58%|#####7    | 25.8M/44.7M [00:00<00:00, 49.5MB/s]
     70%|######9   | 31.1M/44.7M [00:00<00:00, 51.7MB/s]
     82%|########1 | 36.5M/44.7M [00:00<00:00, 53.2MB/s]
     94%|#########3| 41.9M/44.7M [00:00<00:00, 54.1MB/s]
    100%|##########| 44.7M/44.7M [00:00<00:00, 49.1MB/s]
+
      0%|          | 0.00/44.7M [00:00<?, ?B/s]
      7%|7         | 3.23M/44.7M [00:00<00:01, 33.9MB/s]
     14%|#4        | 6.47M/44.7M [00:00<00:01, 33.3MB/s]
     22%|##1       | 9.75M/44.7M [00:00<00:01, 33.7MB/s]
     29%|##9       | 13.0M/44.7M [00:00<00:01, 31.9MB/s]
     38%|###8      | 17.1M/44.7M [00:00<00:00, 35.3MB/s]
     46%|####5     | 20.5M/44.7M [00:00<00:00, 34.8MB/s]
     53%|#####3    | 23.8M/44.7M [00:00<00:01, 21.6MB/s]
     59%|#####9    | 26.5M/44.7M [00:01<00:00, 21.8MB/s]
     67%|######6   | 29.8M/44.7M [00:01<00:00, 24.9MB/s]
     75%|#######4  | 33.3M/44.7M [00:01<00:00, 27.8MB/s]
     83%|########2 | 37.0M/44.7M [00:01<00:00, 29.9MB/s]
     90%|########9 | 40.1M/44.7M [00:01<00:00, 28.3MB/s]
     96%|#########6| 43.0M/44.7M [00:01<00:00, 28.8MB/s]
    100%|##########| 44.7M/44.7M [00:01<00:00, 28.6MB/s]
 
 
 
diff --git a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
index 53f770229..90458dd40 100644
--- a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
@@ -422,7 +422,7 @@ Run the corresponding model on tensorflow
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  4.119 seconds)
+   **Total running time of the script:** ( 1 minutes  1.170 seconds)
 
 
 .. _sphx_glr_download_how_to_compile_models_from_tensorflow.py:
diff --git a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
index 62d09c4c6..397dde91d 100644
--- a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
@@ -5,26 +5,26 @@
 
 Computation times
 =================
-**05:47.852** total execution time for **how_to_compile_models** files:
+**05:55.609** total execution time for **how_to_compile_models** files:
 
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``)         | 01:07.006 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``)         | 01:07.747 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:04.119 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:01.170 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``)       | 00:56.601 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``)       | 00:57.992 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``)       | 00:40.653 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``)       | 00:50.009 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``)         | 00:38.824 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``)         | 00:36.725 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``)           | 00:22.452 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``)           | 00:22.695 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``)         | 00:21.643 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``)         | 00:21.516 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``)       | 00:19.948 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``)       | 00:20.885 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``)           | 00:14.254 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``)           | 00:14.522 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``)             | 00:02.351 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``)             | 00:02.349 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
index e2de49339..5f20da138 100644
--- a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
@@ -440,7 +440,7 @@ Execute on TVM
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      16.0557      15.8693      16.5941      15.7658       0.3119   
+      15.9465      15.9620      16.0734      15.7781       0.0909   
                
 
 
diff --git a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
index d47ec9c7b..0e908041c 100644
--- a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
@@ -122,7 +122,7 @@ Load pre-trained maskrcnn from torchvision and do tracing
  .. code-block:: none
 
     Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
-
      0%|          | 0.00/170M [00:00<?, ?B/s]
      6%|5         | 9.44M/170M [00:00<00:01, 98.6MB/s]
     15%|#5        | 25.9M/170M [00:00<00:01, 142MB/s] 
     25%|##5       | 42.5M/170M [00:00<00:00, 156MB/s]
     35%|###4      | 58.8M/170M [00:00<00:00, 162MB/s]
     44%|####4     | 75.3M/170M [00:00<00:00, 166MB/s]
     54%|#####4    | 91.9M/170M [00:00<00:00, 169MB/s]
     64%|######3   | 108M/170M [00:00<00:00, 170MB/s] 
     74%|#######3  | 125M/170M [00:00<00:00, 171MB/s]
     83%|########3 | 141M/170M [00:00<00:00, 171MB/s]
     93%|#########2| 158M/170M [00:01<00:00, 172MB/s]
    100%|##########| 170M/170M [00:01<00:00, 166MB/s]
+
      0%|          | 0.00/170M [00:00<?, ?B/s]
      8%|8         | 14.2M/170M [00:00<00:01, 148MB/s]
     21%|##        | 35.6M/170M [00:00<00:00, 193MB/s]
     34%|###3      | 56.9M/170M [00:00<00:00, 207MB/s]
     46%|####6     | 78.3M/170M [00:00<00:00, 213MB/s]
     59%|#####8    | 99.7M/170M [00:00<00:00, 217MB/s]
     71%|#######1  | 121M/170M [00:00<00:00, 218MB/s] 
     84%|########3 | 142M/170M [00:00<00:00, 219MB/s]
     96%|#########6| 163M/170M [00:00<00:00, 221MB/s]
    100%|##########| 170M/170M [00:00<00:00, 214MB/s]
     /usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:3878: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
       for i in range(dim)
     /usr/local/lib/python3.7/dist-packages/torchvision/models/detection/anchor_utils.py:127: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
@@ -291,7 +291,7 @@ Get boxes with score larger than 0.9
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 2 minutes  51.510 seconds)
+   **Total running time of the script:** ( 2 minutes  55.836 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_object_detection_pytorch.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
index f994d18a5..f5006051c 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
@@ -219,7 +219,7 @@ training. Other models require a full post training calibration.
  .. code-block:: none
 
     Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
-
      0%|          | 0.00/13.6M [00:00<?, ?B/s]
    100%|##########| 13.6M/13.6M [00:00<00:00, 153MB/s]
+
      0%|          | 0.00/13.6M [00:00<?, ?B/s]
     68%|######7   | 9.20M/13.6M [00:00<00:00, 95.6MB/s]
    100%|##########| 13.6M/13.6M [00:00<00:00, 110MB/s] 
 
 
 
@@ -399,7 +399,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      90.5119      90.4735      93.1524      90.1430       0.3397   
+      90.3645      90.2495      96.8307      90.1843       0.6696   
                
 
 
@@ -448,7 +448,7 @@ TODO
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  5.729 seconds)
+   **Total running time of the script:** ( 1 minutes  7.076 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_prequantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
index bf790ef4f..c615a7b62 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
@@ -426,7 +426,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      118.8315     118.7852     125.2959     117.9721      0.7412   
+      119.8498     119.8288     121.2662     119.0162      0.3249   
                
 
 
@@ -463,7 +463,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 2 minutes  9.625 seconds)
+   **Total running time of the script:** ( 2 minutes  3.701 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_prequantized_tflite.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
index e05d5fd7d..79d6eb426 100644
--- a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
@@ -254,7 +254,7 @@ We create a Relay VM to build and execute the model.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  14.689 seconds)
+   **Total running time of the script:** ( 1 minutes  32.299 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_quantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
index 1687fdf29..de82f8975 100644
--- a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
@@ -157,7 +157,7 @@ Convert and compile model for CPU.
             data: None
       input_sym_arg_type = in_param.infer_type()[0]
     Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
-
      0%|          | 0/132723 [00:00<?, ?KB/s]
      2%|1         | 2047/132723 [00:00<00:06, 20406.67KB/s]
      5%|4         | 6578/132723 [00:00<00:03, 35035.25KB/s]
     11%|#         | 14180/132723 [00:00<00:02, 53734.70KB/s]
     17%|#6        | 22154/132723 [00:00<00:01, 63994.50KB/s]
     23%|##2       | 30073/132723 [00:00<00:01, 69470.33KB/s]
     29%|##8       | 38102/132723 [00:00<00:01, 73147.09KB/s]
     35%|###4      | 46097/132723 [00:00<00:01, 75365.68KB/s]
     41%|####      | 54077/132723 [00:00<00:01, 76774.94KB/s]
     47%|####6     | 62103/132723 [00:00<00:00, 77862.69KB/s]
     53%|#####2    | 70157/132723 [00:01<00:00, 78685.87KB/s]
     59%|#####8    | 78228/132723 [00:01<00:00, 79301.38KB/s]
     65%|######5   | 86317/132723 [00:01<00:00, 79781.27KB/s]
     71%|#######1  | 94447/132723 [00:01<00:00, 80239.53KB/s]
     77%|#######7  | 102596/132723 [00:01<00:00, 80616.06KB/s]
     83%|########3 | 110731/132723 [00:01<00:00, 80836.08KB/s]
     90%|########9 | 118922/132723 [00:01<00:00, 81158.05KB/s]
     96%|#########5| 127088/132723 [00:01<00:00, 81306.03KB/s]
    100%|##########| 132723/132723 [00:01<00:00, 74969.03KB/s]
+
      0%|          | 0/132723 [00:00<?, ?KB/s]
      2%|2         | 3124/132723 [00:00<00:04, 31238.61KB/s]
      8%|7         | 10039/132723 [00:00<00:02, 53537.03KB/s]
     14%|#3        | 18569/132723 [00:00<00:01, 68032.11KB/s]
     21%|##        | 27284/132723 [00:00<00:01, 75576.21KB/s]
     27%|##7       | 36034/132723 [00:00<00:01, 79872.26KB/s]
     34%|###3      | 44669/132723 [00:00<00:01, 82069.42KB/s]
     40%|####      | 53394/132723 [00:00<00:00, 83759.28KB/s]
     47%|####6     | 62119/132723 [00:00<00:00, 84868.29KB/s]
     53%|#####3    | 70833/132723 [00:00<00:00, 85572.69KB/s]
     60%|#####9    | 79608/132723 [00:01<00:00, 86241.46KB/s]
     67%|######6   | 88369/132723 [00:01<00:00, 86655.55KB/s]
     73%|#######3  | 97076/132723 [00:01<00:00, 86778.49KB/s]
     80%|#######9  | 105828/132723 [00:01<00:00, 87001.18KB/s]
     86%|########6 | 114574/132723 [00:01<00:00, 87137.46KB/s]
     93%|#########2| 123289/132723 [00:01<00:00, 87138.26KB/s]
     99%|#########9| 132003/132723 [00:01<00:00, 87121.86KB/s]
    100%|##########| 132723/132723 [00:01<00:00, 82401.45KB/s]
 
 
 
@@ -240,7 +240,7 @@ Display result
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 2 minutes  15.415 seconds)
+   **Total running time of the script:** ( 2 minutes  17.409 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_ssd_gluoncv.py:
diff --git a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
index 4b4d5d094..92132cbf8 100644
--- a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
@@ -5,22 +5,22 @@
 
 Computation times
 =================
-**10:27.491** total execution time for **how_to_deploy_models** files:
+**10:47.671** total execution time for **how_to_deploy_models** files:
 
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 02:51.510 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 02:55.836 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``)                           | 02:15.415 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``)                           | 02:17.409 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``)           | 02:09.625 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``)           | 02:03.701 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``)                               | 01:14.689 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``)                               | 01:32.299 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``)                         | 01:05.729 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``)                         | 01:07.076 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``)                 | 00:28.241 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``)                 | 00:28.877 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``)                       | 00:22.277 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``)                       | 00:22.467 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_deploy_models_deploy_sparse.py` (``deploy_sparse.py``)                                     | 00:00.006 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
index d89fd6416..a7e17f704 100644
--- a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
@@ -463,7 +463,7 @@ First let us define two helper functions to get the mobilenet model and a cat im
 
  .. code-block:: none
 
-    Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip28234e9f-61c7-4f97-854a-564afd9983fc from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+    Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip8ca90524-9141-4870-9ca3-1f55665bdfc5 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
 
 
 
diff --git a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
index c43f5ebc4..d3fe3e141 100644
--- a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
 
 Computation times
 =================
-**00:41.355** total execution time for **how_to_extend_tvm** files:
+**00:40.008** total execution time for **how_to_extend_tvm** files:
 
 +-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:38.244 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:36.865 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``)           | 00:02.187 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``)           | 00:02.214 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``)                     | 00:00.917 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``)                     | 00:00.923 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``)       | 00:00.006 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
index 46b69bc15..c80aaa6c6 100644
--- a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
@@ -215,10 +215,10 @@ profile the execution time of each passes.
  .. code-block:: none
 
     Printing results of timing profile...
-    InferType: 6848us [6848us] (45.94%; 45.94%)
-    FoldScaleAxis: 8058us [6us] (54.06%; 54.06%)
-            FoldConstant: 8051us [1612us] (54.02%; 99.92%)
-                    InferType: 6439us [6439us] (43.20%; 79.98%)
+    InferType: 6849us [6849us] (46.58%; 46.58%)
+    FoldScaleAxis: 7855us [6us] (53.42%; 53.42%)
+            FoldConstant: 7849us [1571us] (53.38%; 99.92%)
+                    InferType: 6278us [6278us] (42.70%; 79.99%)
 
 
 
@@ -257,10 +257,10 @@ Refer to following sections and :py:func:`tvm.instrument.pass_instrument` for th
  .. code-block:: none
 
     Printing results of timing profile...
-    InferType: 6432us [6432us] (44.63%; 44.63%)
-    FoldScaleAxis: 7981us [6us] (55.37%; 55.37%)
-            FoldConstant: 7975us [1667us] (55.33%; 99.92%)
-                    InferType: 6308us [6308us] (43.77%; 79.10%)
+    InferType: 6312us [6312us] (44.55%; 44.55%)
+    FoldScaleAxis: 7857us [5us] (55.45%; 55.45%)
+            FoldConstant: 7852us [1593us] (55.41%; 99.93%)
+                    InferType: 6259us [6259us] (44.17%; 79.72%)
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
index 03703cbb6..299baf95f 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
@@ -327,7 +327,7 @@ latency of convolution.
 
  .. code-block:: none
 
-    Convolution: 54.167125 ms
+    Convolution: 43.185978 ms
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
index 373942f4c..655342216 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
@@ -658,7 +658,7 @@ be able to run on our build server
 
  .. code-block:: none
 
-    conv2d with tensor core: 6.873242 ms
+    conv2d with tensor core: 11.878459 ms
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
index 91eb779e9..75f5f141c 100644
--- a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
@@ -130,8 +130,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
 
  .. code-block:: none
 
-    Numpy running time: 0.018172
-    Baseline: 3.395604
+    Numpy running time: 0.018950
+    Baseline: 3.338291
 
 
 
@@ -226,7 +226,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
 
  .. code-block:: none
 
-    Opt1: 0.300426
+    Opt1: 0.308090
 
 
 
@@ -329,7 +329,7 @@ In this tutorial, we chose to vectorize the inner loop row data since it is cach
 
  .. code-block:: none
 
-    Opt2: 0.332632
+    Opt2: 0.331753
 
 
 
@@ -425,7 +425,7 @@ the access pattern for A matrix is more cache friendly.
 
  .. code-block:: none
 
-    Opt3: 0.117883
+    Opt3: 0.121683
 
 
 
@@ -550,7 +550,7 @@ flattening.
 
  .. code-block:: none
 
-    Opt4: 0.110434
+    Opt4: 0.111281
 
 
 
@@ -672,7 +672,7 @@ write to C when all the block results are ready.
 
  .. code-block:: none
 
-    Opt5: 0.111427
+    Opt5: 0.111284
 
 
 
@@ -797,7 +797,7 @@ Futhermore, we can also utilize multi-core processors to do the thread-level par
 
  .. code-block:: none
 
-    Opt6: 0.145331
+    Opt6: 0.145534
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
index 6800c8e63..6f039c30b 100644
--- a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
 
 Computation times
 =================
-**00:34.322** total execution time for **how_to_optimize_operators** files:
+**00:34.419** total execution time for **how_to_optimize_operators** files:
 
 +-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``)                       | 00:32.069 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``)                       | 00:32.140 | 0.0 MB |
 +-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.234 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.283 | 0.0 MB |
 +-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``)             | 00:01.020 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``)             | 00:00.996 | 0.0 MB |
 +-----------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
index 3859aea9e..2ca94f4b3 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
@@ -5,18 +5,18 @@
 
 Computation times
 =================
-**05:12.442** total execution time for **how_to_tune_with_autoscheduler** files:
+**05:12.594** total execution time for **how_to_tune_with_autoscheduler** files:
 
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 02:35.530 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 02:34.321 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``)             | 01:19.801 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``)             | 01:20.685 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``)           | 00:42.629 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``)           | 00:43.097 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``)               | 00:17.693 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``)               | 00:17.412 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``)           | 00:08.451 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``)           | 00:08.696 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``)             | 00:08.338 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``)             | 00:08.384 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
index 2f5cf921b..c69713124 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
@@ -239,72 +239,128 @@ cooperative fetching, unrolling and operator fusion.
                  compute: Buffer(compute_2: Pointer(float32), float32, [25088], [])}
       buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute}
       preflattened_buffer_map = {data_1: data_3: Buffer(data_2, float32, [1, 512, 7, 7], []), kernel_1: kernel_3: Buffer(kernel_2, float32, [512, 512, 3, 3], []), bias_1: bias_3: Buffer(bias_2, float32, [1, 512, 1, 1], []), compute_1: compute_3: Buffer(compute_2, float32, [1, 512, 7, 7], [])} {
-      attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 64;
-      allocate(conv2d_nchw: Pointer(local float32), float32, [2]), storage_scope = local;
-      allocate(pad_temp.shared: Pointer(shared float32), float32, [2016]), storage_scope = shared;
-      allocate(kernel.shared: Pointer(shared float32), float32, [768]), storage_scope = shared;
-      attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196 {
-        conv2d_nchw_1: Buffer(conv2d_nchw, float32, [1], [], scope="local", align=4)[0] = 0f32
+      attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 112;
+      allocate(conv2d_nchw: Pointer(local float32), float32, [7]), storage_scope = local;
+      allocate(pad_temp.shared: Pointer(shared float32), float32, [144]), storage_scope = shared;
+      allocate(kernel.shared: Pointer(shared float32), float32, [1536]), storage_scope = shared;
+      attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32 {
+        conv2d_nchw_1: Buffer(conv2d_nchw, float32, [7], [], scope="local", align=16)[0] = 0f32
         conv2d_nchw_1[1] = 0f32
-        for (rc.outer.outer: int32, 0, 16) {
+        conv2d_nchw_1[2] = 0f32
+        conv2d_nchw_1[3] = 0f32
+        conv2d_nchw_1[4] = 0f32
+        conv2d_nchw_1[5] = 0f32
+        conv2d_nchw_1[6] = 0f32
+        for (rc.outer.outer: int32, 0, 32) {
           for (ry.outer.outer: int32, 0, 3) {
-            let cse_var_2: int32 = (rc.outer.outer*1568)
-            let cse_var_1: int32 = (ry.outer.outer*7)
+            let cse_var_4: int32 = (rc.outer.outer*784)
+            let cse_var_3: int32 = (ry.outer.outer*7)
+            let cse_var_2: int32 = (rc.outer.outer*144)
+            let cse_var_1: int32 = (ry.outer.outer*3)
              {
-              attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
-              pad_temp.shared_1: Buffer(pad_temp.shared, float32, [2016], [], scope="shared")[threadIdx.x_1] = @tir.if_then_else(((((1 <= (floordiv(floormod(threadIdx.x_1, 63), 9) + ry.outer.outer)) && ((floordiv(floormod(threadIdx.x_1, 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 9)*7)) + cse_var_1) + floormod(threadIdx.x_1, 9)) - 8)], 0f32, dtype=float32)
-              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
-              pad_temp.shared_1[(threadIdx.x_1 + 196)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 196), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 196), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 7), 9))) && (floormod((threadIdx.x_1 + 7), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 196), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 7), 9)) - 8)], 0f32, dtype=float32)
-              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
-              pad_temp.shared_1[(threadIdx.x_1 + 392)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 392), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 392), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 5), 9))) && (floormod((threadIdx.x_1 + 5), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 392), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 5), 9)) - 8)], 0f32, dtype=float32)
-              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
-              pad_temp.shared_1[(threadIdx.x_1 + 588)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 588), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 588), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 3), 9))) && (floormod((threadIdx.x_1 + 3), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 588), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 3), 9)) - 8)], 0f32, dtype=float32)
-              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
-              pad_temp.shared_1[(threadIdx.x_1 + 784)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 784), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 784), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 1), 9))) && (floormod((threadIdx.x_1 + 1), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 784), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 1), 9)) - 8)], 0f32, dtype=float32)
-              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
-              pad_temp.shared_1[(threadIdx.x_1 + 980)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 980), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 980), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 8), 9))) && (floormod((threadIdx.x_1 + 8), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 980), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 8), 9)) - 8)], 0f32, dtype=float32)
-              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
-              pad_temp.shared_1[(threadIdx.x_1 + 1176)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 1176), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 1176), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 6), 9))) && (floormod((threadIdx.x_1 + 6), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1176), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 6), 9)) - 8)], 0f32, dtype=float32)
-              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
-              pad_temp.shared_1[(threadIdx.x_1 + 1372)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 1372), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 1372), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 4), 9))) && (floormod((threadIdx.x_1 + 4), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1372), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 4), 9)) - 8)], 0f32, dtype=float32)
-              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
-              pad_temp.shared_1[(threadIdx.x_1 + 1568)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 1568), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 1568), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 2), 9))) && (floormod((threadIdx.x_1 + 2), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1568), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 2), 9)) - 8)], 0f32, dtype=float32)
-              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
-              pad_temp.shared_1[(threadIdx.x_1 + 1764)] = @tir.if_then_else(((((1 <= (floordiv(floormod(threadIdx.x_1, 63), 9) + ry.outer.outer)) && ((floordiv(floormod(threadIdx.x_1, 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 9)*7)) + cse_var_1) + floormod(threadIdx.x_1, 9)) + 1364)], 0f32, dtype=float32)
-              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
-              if @tir.likely((threadIdx.x_1 < 56), dtype=bool) {
-                pad_temp.shared_1[(threadIdx.x_1 + 1960)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 1960), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1 + 1960), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1 + 7), 9))) && (floormod((threadIdx.x_1 + 7), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1960), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 7), 9)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+              pad_temp.shared_1: Buffer(pad_temp.shared, float32, [144], [], scope="shared")[threadIdx.x_1] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data[(((((cse_var_4 + (floordiv(threadIdx.x_1, 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod(threadIdx.x_1, 9)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+              pad_temp.shared_1[(threadIdx.x_1 + 32)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 5), 9))) && (floormod((threadIdx.x_1 + 5), 9) < 8)), data[(((((cse_var_4 + (floordiv((threadIdx.x_1 + 32), 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1 + 5), 9)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+              pad_temp.shared_1[(threadIdx.x_1 + 64)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 1), 9))) && (floormod((threadIdx.x_1 + 1), 9) < 8)), data[(((((cse_var_4 + (floordiv((threadIdx.x_1 + 64), 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1 + 1), 9)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+              pad_temp.shared_1[(threadIdx.x_1 + 96)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 6), 9))) && (floormod((threadIdx.x_1 + 6), 9) < 8)), data[(((((cse_var_4 + (floordiv((threadIdx.x_1 + 96), 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1 + 6), 9)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+              if @tir.likely((threadIdx.x_1 < 16), dtype=bool) {
+                pad_temp.shared_1[(threadIdx.x_1 + 128)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 2), 9))) && (floormod((threadIdx.x_1 + 2), 9) < 8)), data[(((((cse_var_4 + (floordiv((threadIdx.x_1 + 128), 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1 + 2), 9)) - 8)], 0f32, dtype=float32)
               }
-              attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196 {
-                kernel.shared_1: Buffer(kernel.shared, float32, [768], [], scope="shared")[(threadIdx.x_2*3)] = kernel[(((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 32)*4608)) + (rc.outer.outer*288)) + (floormod(threadIdx.x_2, 32)*9)) + (ry.outer.outer*3))]
-                kernel.shared_1[((threadIdx.x_2*3) + 1)] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 32)*4608)) + (rc.outer.outer*288)) + (floormod(threadIdx.x_2, 32)*9)) + (ry.outer.outer*3)) + 1)]
-                kernel.shared_1[((threadIdx.x_2*3) + 2)] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 32)*4608)) + (rc.outer.outer*288)) + (floormod(threadIdx.x_2, 32)*9)) + (ry.outer.outer*3)) + 2)]
-              }
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
-              if @tir.likely((threadIdx.x_2 < 60), dtype=bool) {
-                kernel.shared_1[((threadIdx.x_2*3) + 588)] = kernel[(((((blockIdx.x*36864) + (floordiv((floordiv(threadIdx.x_2, 4) + 49), 8)*4608)) + (rc.outer.outer*288)) + (floormod((threadIdx.x_2 + 4), 32)*9)) + (ry.outer.outer*3))]
-                kernel.shared_1[((threadIdx.x_2*3) + 589)] = kernel[((((((blockIdx.x*36864) + (floordiv((floordiv(threadIdx.x_2, 4) + 49), 8)*4608)) + (rc.outer.outer*288)) + (floormod((threadIdx.x_2 + 4), 32)*9)) + (ry.outer.outer*3)) + 1)]
-                kernel.shared_1[((threadIdx.x_2*3) + 590)] = kernel[((((((blockIdx.x*36864) + (floordiv((floordiv(threadIdx.x_2, 4) + 49), 8)*4608)) + (rc.outer.outer*288)) + (floormod((threadIdx.x_2 + 4), 32)*9)) + (ry.outer.outer*3)) + 2)]
-              }
-              for (rc.outer.inner: int32, 0, 16) {
-                conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7))]*kernel.shared_1[((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6))]))
-                conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7))]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 384)]))
-                conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 1)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 1)]))
-                conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 1)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 385)]))
-                conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 2)]))
-                conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 386)]))
-                conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 3)]))
-                conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 387)]))
-                conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 64)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 4)]))
-                conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 64)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 388)]))
-                conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 65)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 5)]))
-                conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 65)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 389)]))
+              attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+              kernel.shared_1: Buffer(kernel.shared, float32, [1536], [], scope="shared")[ramp((threadIdx.x_2*4), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(threadIdx.x_2, 12)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp((threadIdx.x_2*4), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp(threadIdx.x_2, 1, 4), broadcast(3, 4)))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+              kernel.shared_1[ramp(((threadIdx.x_2*4) + 128), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 128), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 128), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 32), 1, 4), broadcast(3, 4)))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+              kernel.shared_1[ramp(((threadIdx.x_2*4) + 256), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 256), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 256), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 64), 1, 4), broadcast(3, 4)))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+              kernel.shared_1[ramp(((threadIdx.x_2*4) + 384), 1, 4)] = kernel[(((broadcast(((((floordiv(blockIdx.x, 7)*147456) + (floordiv(threadIdx.x_2, 12)*4608)) + cse_var_2) + 36864), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 384), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 96), 1, 4), broadcast(3, 4)))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+              kernel.shared_1[ramp(((threadIdx.x_2*4) + 512), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 512), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 512), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 128), 1, 4), broadcast(3, 4)))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+              kernel.shared_1[ramp(((threadIdx.x_2*4) + 640), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 640), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 640), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 160), 1, 4), broadcast(3, 4)))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+              kernel.shared_1[ramp(((threadIdx.x_2*4) + 768), 1, 4)] = kernel[(((broadcast(((((floordiv(blockIdx.x, 7)*147456) + (floordiv(threadIdx.x_2, 12)*4608)) + cse_var_2) + 73728), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 768), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 192), 1, 4), broadcast(3, 4)))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+              kernel.shared_1[ramp(((threadIdx.x_2*4) + 896), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 896), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 896), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 224), 1, 4), broadcast(3, 4)))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+              kernel.shared_1[ramp(((threadIdx.x_2*4) + 1024), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 1024), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1024), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 256), 1, 4), broadcast(3, 4)))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+              kernel.shared_1[ramp(((threadIdx.x_2*4) + 1152), 1, 4)] = kernel[(((broadcast(((((floordiv(blockIdx.x, 7)*147456) + (floordiv(threadIdx.x_2, 12)*4608)) + cse_var_2) + 110592), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1152), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 288), 1, 4), broadcast(3, 4)))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+              kernel.shared_1[ramp(((threadIdx.x_2*4) + 1280), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 1280), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1280), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 320), 1, 4), broadcast(3, 4)))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 32;
+              kernel.shared_1[ramp(((threadIdx.x_2*4) + 1408), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 1408), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1408), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 352), 1, 4), broadcast(3, 4)))]
+              for (rc.outer.inner: int32, 0, 8) {
+                let cse_var_19: int32 = (rc.outer.inner*18)
+                let cse_var_18: int32 = (cse_var_19 + 7)
+                let cse_var_17: int32 = (cse_var_19 + 6)
+                let cse_var_16: int32 = (cse_var_19 + 5)
+                let cse_var_15: int32 = (cse_var_19 + 4)
+                let cse_var_14: int32 = (cse_var_19 + 3)
+                let cse_var_13: int32 = (cse_var_19 + 2)
+                let cse_var_12: int32 = (cse_var_19 + 16)
+                let cse_var_11: int32 = (cse_var_19 + 15)
+                let cse_var_10: int32 = (cse_var_19 + 14)
+                let cse_var_9: int32 = (cse_var_19 + 13)
+                let cse_var_8: int32 = (cse_var_19 + 12)
+                let cse_var_7: int32 = (cse_var_19 + 11)
+                let cse_var_6: int32 = (cse_var_19 + 10)
+                let cse_var_5: int32 = (cse_var_19 + 1)
+                 {
+                  conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_19]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+                  conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_5]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+                  conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_13]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+                  conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_14]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+                  conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_15]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+                  conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_16]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+                  conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_17]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+                  conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_5]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+                  conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_13]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+                  conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_14]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+                  conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_15]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+                  conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_16]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+                  conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_17]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+                  conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_18]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+                  conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_13]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+                  conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_14]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+                  conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_15]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+                  conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_16]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+                  conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_17]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+                  conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_18]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+                  conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(cse_var_19 + 8)]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+                  conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(cse_var_19 + 9)]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+                  conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_6]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+                  conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_7]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+                  conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_8]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+                  conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_9]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+                  conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_10]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+                  conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_11]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+                  conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_6]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+                  conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_7]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+                  conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_8]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+                  conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_9]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+                  conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_10]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+                  conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_11]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+                  conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_12]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+                  conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_7]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+                  conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_8]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+                  conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_9]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+                  conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_10]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+                  conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_11]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+                  conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_12]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+                  conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(cse_var_19 + 17)]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+                }
               }
             }
           }
         }
-        compute[((blockIdx.x*392) + threadIdx.x)] = max((conv2d_nchw_1[0] + bias[((blockIdx.x*8) + floordiv(threadIdx.x, 49))]), 0f32)
-        compute[(((blockIdx.x*392) + threadIdx.x) + 196)] = max((conv2d_nchw_1[1] + bias[(((blockIdx.x*8) + floordiv(threadIdx.x, 49)) + 4)]), 0f32)
+        for (i3.inner: int32, 0, 7) {
+          compute[((((floordiv(blockIdx.x, 7)*1568) + (threadIdx.x*49)) + (floormod(blockIdx.x, 7)*7)) + i3.inner)] = max((conv2d_nchw_1[i3.inner] + bias[((floordiv(blockIdx.x, 7)*32) + threadIdx.x)]), 0f32)
+        }
       }
     }
 
@@ -358,7 +414,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 0.321 ms
+    Execution time of this operator: 0.322 ms
 
 
 
@@ -408,18 +464,18 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
     conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
     conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
-    conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=4)
-    conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=2)
+    conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=32)
+    conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
     conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
     conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
-    conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7)
+    conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
     conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
-    conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
+    conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=7)
     conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
-    conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
+    conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
     conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
     conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
-    conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=16)
+    conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=8)
     conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
     conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
     conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=3)
@@ -429,13 +485,13 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
     compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
     compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=1)
-    compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=4)
-    compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=2)
+    compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=32)
+    compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
     compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
-    compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7)
+    compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
     compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
-    compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
-    compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
+    compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
+    compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
     compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
     s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
     s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -453,14 +509,14 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused = s[compute].fuse(compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i)
     s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread_axis("threadIdx.x"))
     kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
-    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=3)
+    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
     s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=196)
+    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=32)
     s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
     pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
     pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
     s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=196)
+    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=32)
     s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
     s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 64)
     s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
@@ -480,55 +536,2561 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
       #define int64_t long long
       #define uint64_t unsigned long long
     #endif
-    extern "C" __global__ void __launch_bounds__(196) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
-      float conv2d_nchw[2];
-      __shared__ float pad_temp_shared[2016];
-      __shared__ float kernel_shared[768];
+    extern "C" __global__ void __launch_bounds__(32) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+      float conv2d_nchw[7];
+      __shared__ float pad_temp_shared[144];
+      __shared__ float kernel_shared[1536];
       conv2d_nchw[0] = 0.000000e+00f;
       conv2d_nchw[1] = 0.000000e+00f;
-      for (int rc_outer_outer = 0; rc_outer_outer < 16; ++rc_outer_outer) {
+      conv2d_nchw[2] = 0.000000e+00f;
+      conv2d_nchw[3] = 0.000000e+00f;
+      conv2d_nchw[4] = 0.000000e+00f;
+      conv2d_nchw[5] = 0.000000e+00f;
+      conv2d_nchw[6] = 0.000000e+00f;
+      for (int rc_outer_outer = 0; rc_outer_outer < 32; ++rc_outer_outer) {
         for (int ry_outer_outer = 0; ry_outer_outer < 3; ++ry_outer_outer) {
           __syncthreads();
-          pad_temp_shared[((int)threadIdx.x)] = (((((1 <= (((((int)threadIdx.x) % 63) / 9) + ry_outer_outer)) && ((((((int)threadIdx.x) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + ((((int)threadIdx.x) / 9) * 7)) + (ry_outer_outer * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
-          pad_temp_shared[(((int)threadIdx.x) + 196)] = (((((1 <= ((((((int)threadIdx.x) + 7) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 7) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 196) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
-          pad_temp_shared[(((int)threadIdx.x) + 392)] = (((((1 <= ((((((int)threadIdx.x) + 14) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 14) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 5) % 9))) && (((((int)threadIdx.x) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 392) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
-          pad_temp_shared[(((int)threadIdx.x) + 588)] = (((((1 <= ((((((int)threadIdx.x) + 21) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 21) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 3) % 9))) && (((((int)threadIdx.x) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 588) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 3) % 9)) - 8)] : 0.000000e+00f);
-          pad_temp_shared[(((int)threadIdx.x) + 784)] = (((((1 <= ((((((int)threadIdx.x) + 28) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 28) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 1) % 9))) && (((((int)threadIdx.x) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 784) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8)] : 0.000000e+00f);
-          pad_temp_shared[(((int)threadIdx.x) + 980)] = (((((1 <= ((((((int)threadIdx.x) + 35) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 35) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 980) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
-          pad_temp_shared[(((int)threadIdx.x) + 1176)] = (((((1 <= ((((((int)threadIdx.x) + 42) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 42) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1176) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
-          pad_temp_shared[(((int)threadIdx.x) + 1372)] = (((((1 <= ((((((int)threadIdx.x) + 49) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 49) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1372) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
-          pad_temp_shared[(((int)threadIdx.x) + 1568)] = (((((1 <= ((((((int)threadIdx.x) + 56) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 56) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 2) % 9))) && (((((int)threadIdx.x) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1568) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
-          pad_temp_shared[(((int)threadIdx.x) + 1764)] = (((((1 <= (((((int)threadIdx.x) % 63) / 9) + ry_outer_outer)) && ((((((int)threadIdx.x) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + ((((int)threadIdx.x) / 9) * 7)) + (ry_outer_outer * 7)) + (((int)threadIdx.x) % 9)) + 1364)] : 0.000000e+00f);
-          if (((int)threadIdx.x) < 56) {
-            pad_temp_shared[(((int)threadIdx.x) + 1960)] = (((((1 <= ((((((int)threadIdx.x) + 7) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) + 7) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1960) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
-          }
-          kernel_shared[(((int)threadIdx.x) * 3)] = kernel[(((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) >> 5) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) & 31) * 9)) + (ry_outer_outer * 3))];
-          kernel_shared[((((int)threadIdx.x) * 3) + 1)] = kernel[((((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) >> 5) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) & 31) * 9)) + (ry_outer_outer * 3)) + 1)];
-          kernel_shared[((((int)threadIdx.x) * 3) + 2)] = kernel[((((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) >> 5) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) & 31) * 9)) + (ry_outer_outer * 3)) + 2)];
-          if (((int)threadIdx.x) < 60) {
-            kernel_shared[((((int)threadIdx.x) * 3) + 588)] = kernel[(((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 4) & 31) * 9)) + (ry_outer_outer * 3))];
-            kernel_shared[((((int)threadIdx.x) * 3) + 589)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 4) & 31) * 9)) + (ry_outer_outer * 3)) + 1)];
-            kernel_shared[((((int)threadIdx.x) * 3) + 590)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) >> 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 4) & 31) * 9)) + (ry_outer_outer * 3)) + 2)];
+          pad_temp_shared[((int)threadIdx.x)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + ((((int)threadIdx.x) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 32)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 5) % 9))) && (((((int)threadIdx.x) + 5) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 32) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 64)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 1) % 9))) && (((((int)threadIdx.x) + 1) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 64) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 96)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 96) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
+          if (((int)threadIdx.x) < 16) {
+            pad_temp_shared[(((int)threadIdx.x) + 128)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 2) % 9))) && (((((int)threadIdx.x) + 2) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 128) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
           }
+          int4 _1;
+            int4 _2;
+              int4 _3;
+                int4 _4 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)));
+                int4 _5;
+                  int4 _6;
+                    int4 _7;
+                      int4 _8 = make_int4(((((int)threadIdx.x) * 4))+(1*0), ((((int)threadIdx.x) * 4))+(1*1), ((((int)threadIdx.x) * 4))+(1*2), ((((int)threadIdx.x) * 4))+(1*3));
+                      int4 _9 = make_int4(3, 3, 3, 3);
+                      _7.x = (_8.x%_9.x);
+                      _7.y = (_8.y%_9.y);
+                      _7.z = (_8.z%_9.z);
+                      _7.w = (_8.w%_9.w);
+                    int4 _10;
+                      int4 _11 = make_int4(((((int)threadIdx.x) * 4))+(1*0), ((((int)threadIdx.x) * 4))+(1*1), ((((int)threadIdx.x) * 4))+(1*2), ((((int)threadIdx.x) * 4))+(1*3));
+                      int4 _12 = make_int4(3, 3, 3, 3);
+                      _10.x = (_11.x/_12.x);
+                      _10.y = (_11.y/_12.y);
+                      _10.z = (_11.z/_12.z);
+                      _10.w = (_11.w/_12.w);
+                    int4 _13;
+                    ushort4 _14;
+                      ushort4 _15;
+                        ushort4 _16;
+                          int4 _17 = make_int4(3, 3, 3, 3);
+                          int4 _18 = make_int4(0, 0, 0, 0);
+                          _16.x = (_17.x>=_18.x);
+                          _16.y = (_17.y>=_18.y);
+                          _16.z = (_17.z>=_18.z);
+                          _16.w = (_17.w>=_18.w);
+                        ushort4 _19;
+                          int4 _20 = make_int4(0, 0, 0, 0);
+                          _19.x = (_7.x>=_20.x);
+                          _19.y = (_7.y>=_20.y);
+                          _19.z = (_7.z>=_20.z);
+                          _19.w = (_7.w>=_20.w);
+                        _15.x = (_16.x&&_19.x);
+                        _15.y = (_16.y&&_19.y);
+                        _15.z = (_16.z&&_19.z);
+                        _15.w = (_16.w&&_19.w);
+                      ushort4 _21;
+                        ushort4 _22;
+                          int4 _23 = make_int4(3, 3, 3, 3);
+                          int4 _24 = make_int4(0, 0, 0, 0);
+                          _22.x = (_23.x<_24.x);
+                          _22.y = (_23.y<_24.y);
+                          _22.z = (_23.z<_24.z);
+                          _22.w = (_23.w<_24.w);
+                        ushort4 _25;
+                          int4 _26 = make_int4(0, 0, 0, 0);
+                          _25.x = (_7.x<=_26.x);
+                          _25.y = (_7.y<=_26.y);
+                          _25.z = (_7.z<=_26.z);
+                          _25.w = (_7.w<=_26.w);
+                        _21.x = (_22.x&&_25.x);
+                        _21.y = (_22.y&&_25.y);
+                        _21.z = (_22.z&&_25.z);
+                        _21.w = (_22.w&&_25.w);
+                      _14.x = (_15.x||_21.x);
+                      _14.y = (_15.y||_21.y);
+                      _14.z = (_15.z||_21.z);
+                      _14.w = (_15.w||_21.w);
+                    int4 _27;
+                      int4 _28 = make_int4(1, 1, 1, 1);
+                      _27.x = (_10.x-_28.x);
+                      _27.y = (_10.y-_28.y);
+                      _27.z = (_10.z-_28.z);
+                      _27.w = (_10.w-_28.w);
+                    _13.x = (bool(_14.x)?_10.x:_27.x);
+                    _13.y = (bool(_14.y)?_10.y:_27.y);
+                    _13.z = (bool(_14.z)?_10.z:_27.z);
+                    _13.w = (bool(_14.w)?_10.w:_27.w);
+                    int4 _29 = make_int4(16, 16, 16, 16);
+                    _6.x = (_13.x%_29.x);
+                    _6.y = (_13.y%_29.y);
+                    _6.z = (_13.z%_29.z);
+                    _6.w = (_13.w%_29.w);
+                  int4 _30;
+                  ushort4 _31;
+                    ushort4 _32;
+                      ushort4 _33;
+                        int4 _34 = make_int4(16, 16, 16, 16);
+                        int4 _35 = make_int4(0, 0, 0, 0);
+                        _33.x = (_34.x>=_35.x);
+                        _33.y = (_34.y>=_35.y);
+                        _33.z = (_34.z>=_35.z);
+                        _33.w = (_34.w>=_35.w);
+                      ushort4 _36;
+                        int4 _37 = make_int4(0, 0, 0, 0);
+                        _36.x = (_6.x>=_37.x);
+                        _36.y = (_6.y>=_37.y);
+                        _36.z = (_6.z>=_37.z);
+                        _36.w = (_6.w>=_37.w);
+                      _32.x = (_33.x&&_36.x);
+                      _32.y = (_33.y&&_36.y);
+                      _32.z = (_33.z&&_36.z);
+                      _32.w = (_33.w&&_36.w);
+                    ushort4 _38;
+                      ushort4 _39;
+                        int4 _40 = make_int4(16, 16, 16, 16);
+                        int4 _41 = make_int4(0, 0, 0, 0);
+                        _39.x = (_40.x<_41.x);
+                        _39.y = (_40.y<_41.y);
+                        _39.z = (_40.z<_41.z);
+                        _39.w = (_40.w<_41.w);
+                      ushort4 _42;
+                        int4 _43 = make_int4(0, 0, 0, 0);
+                        _42.x = (_6.x<=_43.x);
+                        _42.y = (_6.y<=_43.y);
+                        _42.z = (_6.z<=_43.z);
+                        _42.w = (_6.w<=_43.w);
+                      _38.x = (_39.x&&_42.x);
+                      _38.y = (_39.y&&_42.y);
+                      _38.z = (_39.z&&_42.z);
+                      _38.w = (_39.w&&_42.w);
+                    _31.x = (_32.x||_38.x);
+                    _31.y = (_32.y||_38.y);
+                    _31.z = (_32.z||_38.z);
+                    _31.w = (_32.w||_38.w);
+                  int4 _44;
+                    int4 _45 = make_int4(16, 16, 16, 16);
+                    _44.x = (_6.x+_45.x);
+                    _44.y = (_6.y+_45.y);
+                    _44.z = (_6.z+_45.z);
+                    _44.w = (_6.w+_45.w);
+                  _30.x = (bool(_31.x)?_6.x:_44.x);
+                  _30.y = (bool(_31.y)?_6.y:_44.y);
+                  _30.z = (bool(_31.z)?_6.z:_44.z);
+                  _30.w = (bool(_31.w)?_6.w:_44.w);
+                  int4 _46 = make_int4(9, 9, 9, 9);
+                  _5.x = (_30.x*_46.x);
+                  _5.y = (_30.y*_46.y);
+                  _5.z = (_30.z*_46.z);
+                  _5.w = (_30.w*_46.w);
+                _3.x = (_4.x+_5.x);
+                _3.y = (_4.y+_5.y);
+                _3.z = (_4.z+_5.z);
+                _3.w = (_4.w+_5.w);
+              int4 _47 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+              _2.x = (_3.x+_47.x);
+              _2.y = (_3.y+_47.y);
+              _2.z = (_3.z+_47.z);
+              _2.w = (_3.w+_47.w);
+            int4 _48;
+              int4 _49 = make_int4((((int)threadIdx.x))+(1*0), (((int)threadIdx.x))+(1*1), (((int)threadIdx.x))+(1*2), (((int)threadIdx.x))+(1*3));
+              int4 _50 = make_int4(3, 3, 3, 3);
+              _48.x = (_49.x%_50.x);
+              _48.y = (_49.y%_50.y);
+              _48.z = (_49.z%_50.z);
+              _48.w = (_49.w%_50.w);
+            int4 _51;
+            ushort4 _52;
+              ushort4 _53;
+                ushort4 _54;
+                  int4 _55 = make_int4(3, 3, 3, 3);
+                  int4 _56 = make_int4(0, 0, 0, 0);
+                  _54.x = (_55.x>=_56.x);
+                  _54.y = (_55.y>=_56.y);
+                  _54.z = (_55.z>=_56.z);
+                  _54.w = (_55.w>=_56.w);
+                ushort4 _57;
+                  int4 _58 = make_int4(0, 0, 0, 0);
+                  _57.x = (_48.x>=_58.x);
+                  _57.y = (_48.y>=_58.y);
+                  _57.z = (_48.z>=_58.z);
+                  _57.w = (_48.w>=_58.w);
+                _53.x = (_54.x&&_57.x);
+                _53.y = (_54.y&&_57.y);
+                _53.z = (_54.z&&_57.z);
+                _53.w = (_54.w&&_57.w);
+              ushort4 _59;
+                ushort4 _60;
+                  int4 _61 = make_int4(3, 3, 3, 3);
+                  int4 _62 = make_int4(0, 0, 0, 0);
+                  _60.x = (_61.x<_62.x);
+                  _60.y = (_61.y<_62.y);
+                  _60.z = (_61.z<_62.z);
+                  _60.w = (_61.w<_62.w);
+                ushort4 _63;
+                  int4 _64 = make_int4(0, 0, 0, 0);
+                  _63.x = (_48.x<=_64.x);
+                  _63.y = (_48.y<=_64.y);
+                  _63.z = (_48.z<=_64.z);
+                  _63.w = (_48.w<=_64.w);
+                _59.x = (_60.x&&_63.x);
+                _59.y = (_60.y&&_63.y);
+                _59.z = (_60.z&&_63.z);
+                _59.w = (_60.w&&_63.w);
+              _52.x = (_53.x||_59.x);
+              _52.y = (_53.y||_59.y);
+              _52.z = (_53.z||_59.z);
+              _52.w = (_53.w||_59.w);
+            int4 _65;
+              int4 _66 = make_int4(3, 3, 3, 3);
+              _65.x = (_48.x+_66.x);
+              _65.y = (_48.y+_66.y);
+              _65.z = (_48.z+_66.z);
+              _65.w = (_48.w+_66.w);
+            _51.x = (bool(_52.x)?_48.x:_65.x);
+            _51.y = (bool(_52.y)?_48.y:_65.y);
+            _51.z = (bool(_52.z)?_48.z:_65.z);
+            _51.w = (bool(_52.w)?_48.w:_65.w);
+            _1.x = (_2.x+_51.x);
+            _1.y = (_2.y+_51.y);
+            _1.z = (_2.z+_51.z);
+            _1.w = (_2.w+_51.w);
+          *(float4*)(kernel_shared + (((int)threadIdx.x) * 4)) = make_float4(kernel[_1.x],kernel[_1.y],kernel[_1.z],kernel[_1.w]);
+          int4 _67;
+            int4 _68;
+              int4 _69;
+                int4 _70 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 128) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 128) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 128) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 128) / 48) * 4608)) + (rc_outer_outer  [...]
+                int4 _71;
+                  int4 _72;
+                    int4 _73;
+                      int4 _74 = make_int4((((((int)threadIdx.x) * 4) + 128))+(1*0), (((((int)threadIdx.x) * 4) + 128))+(1*1), (((((int)threadIdx.x) * 4) + 128))+(1*2), (((((int)threadIdx.x) * 4) + 128))+(1*3));
+                      int4 _75 = make_int4(3, 3, 3, 3);
+                      _73.x = (_74.x%_75.x);
+                      _73.y = (_74.y%_75.y);
+                      _73.z = (_74.z%_75.z);
+                      _73.w = (_74.w%_75.w);
+                    int4 _76;
+                      int4 _77 = make_int4((((((int)threadIdx.x) * 4) + 128))+(1*0), (((((int)threadIdx.x) * 4) + 128))+(1*1), (((((int)threadIdx.x) * 4) + 128))+(1*2), (((((int)threadIdx.x) * 4) + 128))+(1*3));
+                      int4 _78 = make_int4(3, 3, 3, 3);
+                      _76.x = (_77.x/_78.x);
+                      _76.y = (_77.y/_78.y);
+                      _76.z = (_77.z/_78.z);
+                      _76.w = (_77.w/_78.w);
+                    int4 _79;
+                    ushort4 _80;
+                      ushort4 _81;
+                        ushort4 _82;
+                          int4 _83 = make_int4(3, 3, 3, 3);
+                          int4 _84 = make_int4(0, 0, 0, 0);
+                          _82.x = (_83.x>=_84.x);
+                          _82.y = (_83.y>=_84.y);
+                          _82.z = (_83.z>=_84.z);
+                          _82.w = (_83.w>=_84.w);
+                        ushort4 _85;
+                          int4 _86 = make_int4(0, 0, 0, 0);
+                          _85.x = (_73.x>=_86.x);
+                          _85.y = (_73.y>=_86.y);
+                          _85.z = (_73.z>=_86.z);
+                          _85.w = (_73.w>=_86.w);
+                        _81.x = (_82.x&&_85.x);
+                        _81.y = (_82.y&&_85.y);
+                        _81.z = (_82.z&&_85.z);
+                        _81.w = (_82.w&&_85.w);
+                      ushort4 _87;
+                        ushort4 _88;
+                          int4 _89 = make_int4(3, 3, 3, 3);
+                          int4 _90 = make_int4(0, 0, 0, 0);
+                          _88.x = (_89.x<_90.x);
+                          _88.y = (_89.y<_90.y);
+                          _88.z = (_89.z<_90.z);
+                          _88.w = (_89.w<_90.w);
+                        ushort4 _91;
+                          int4 _92 = make_int4(0, 0, 0, 0);
+                          _91.x = (_73.x<=_92.x);
+                          _91.y = (_73.y<=_92.y);
+                          _91.z = (_73.z<=_92.z);
+                          _91.w = (_73.w<=_92.w);
+                        _87.x = (_88.x&&_91.x);
+                        _87.y = (_88.y&&_91.y);
+                        _87.z = (_88.z&&_91.z);
+                        _87.w = (_88.w&&_91.w);
+                      _80.x = (_81.x||_87.x);
+                      _80.y = (_81.y||_87.y);
+                      _80.z = (_81.z||_87.z);
+                      _80.w = (_81.w||_87.w);
+                    int4 _93;
+                      int4 _94 = make_int4(1, 1, 1, 1);
+                      _93.x = (_76.x-_94.x);
+                      _93.y = (_76.y-_94.y);
+                      _93.z = (_76.z-_94.z);
+                      _93.w = (_76.w-_94.w);
+                    _79.x = (bool(_80.x)?_76.x:_93.x);
+                    _79.y = (bool(_80.y)?_76.y:_93.y);
+                    _79.z = (bool(_80.z)?_76.z:_93.z);
+                    _79.w = (bool(_80.w)?_76.w:_93.w);
+                    int4 _95 = make_int4(16, 16, 16, 16);
+                    _72.x = (_79.x%_95.x);
+                    _72.y = (_79.y%_95.y);
+                    _72.z = (_79.z%_95.z);
+                    _72.w = (_79.w%_95.w);
+                  int4 _96;
+                  ushort4 _97;
+                    ushort4 _98;
+                      ushort4 _99;
+                        int4 _100 = make_int4(16, 16, 16, 16);
+                        int4 _101 = make_int4(0, 0, 0, 0);
+                        _99.x = (_100.x>=_101.x);
+                        _99.y = (_100.y>=_101.y);
+                        _99.z = (_100.z>=_101.z);
+                        _99.w = (_100.w>=_101.w);
+                      ushort4 _102;
+                        int4 _103 = make_int4(0, 0, 0, 0);
+                        _102.x = (_72.x>=_103.x);
+                        _102.y = (_72.y>=_103.y);
+                        _102.z = (_72.z>=_103.z);
+                        _102.w = (_72.w>=_103.w);
+                      _98.x = (_99.x&&_102.x);
+                      _98.y = (_99.y&&_102.y);
+                      _98.z = (_99.z&&_102.z);
+                      _98.w = (_99.w&&_102.w);
+                    ushort4 _104;
+                      ushort4 _105;
+                        int4 _106 = make_int4(16, 16, 16, 16);
+                        int4 _107 = make_int4(0, 0, 0, 0);
+                        _105.x = (_106.x<_107.x);
+                        _105.y = (_106.y<_107.y);
+                        _105.z = (_106.z<_107.z);
+                        _105.w = (_106.w<_107.w);
+                      ushort4 _108;
+                        int4 _109 = make_int4(0, 0, 0, 0);
+                        _108.x = (_72.x<=_109.x);
+                        _108.y = (_72.y<=_109.y);
+                        _108.z = (_72.z<=_109.z);
+                        _108.w = (_72.w<=_109.w);
+                      _104.x = (_105.x&&_108.x);
+                      _104.y = (_105.y&&_108.y);
+                      _104.z = (_105.z&&_108.z);
+                      _104.w = (_105.w&&_108.w);
+                    _97.x = (_98.x||_104.x);
+                    _97.y = (_98.y||_104.y);
+                    _97.z = (_98.z||_104.z);
+                    _97.w = (_98.w||_104.w);
+                  int4 _110;
+                    int4 _111 = make_int4(16, 16, 16, 16);
+                    _110.x = (_72.x+_111.x);
+                    _110.y = (_72.y+_111.y);
+                    _110.z = (_72.z+_111.z);
+                    _110.w = (_72.w+_111.w);
+                  _96.x = (bool(_97.x)?_72.x:_110.x);
+                  _96.y = (bool(_97.y)?_72.y:_110.y);
+                  _96.z = (bool(_97.z)?_72.z:_110.z);
+                  _96.w = (bool(_97.w)?_72.w:_110.w);
+                  int4 _112 = make_int4(9, 9, 9, 9);
+                  _71.x = (_96.x*_112.x);
+                  _71.y = (_96.y*_112.y);
+                  _71.z = (_96.z*_112.z);
+                  _71.w = (_96.w*_112.w);
+                _69.x = (_70.x+_71.x);
+                _69.y = (_70.y+_71.y);
+                _69.z = (_70.z+_71.z);
+                _69.w = (_70.w+_71.w);
+              int4 _113 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+              _68.x = (_69.x+_113.x);
+              _68.y = (_69.y+_113.y);
+              _68.z = (_69.z+_113.z);
+              _68.w = (_69.w+_113.w);
+            int4 _114;
+              int4 _115 = make_int4(((((int)threadIdx.x) + 32))+(1*0), ((((int)threadIdx.x) + 32))+(1*1), ((((int)threadIdx.x) + 32))+(1*2), ((((int)threadIdx.x) + 32))+(1*3));
+              int4 _116 = make_int4(3, 3, 3, 3);
+              _114.x = (_115.x%_116.x);
+              _114.y = (_115.y%_116.y);
+              _114.z = (_115.z%_116.z);
+              _114.w = (_115.w%_116.w);
+            int4 _117;
+            ushort4 _118;
+              ushort4 _119;
+                ushort4 _120;
+                  int4 _121 = make_int4(3, 3, 3, 3);
+                  int4 _122 = make_int4(0, 0, 0, 0);
+                  _120.x = (_121.x>=_122.x);
+                  _120.y = (_121.y>=_122.y);
+                  _120.z = (_121.z>=_122.z);
+                  _120.w = (_121.w>=_122.w);
+                ushort4 _123;
+                  int4 _124 = make_int4(0, 0, 0, 0);
+                  _123.x = (_114.x>=_124.x);
+                  _123.y = (_114.y>=_124.y);
+                  _123.z = (_114.z>=_124.z);
+                  _123.w = (_114.w>=_124.w);
+                _119.x = (_120.x&&_123.x);
+                _119.y = (_120.y&&_123.y);
+                _119.z = (_120.z&&_123.z);
+                _119.w = (_120.w&&_123.w);
+              ushort4 _125;
+                ushort4 _126;
+                  int4 _127 = make_int4(3, 3, 3, 3);
+                  int4 _128 = make_int4(0, 0, 0, 0);
+                  _126.x = (_127.x<_128.x);
+                  _126.y = (_127.y<_128.y);
+                  _126.z = (_127.z<_128.z);
+                  _126.w = (_127.w<_128.w);
+                ushort4 _129;
+                  int4 _130 = make_int4(0, 0, 0, 0);
+                  _129.x = (_114.x<=_130.x);
+                  _129.y = (_114.y<=_130.y);
+                  _129.z = (_114.z<=_130.z);
+                  _129.w = (_114.w<=_130.w);
+                _125.x = (_126.x&&_129.x);
+                _125.y = (_126.y&&_129.y);
+                _125.z = (_126.z&&_129.z);
+                _125.w = (_126.w&&_129.w);
+              _118.x = (_119.x||_125.x);
+              _118.y = (_119.y||_125.y);
+              _118.z = (_119.z||_125.z);
+              _118.w = (_119.w||_125.w);
+            int4 _131;
+              int4 _132 = make_int4(3, 3, 3, 3);
+              _131.x = (_114.x+_132.x);
+              _131.y = (_114.y+_132.y);
+              _131.z = (_114.z+_132.z);
+              _131.w = (_114.w+_132.w);
+            _117.x = (bool(_118.x)?_114.x:_131.x);
+            _117.y = (bool(_118.y)?_114.y:_131.y);
+            _117.z = (bool(_118.z)?_114.z:_131.z);
+            _117.w = (bool(_118.w)?_114.w:_131.w);
+            _67.x = (_68.x+_117.x);
+            _67.y = (_68.y+_117.y);
+            _67.z = (_68.z+_117.z);
+            _67.w = (_68.w+_117.w);
+          *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 128)) = make_float4(kernel[_67.x],kernel[_67.y],kernel[_67.z],kernel[_67.w]);
+          int4 _133;
+            int4 _134;
+              int4 _135;
+                int4 _136 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 256) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 256) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 256) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 256) / 48) * 4608)) + (rc_outer_outer [...]
+                int4 _137;
+                  int4 _138;
+                    int4 _139;
+                      int4 _140 = make_int4((((((int)threadIdx.x) * 4) + 256))+(1*0), (((((int)threadIdx.x) * 4) + 256))+(1*1), (((((int)threadIdx.x) * 4) + 256))+(1*2), (((((int)threadIdx.x) * 4) + 256))+(1*3));
+                      int4 _141 = make_int4(3, 3, 3, 3);
+                      _139.x = (_140.x%_141.x);
+                      _139.y = (_140.y%_141.y);
+                      _139.z = (_140.z%_141.z);
+                      _139.w = (_140.w%_141.w);
+                    int4 _142;
+                      int4 _143 = make_int4((((((int)threadIdx.x) * 4) + 256))+(1*0), (((((int)threadIdx.x) * 4) + 256))+(1*1), (((((int)threadIdx.x) * 4) + 256))+(1*2), (((((int)threadIdx.x) * 4) + 256))+(1*3));
+                      int4 _144 = make_int4(3, 3, 3, 3);
+                      _142.x = (_143.x/_144.x);
+                      _142.y = (_143.y/_144.y);
+                      _142.z = (_143.z/_144.z);
+                      _142.w = (_143.w/_144.w);
+                    int4 _145;
+                    ushort4 _146;
+                      ushort4 _147;
+                        ushort4 _148;
+                          int4 _149 = make_int4(3, 3, 3, 3);
+                          int4 _150 = make_int4(0, 0, 0, 0);
+                          _148.x = (_149.x>=_150.x);
+                          _148.y = (_149.y>=_150.y);
+                          _148.z = (_149.z>=_150.z);
+                          _148.w = (_149.w>=_150.w);
+                        ushort4 _151;
+                          int4 _152 = make_int4(0, 0, 0, 0);
+                          _151.x = (_139.x>=_152.x);
+                          _151.y = (_139.y>=_152.y);
+                          _151.z = (_139.z>=_152.z);
+                          _151.w = (_139.w>=_152.w);
+                        _147.x = (_148.x&&_151.x);
+                        _147.y = (_148.y&&_151.y);
+                        _147.z = (_148.z&&_151.z);
+                        _147.w = (_148.w&&_151.w);
+                      ushort4 _153;
+                        ushort4 _154;
+                          int4 _155 = make_int4(3, 3, 3, 3);
+                          int4 _156 = make_int4(0, 0, 0, 0);
+                          _154.x = (_155.x<_156.x);
+                          _154.y = (_155.y<_156.y);
+                          _154.z = (_155.z<_156.z);
+                          _154.w = (_155.w<_156.w);
+                        ushort4 _157;
+                          int4 _158 = make_int4(0, 0, 0, 0);
+                          _157.x = (_139.x<=_158.x);
+                          _157.y = (_139.y<=_158.y);
+                          _157.z = (_139.z<=_158.z);
+                          _157.w = (_139.w<=_158.w);
+                        _153.x = (_154.x&&_157.x);
+                        _153.y = (_154.y&&_157.y);
+                        _153.z = (_154.z&&_157.z);
+                        _153.w = (_154.w&&_157.w);
+                      _146.x = (_147.x||_153.x);
+                      _146.y = (_147.y||_153.y);
+                      _146.z = (_147.z||_153.z);
+                      _146.w = (_147.w||_153.w);
+                    int4 _159;
+                      int4 _160 = make_int4(1, 1, 1, 1);
+                      _159.x = (_142.x-_160.x);
+                      _159.y = (_142.y-_160.y);
+                      _159.z = (_142.z-_160.z);
+                      _159.w = (_142.w-_160.w);
+                    _145.x = (bool(_146.x)?_142.x:_159.x);
+                    _145.y = (bool(_146.y)?_142.y:_159.y);
+                    _145.z = (bool(_146.z)?_142.z:_159.z);
+                    _145.w = (bool(_146.w)?_142.w:_159.w);
+                    int4 _161 = make_int4(16, 16, 16, 16);
+                    _138.x = (_145.x%_161.x);
+                    _138.y = (_145.y%_161.y);
+                    _138.z = (_145.z%_161.z);
+                    _138.w = (_145.w%_161.w);
+                  int4 _162;
+                  ushort4 _163;
+                    ushort4 _164;
+                      ushort4 _165;
+                        int4 _166 = make_int4(16, 16, 16, 16);
+                        int4 _167 = make_int4(0, 0, 0, 0);
+                        _165.x = (_166.x>=_167.x);
+                        _165.y = (_166.y>=_167.y);
+                        _165.z = (_166.z>=_167.z);
+                        _165.w = (_166.w>=_167.w);
+                      ushort4 _168;
+                        int4 _169 = make_int4(0, 0, 0, 0);
+                        _168.x = (_138.x>=_169.x);
+                        _168.y = (_138.y>=_169.y);
+                        _168.z = (_138.z>=_169.z);
+                        _168.w = (_138.w>=_169.w);
+                      _164.x = (_165.x&&_168.x);
+                      _164.y = (_165.y&&_168.y);
+                      _164.z = (_165.z&&_168.z);
+                      _164.w = (_165.w&&_168.w);
+                    ushort4 _170;
+                      ushort4 _171;
+                        int4 _172 = make_int4(16, 16, 16, 16);
+                        int4 _173 = make_int4(0, 0, 0, 0);
+                        _171.x = (_172.x<_173.x);
+                        _171.y = (_172.y<_173.y);
+                        _171.z = (_172.z<_173.z);
+                        _171.w = (_172.w<_173.w);
+                      ushort4 _174;
+                        int4 _175 = make_int4(0, 0, 0, 0);
+                        _174.x = (_138.x<=_175.x);
+                        _174.y = (_138.y<=_175.y);
+                        _174.z = (_138.z<=_175.z);
+                        _174.w = (_138.w<=_175.w);
+                      _170.x = (_171.x&&_174.x);
+                      _170.y = (_171.y&&_174.y);
+                      _170.z = (_171.z&&_174.z);
+                      _170.w = (_171.w&&_174.w);
+                    _163.x = (_164.x||_170.x);
+                    _163.y = (_164.y||_170.y);
+                    _163.z = (_164.z||_170.z);
+                    _163.w = (_164.w||_170.w);
+                  int4 _176;
+                    int4 _177 = make_int4(16, 16, 16, 16);
+                    _176.x = (_138.x+_177.x);
+                    _176.y = (_138.y+_177.y);
+                    _176.z = (_138.z+_177.z);
+                    _176.w = (_138.w+_177.w);
+                  _162.x = (bool(_163.x)?_138.x:_176.x);
+                  _162.y = (bool(_163.y)?_138.y:_176.y);
+                  _162.z = (bool(_163.z)?_138.z:_176.z);
+                  _162.w = (bool(_163.w)?_138.w:_176.w);
+                  int4 _178 = make_int4(9, 9, 9, 9);
+                  _137.x = (_162.x*_178.x);
+                  _137.y = (_162.y*_178.y);
+                  _137.z = (_162.z*_178.z);
+                  _137.w = (_162.w*_178.w);
+                _135.x = (_136.x+_137.x);
+                _135.y = (_136.y+_137.y);
+                _135.z = (_136.z+_137.z);
+                _135.w = (_136.w+_137.w);
+              int4 _179 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+              _134.x = (_135.x+_179.x);
+              _134.y = (_135.y+_179.y);
+              _134.z = (_135.z+_179.z);
+              _134.w = (_135.w+_179.w);
+            int4 _180;
+              int4 _181 = make_int4(((((int)threadIdx.x) + 64))+(1*0), ((((int)threadIdx.x) + 64))+(1*1), ((((int)threadIdx.x) + 64))+(1*2), ((((int)threadIdx.x) + 64))+(1*3));
+              int4 _182 = make_int4(3, 3, 3, 3);
+              _180.x = (_181.x%_182.x);
+              _180.y = (_181.y%_182.y);
+              _180.z = (_181.z%_182.z);
+              _180.w = (_181.w%_182.w);
+            int4 _183;
+            ushort4 _184;
+              ushort4 _185;
+                ushort4 _186;
+                  int4 _187 = make_int4(3, 3, 3, 3);
+                  int4 _188 = make_int4(0, 0, 0, 0);
+                  _186.x = (_187.x>=_188.x);
+                  _186.y = (_187.y>=_188.y);
+                  _186.z = (_187.z>=_188.z);
+                  _186.w = (_187.w>=_188.w);
+                ushort4 _189;
+                  int4 _190 = make_int4(0, 0, 0, 0);
+                  _189.x = (_180.x>=_190.x);
+                  _189.y = (_180.y>=_190.y);
+                  _189.z = (_180.z>=_190.z);
+                  _189.w = (_180.w>=_190.w);
+                _185.x = (_186.x&&_189.x);
+                _185.y = (_186.y&&_189.y);
+                _185.z = (_186.z&&_189.z);
+                _185.w = (_186.w&&_189.w);
+              ushort4 _191;
+                ushort4 _192;
+                  int4 _193 = make_int4(3, 3, 3, 3);
+                  int4 _194 = make_int4(0, 0, 0, 0);
+                  _192.x = (_193.x<_194.x);
+                  _192.y = (_193.y<_194.y);
+                  _192.z = (_193.z<_194.z);
+                  _192.w = (_193.w<_194.w);
+                ushort4 _195;
+                  int4 _196 = make_int4(0, 0, 0, 0);
+                  _195.x = (_180.x<=_196.x);
+                  _195.y = (_180.y<=_196.y);
+                  _195.z = (_180.z<=_196.z);
+                  _195.w = (_180.w<=_196.w);
+                _191.x = (_192.x&&_195.x);
+                _191.y = (_192.y&&_195.y);
+                _191.z = (_192.z&&_195.z);
+                _191.w = (_192.w&&_195.w);
+              _184.x = (_185.x||_191.x);
+              _184.y = (_185.y||_191.y);
+              _184.z = (_185.z||_191.z);
+              _184.w = (_185.w||_191.w);
+            int4 _197;
+              int4 _198 = make_int4(3, 3, 3, 3);
+              _197.x = (_180.x+_198.x);
+              _197.y = (_180.y+_198.y);
+              _197.z = (_180.z+_198.z);
+              _197.w = (_180.w+_198.w);
+            _183.x = (bool(_184.x)?_180.x:_197.x);
+            _183.y = (bool(_184.y)?_180.y:_197.y);
+            _183.z = (bool(_184.z)?_180.z:_197.z);
+            _183.w = (bool(_184.w)?_180.w:_197.w);
+            _133.x = (_134.x+_183.x);
+            _133.y = (_134.y+_183.y);
+            _133.z = (_134.z+_183.z);
+            _133.w = (_134.w+_183.w);
+          *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 256)) = make_float4(kernel[_133.x],kernel[_133.y],kernel[_133.z],kernel[_133.w]);
+          int4 _199;
+            int4 _200;
+              int4 _201;
+                int4 _202 = make_int4((((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 36864), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 36864), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 36864), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 36864));
+                int4 _203;
+                  int4 _204;
+                    int4 _205;
+                      int4 _206 = make_int4((((((int)threadIdx.x) * 4) + 384))+(1*0), (((((int)threadIdx.x) * 4) + 384))+(1*1), (((((int)threadIdx.x) * 4) + 384))+(1*2), (((((int)threadIdx.x) * 4) + 384))+(1*3));
+                      int4 _207 = make_int4(3, 3, 3, 3);
+                      _205.x = (_206.x%_207.x);
+                      _205.y = (_206.y%_207.y);
+                      _205.z = (_206.z%_207.z);
+                      _205.w = (_206.w%_207.w);
+                    int4 _208;
+                      int4 _209 = make_int4((((((int)threadIdx.x) * 4) + 384))+(1*0), (((((int)threadIdx.x) * 4) + 384))+(1*1), (((((int)threadIdx.x) * 4) + 384))+(1*2), (((((int)threadIdx.x) * 4) + 384))+(1*3));
+                      int4 _210 = make_int4(3, 3, 3, 3);
+                      _208.x = (_209.x/_210.x);
+                      _208.y = (_209.y/_210.y);
+                      _208.z = (_209.z/_210.z);
+                      _208.w = (_209.w/_210.w);
+                    int4 _211;
+                    ushort4 _212;
+                      ushort4 _213;
+                        ushort4 _214;
+                          int4 _215 = make_int4(3, 3, 3, 3);
+                          int4 _216 = make_int4(0, 0, 0, 0);
+                          _214.x = (_215.x>=_216.x);
+                          _214.y = (_215.y>=_216.y);
+                          _214.z = (_215.z>=_216.z);
+                          _214.w = (_215.w>=_216.w);
+                        ushort4 _217;
+                          int4 _218 = make_int4(0, 0, 0, 0);
+                          _217.x = (_205.x>=_218.x);
+                          _217.y = (_205.y>=_218.y);
+                          _217.z = (_205.z>=_218.z);
+                          _217.w = (_205.w>=_218.w);
+                        _213.x = (_214.x&&_217.x);
+                        _213.y = (_214.y&&_217.y);
+                        _213.z = (_214.z&&_217.z);
+                        _213.w = (_214.w&&_217.w);
+                      ushort4 _219;
+                        ushort4 _220;
+                          int4 _221 = make_int4(3, 3, 3, 3);
+                          int4 _222 = make_int4(0, 0, 0, 0);
+                          _220.x = (_221.x<_222.x);
+                          _220.y = (_221.y<_222.y);
+                          _220.z = (_221.z<_222.z);
+                          _220.w = (_221.w<_222.w);
+                        ushort4 _223;
+                          int4 _224 = make_int4(0, 0, 0, 0);
+                          _223.x = (_205.x<=_224.x);
+                          _223.y = (_205.y<=_224.y);
+                          _223.z = (_205.z<=_224.z);
+                          _223.w = (_205.w<=_224.w);
+                        _219.x = (_220.x&&_223.x);
+                        _219.y = (_220.y&&_223.y);
+                        _219.z = (_220.z&&_223.z);
+                        _219.w = (_220.w&&_223.w);
+                      _212.x = (_213.x||_219.x);
+                      _212.y = (_213.y||_219.y);
+                      _212.z = (_213.z||_219.z);
+                      _212.w = (_213.w||_219.w);
+                    int4 _225;
+                      int4 _226 = make_int4(1, 1, 1, 1);
+                      _225.x = (_208.x-_226.x);
+                      _225.y = (_208.y-_226.y);
+                      _225.z = (_208.z-_226.z);
+                      _225.w = (_208.w-_226.w);
+                    _211.x = (bool(_212.x)?_208.x:_225.x);
+                    _211.y = (bool(_212.y)?_208.y:_225.y);
+                    _211.z = (bool(_212.z)?_208.z:_225.z);
+                    _211.w = (bool(_212.w)?_208.w:_225.w);
+                    int4 _227 = make_int4(16, 16, 16, 16);
+                    _204.x = (_211.x%_227.x);
+                    _204.y = (_211.y%_227.y);
+                    _204.z = (_211.z%_227.z);
+                    _204.w = (_211.w%_227.w);
+                  int4 _228;
+                  ushort4 _229;
+                    ushort4 _230;
+                      ushort4 _231;
+                        int4 _232 = make_int4(16, 16, 16, 16);
+                        int4 _233 = make_int4(0, 0, 0, 0);
+                        _231.x = (_232.x>=_233.x);
+                        _231.y = (_232.y>=_233.y);
+                        _231.z = (_232.z>=_233.z);
+                        _231.w = (_232.w>=_233.w);
+                      ushort4 _234;
+                        int4 _235 = make_int4(0, 0, 0, 0);
+                        _234.x = (_204.x>=_235.x);
+                        _234.y = (_204.y>=_235.y);
+                        _234.z = (_204.z>=_235.z);
+                        _234.w = (_204.w>=_235.w);
+                      _230.x = (_231.x&&_234.x);
+                      _230.y = (_231.y&&_234.y);
+                      _230.z = (_231.z&&_234.z);
+                      _230.w = (_231.w&&_234.w);
+                    ushort4 _236;
+                      ushort4 _237;
+                        int4 _238 = make_int4(16, 16, 16, 16);
+                        int4 _239 = make_int4(0, 0, 0, 0);
+                        _237.x = (_238.x<_239.x);
+                        _237.y = (_238.y<_239.y);
+                        _237.z = (_238.z<_239.z);
+                        _237.w = (_238.w<_239.w);
+                      ushort4 _240;
+                        int4 _241 = make_int4(0, 0, 0, 0);
+                        _240.x = (_204.x<=_241.x);
+                        _240.y = (_204.y<=_241.y);
+                        _240.z = (_204.z<=_241.z);
+                        _240.w = (_204.w<=_241.w);
+                      _236.x = (_237.x&&_240.x);
+                      _236.y = (_237.y&&_240.y);
+                      _236.z = (_237.z&&_240.z);
+                      _236.w = (_237.w&&_240.w);
+                    _229.x = (_230.x||_236.x);
+                    _229.y = (_230.y||_236.y);
+                    _229.z = (_230.z||_236.z);
+                    _229.w = (_230.w||_236.w);
+                  int4 _242;
+                    int4 _243 = make_int4(16, 16, 16, 16);
+                    _242.x = (_204.x+_243.x);
+                    _242.y = (_204.y+_243.y);
+                    _242.z = (_204.z+_243.z);
+                    _242.w = (_204.w+_243.w);
+                  _228.x = (bool(_229.x)?_204.x:_242.x);
+                  _228.y = (bool(_229.y)?_204.y:_242.y);
+                  _228.z = (bool(_229.z)?_204.z:_242.z);
+                  _228.w = (bool(_229.w)?_204.w:_242.w);
+                  int4 _244 = make_int4(9, 9, 9, 9);
+                  _203.x = (_228.x*_244.x);
+                  _203.y = (_228.y*_244.y);
+                  _203.z = (_228.z*_244.z);
+                  _203.w = (_228.w*_244.w);
+                _201.x = (_202.x+_203.x);
+                _201.y = (_202.y+_203.y);
+                _201.z = (_202.z+_203.z);
+                _201.w = (_202.w+_203.w);
+              int4 _245 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+              _200.x = (_201.x+_245.x);
+              _200.y = (_201.y+_245.y);
+              _200.z = (_201.z+_245.z);
+              _200.w = (_201.w+_245.w);
+            int4 _246;
+              int4 _247 = make_int4(((((int)threadIdx.x) + 96))+(1*0), ((((int)threadIdx.x) + 96))+(1*1), ((((int)threadIdx.x) + 96))+(1*2), ((((int)threadIdx.x) + 96))+(1*3));
+              int4 _248 = make_int4(3, 3, 3, 3);
+              _246.x = (_247.x%_248.x);
+              _246.y = (_247.y%_248.y);
+              _246.z = (_247.z%_248.z);
+              _246.w = (_247.w%_248.w);
+            int4 _249;
+            ushort4 _250;
+              ushort4 _251;
+                ushort4 _252;
+                  int4 _253 = make_int4(3, 3, 3, 3);
+                  int4 _254 = make_int4(0, 0, 0, 0);
+                  _252.x = (_253.x>=_254.x);
+                  _252.y = (_253.y>=_254.y);
+                  _252.z = (_253.z>=_254.z);
+                  _252.w = (_253.w>=_254.w);
+                ushort4 _255;
+                  int4 _256 = make_int4(0, 0, 0, 0);
+                  _255.x = (_246.x>=_256.x);
+                  _255.y = (_246.y>=_256.y);
+                  _255.z = (_246.z>=_256.z);
+                  _255.w = (_246.w>=_256.w);
+                _251.x = (_252.x&&_255.x);
+                _251.y = (_252.y&&_255.y);
+                _251.z = (_252.z&&_255.z);
+                _251.w = (_252.w&&_255.w);
+              ushort4 _257;
+                ushort4 _258;
+                  int4 _259 = make_int4(3, 3, 3, 3);
+                  int4 _260 = make_int4(0, 0, 0, 0);
+                  _258.x = (_259.x<_260.x);
+                  _258.y = (_259.y<_260.y);
+                  _258.z = (_259.z<_260.z);
+                  _258.w = (_259.w<_260.w);
+                ushort4 _261;
+                  int4 _262 = make_int4(0, 0, 0, 0);
+                  _261.x = (_246.x<=_262.x);
+                  _261.y = (_246.y<=_262.y);
+                  _261.z = (_246.z<=_262.z);
+                  _261.w = (_246.w<=_262.w);
+                _257.x = (_258.x&&_261.x);
+                _257.y = (_258.y&&_261.y);
+                _257.z = (_258.z&&_261.z);
+                _257.w = (_258.w&&_261.w);
+              _250.x = (_251.x||_257.x);
+              _250.y = (_251.y||_257.y);
+              _250.z = (_251.z||_257.z);
+              _250.w = (_251.w||_257.w);
+            int4 _263;
+              int4 _264 = make_int4(3, 3, 3, 3);
+              _263.x = (_246.x+_264.x);
+              _263.y = (_246.y+_264.y);
+              _263.z = (_246.z+_264.z);
+              _263.w = (_246.w+_264.w);
+            _249.x = (bool(_250.x)?_246.x:_263.x);
+            _249.y = (bool(_250.y)?_246.y:_263.y);
+            _249.z = (bool(_250.z)?_246.z:_263.z);
+            _249.w = (bool(_250.w)?_246.w:_263.w);
+            _199.x = (_200.x+_249.x);
+            _199.y = (_200.y+_249.y);
+            _199.z = (_200.z+_249.z);
+            _199.w = (_200.w+_249.w);
+          *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 384)) = make_float4(kernel[_199.x],kernel[_199.y],kernel[_199.z],kernel[_199.w]);
+          int4 _265;
+            int4 _266;
+              int4 _267;
+                int4 _268 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 512) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 512) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 512) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 512) / 48) * 4608)) + (rc_outer_outer [...]
+                int4 _269;
+                  int4 _270;
+                    int4 _271;
+                      int4 _272 = make_int4((((((int)threadIdx.x) * 4) + 512))+(1*0), (((((int)threadIdx.x) * 4) + 512))+(1*1), (((((int)threadIdx.x) * 4) + 512))+(1*2), (((((int)threadIdx.x) * 4) + 512))+(1*3));
+                      int4 _273 = make_int4(3, 3, 3, 3);
+                      _271.x = (_272.x%_273.x);
+                      _271.y = (_272.y%_273.y);
+                      _271.z = (_272.z%_273.z);
+                      _271.w = (_272.w%_273.w);
+                    int4 _274;
+                      int4 _275 = make_int4((((((int)threadIdx.x) * 4) + 512))+(1*0), (((((int)threadIdx.x) * 4) + 512))+(1*1), (((((int)threadIdx.x) * 4) + 512))+(1*2), (((((int)threadIdx.x) * 4) + 512))+(1*3));
+                      int4 _276 = make_int4(3, 3, 3, 3);
+                      _274.x = (_275.x/_276.x);
+                      _274.y = (_275.y/_276.y);
+                      _274.z = (_275.z/_276.z);
+                      _274.w = (_275.w/_276.w);
+                    int4 _277;
+                    ushort4 _278;
+                      ushort4 _279;
+                        ushort4 _280;
+                          int4 _281 = make_int4(3, 3, 3, 3);
+                          int4 _282 = make_int4(0, 0, 0, 0);
+                          _280.x = (_281.x>=_282.x);
+                          _280.y = (_281.y>=_282.y);
+                          _280.z = (_281.z>=_282.z);
+                          _280.w = (_281.w>=_282.w);
+                        ushort4 _283;
+                          int4 _284 = make_int4(0, 0, 0, 0);
+                          _283.x = (_271.x>=_284.x);
+                          _283.y = (_271.y>=_284.y);
+                          _283.z = (_271.z>=_284.z);
+                          _283.w = (_271.w>=_284.w);
+                        _279.x = (_280.x&&_283.x);
+                        _279.y = (_280.y&&_283.y);
+                        _279.z = (_280.z&&_283.z);
+                        _279.w = (_280.w&&_283.w);
+                      ushort4 _285;
+                        ushort4 _286;
+                          int4 _287 = make_int4(3, 3, 3, 3);
+                          int4 _288 = make_int4(0, 0, 0, 0);
+                          _286.x = (_287.x<_288.x);
+                          _286.y = (_287.y<_288.y);
+                          _286.z = (_287.z<_288.z);
+                          _286.w = (_287.w<_288.w);
+                        ushort4 _289;
+                          int4 _290 = make_int4(0, 0, 0, 0);
+                          _289.x = (_271.x<=_290.x);
+                          _289.y = (_271.y<=_290.y);
+                          _289.z = (_271.z<=_290.z);
+                          _289.w = (_271.w<=_290.w);
+                        _285.x = (_286.x&&_289.x);
+                        _285.y = (_286.y&&_289.y);
+                        _285.z = (_286.z&&_289.z);
+                        _285.w = (_286.w&&_289.w);
+                      _278.x = (_279.x||_285.x);
+                      _278.y = (_279.y||_285.y);
+                      _278.z = (_279.z||_285.z);
+                      _278.w = (_279.w||_285.w);
+                    int4 _291;
+                      int4 _292 = make_int4(1, 1, 1, 1);
+                      _291.x = (_274.x-_292.x);
+                      _291.y = (_274.y-_292.y);
+                      _291.z = (_274.z-_292.z);
+                      _291.w = (_274.w-_292.w);
+                    _277.x = (bool(_278.x)?_274.x:_291.x);
+                    _277.y = (bool(_278.y)?_274.y:_291.y);
+                    _277.z = (bool(_278.z)?_274.z:_291.z);
+                    _277.w = (bool(_278.w)?_274.w:_291.w);
+                    int4 _293 = make_int4(16, 16, 16, 16);
+                    _270.x = (_277.x%_293.x);
+                    _270.y = (_277.y%_293.y);
+                    _270.z = (_277.z%_293.z);
+                    _270.w = (_277.w%_293.w);
+                  int4 _294;
+                  ushort4 _295;
+                    ushort4 _296;
+                      ushort4 _297;
+                        int4 _298 = make_int4(16, 16, 16, 16);
+                        int4 _299 = make_int4(0, 0, 0, 0);
+                        _297.x = (_298.x>=_299.x);
+                        _297.y = (_298.y>=_299.y);
+                        _297.z = (_298.z>=_299.z);
+                        _297.w = (_298.w>=_299.w);
+                      ushort4 _300;
+                        int4 _301 = make_int4(0, 0, 0, 0);
+                        _300.x = (_270.x>=_301.x);
+                        _300.y = (_270.y>=_301.y);
+                        _300.z = (_270.z>=_301.z);
+                        _300.w = (_270.w>=_301.w);
+                      _296.x = (_297.x&&_300.x);
+                      _296.y = (_297.y&&_300.y);
+                      _296.z = (_297.z&&_300.z);
+                      _296.w = (_297.w&&_300.w);
+                    ushort4 _302;
+                      ushort4 _303;
+                        int4 _304 = make_int4(16, 16, 16, 16);
+                        int4 _305 = make_int4(0, 0, 0, 0);
+                        _303.x = (_304.x<_305.x);
+                        _303.y = (_304.y<_305.y);
+                        _303.z = (_304.z<_305.z);
+                        _303.w = (_304.w<_305.w);
+                      ushort4 _306;
+                        int4 _307 = make_int4(0, 0, 0, 0);
+                        _306.x = (_270.x<=_307.x);
+                        _306.y = (_270.y<=_307.y);
+                        _306.z = (_270.z<=_307.z);
+                        _306.w = (_270.w<=_307.w);
+                      _302.x = (_303.x&&_306.x);
+                      _302.y = (_303.y&&_306.y);
+                      _302.z = (_303.z&&_306.z);
+                      _302.w = (_303.w&&_306.w);
+                    _295.x = (_296.x||_302.x);
+                    _295.y = (_296.y||_302.y);
+                    _295.z = (_296.z||_302.z);
+                    _295.w = (_296.w||_302.w);
+                  int4 _308;
+                    int4 _309 = make_int4(16, 16, 16, 16);
+                    _308.x = (_270.x+_309.x);
+                    _308.y = (_270.y+_309.y);
+                    _308.z = (_270.z+_309.z);
+                    _308.w = (_270.w+_309.w);
+                  _294.x = (bool(_295.x)?_270.x:_308.x);
+                  _294.y = (bool(_295.y)?_270.y:_308.y);
+                  _294.z = (bool(_295.z)?_270.z:_308.z);
+                  _294.w = (bool(_295.w)?_270.w:_308.w);
+                  int4 _310 = make_int4(9, 9, 9, 9);
+                  _269.x = (_294.x*_310.x);
+                  _269.y = (_294.y*_310.y);
+                  _269.z = (_294.z*_310.z);
+                  _269.w = (_294.w*_310.w);
+                _267.x = (_268.x+_269.x);
+                _267.y = (_268.y+_269.y);
+                _267.z = (_268.z+_269.z);
+                _267.w = (_268.w+_269.w);
+              int4 _311 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+              _266.x = (_267.x+_311.x);
+              _266.y = (_267.y+_311.y);
+              _266.z = (_267.z+_311.z);
+              _266.w = (_267.w+_311.w);
+            int4 _312;
+              int4 _313 = make_int4(((((int)threadIdx.x) + 128))+(1*0), ((((int)threadIdx.x) + 128))+(1*1), ((((int)threadIdx.x) + 128))+(1*2), ((((int)threadIdx.x) + 128))+(1*3));
+              int4 _314 = make_int4(3, 3, 3, 3);
+              _312.x = (_313.x%_314.x);
+              _312.y = (_313.y%_314.y);
+              _312.z = (_313.z%_314.z);
+              _312.w = (_313.w%_314.w);
+            int4 _315;
+            ushort4 _316;
+              ushort4 _317;
+                ushort4 _318;
+                  int4 _319 = make_int4(3, 3, 3, 3);
+                  int4 _320 = make_int4(0, 0, 0, 0);
+                  _318.x = (_319.x>=_320.x);
+                  _318.y = (_319.y>=_320.y);
+                  _318.z = (_319.z>=_320.z);
+                  _318.w = (_319.w>=_320.w);
+                ushort4 _321;
+                  int4 _322 = make_int4(0, 0, 0, 0);
+                  _321.x = (_312.x>=_322.x);
+                  _321.y = (_312.y>=_322.y);
+                  _321.z = (_312.z>=_322.z);
+                  _321.w = (_312.w>=_322.w);
+                _317.x = (_318.x&&_321.x);
+                _317.y = (_318.y&&_321.y);
+                _317.z = (_318.z&&_321.z);
+                _317.w = (_318.w&&_321.w);
+              ushort4 _323;
+                ushort4 _324;
+                  int4 _325 = make_int4(3, 3, 3, 3);
+                  int4 _326 = make_int4(0, 0, 0, 0);
+                  _324.x = (_325.x<_326.x);
+                  _324.y = (_325.y<_326.y);
+                  _324.z = (_325.z<_326.z);
+                  _324.w = (_325.w<_326.w);
+                ushort4 _327;
+                  int4 _328 = make_int4(0, 0, 0, 0);
+                  _327.x = (_312.x<=_328.x);
+                  _327.y = (_312.y<=_328.y);
+                  _327.z = (_312.z<=_328.z);
+                  _327.w = (_312.w<=_328.w);
+                _323.x = (_324.x&&_327.x);
+                _323.y = (_324.y&&_327.y);
+                _323.z = (_324.z&&_327.z);
+                _323.w = (_324.w&&_327.w);
+              _316.x = (_317.x||_323.x);
+              _316.y = (_317.y||_323.y);
+              _316.z = (_317.z||_323.z);
+              _316.w = (_317.w||_323.w);
+            int4 _329;
+              int4 _330 = make_int4(3, 3, 3, 3);
+              _329.x = (_312.x+_330.x);
+              _329.y = (_312.y+_330.y);
+              _329.z = (_312.z+_330.z);
+              _329.w = (_312.w+_330.w);
+            _315.x = (bool(_316.x)?_312.x:_329.x);
+            _315.y = (bool(_316.y)?_312.y:_329.y);
+            _315.z = (bool(_316.z)?_312.z:_329.z);
+            _315.w = (bool(_316.w)?_312.w:_329.w);
+            _265.x = (_266.x+_315.x);
+            _265.y = (_266.y+_315.y);
+            _265.z = (_266.z+_315.z);
+            _265.w = (_266.w+_315.w);
+          *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 512)) = make_float4(kernel[_265.x],kernel[_265.y],kernel[_265.z],kernel[_265.w]);
+          int4 _331;
+            int4 _332;
+              int4 _333;
+                int4 _334 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 640) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 640) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 640) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 640) / 48) * 4608)) + (rc_outer_outer [...]
+                int4 _335;
+                  int4 _336;
+                    int4 _337;
+                      int4 _338 = make_int4((((((int)threadIdx.x) * 4) + 640))+(1*0), (((((int)threadIdx.x) * 4) + 640))+(1*1), (((((int)threadIdx.x) * 4) + 640))+(1*2), (((((int)threadIdx.x) * 4) + 640))+(1*3));
+                      int4 _339 = make_int4(3, 3, 3, 3);
+                      _337.x = (_338.x%_339.x);
+                      _337.y = (_338.y%_339.y);
+                      _337.z = (_338.z%_339.z);
+                      _337.w = (_338.w%_339.w);
+                    int4 _340;
+                      int4 _341 = make_int4((((((int)threadIdx.x) * 4) + 640))+(1*0), (((((int)threadIdx.x) * 4) + 640))+(1*1), (((((int)threadIdx.x) * 4) + 640))+(1*2), (((((int)threadIdx.x) * 4) + 640))+(1*3));
+                      int4 _342 = make_int4(3, 3, 3, 3);
+                      _340.x = (_341.x/_342.x);
+                      _340.y = (_341.y/_342.y);
+                      _340.z = (_341.z/_342.z);
+                      _340.w = (_341.w/_342.w);
+                    int4 _343;
+                    ushort4 _344;
+                      ushort4 _345;
+                        ushort4 _346;
+                          int4 _347 = make_int4(3, 3, 3, 3);
+                          int4 _348 = make_int4(0, 0, 0, 0);
+                          _346.x = (_347.x>=_348.x);
+                          _346.y = (_347.y>=_348.y);
+                          _346.z = (_347.z>=_348.z);
+                          _346.w = (_347.w>=_348.w);
+                        ushort4 _349;
+                          int4 _350 = make_int4(0, 0, 0, 0);
+                          _349.x = (_337.x>=_350.x);
+                          _349.y = (_337.y>=_350.y);
+                          _349.z = (_337.z>=_350.z);
+                          _349.w = (_337.w>=_350.w);
+                        _345.x = (_346.x&&_349.x);
+                        _345.y = (_346.y&&_349.y);
+                        _345.z = (_346.z&&_349.z);
+                        _345.w = (_346.w&&_349.w);
+                      ushort4 _351;
+                        ushort4 _352;
+                          int4 _353 = make_int4(3, 3, 3, 3);
+                          int4 _354 = make_int4(0, 0, 0, 0);
+                          _352.x = (_353.x<_354.x);
+                          _352.y = (_353.y<_354.y);
+                          _352.z = (_353.z<_354.z);
+                          _352.w = (_353.w<_354.w);
+                        ushort4 _355;
+                          int4 _356 = make_int4(0, 0, 0, 0);
+                          _355.x = (_337.x<=_356.x);
+                          _355.y = (_337.y<=_356.y);
+                          _355.z = (_337.z<=_356.z);
+                          _355.w = (_337.w<=_356.w);
+                        _351.x = (_352.x&&_355.x);
+                        _351.y = (_352.y&&_355.y);
+                        _351.z = (_352.z&&_355.z);
+                        _351.w = (_352.w&&_355.w);
+                      _344.x = (_345.x||_351.x);
+                      _344.y = (_345.y||_351.y);
+                      _344.z = (_345.z||_351.z);
+                      _344.w = (_345.w||_351.w);
+                    int4 _357;
+                      int4 _358 = make_int4(1, 1, 1, 1);
+                      _357.x = (_340.x-_358.x);
+                      _357.y = (_340.y-_358.y);
+                      _357.z = (_340.z-_358.z);
+                      _357.w = (_340.w-_358.w);
+                    _343.x = (bool(_344.x)?_340.x:_357.x);
+                    _343.y = (bool(_344.y)?_340.y:_357.y);
+                    _343.z = (bool(_344.z)?_340.z:_357.z);
+                    _343.w = (bool(_344.w)?_340.w:_357.w);
+                    int4 _359 = make_int4(16, 16, 16, 16);
+                    _336.x = (_343.x%_359.x);
+                    _336.y = (_343.y%_359.y);
+                    _336.z = (_343.z%_359.z);
+                    _336.w = (_343.w%_359.w);
+                  int4 _360;
+                  ushort4 _361;
+                    ushort4 _362;
+                      ushort4 _363;
+                        int4 _364 = make_int4(16, 16, 16, 16);
+                        int4 _365 = make_int4(0, 0, 0, 0);
+                        _363.x = (_364.x>=_365.x);
+                        _363.y = (_364.y>=_365.y);
+                        _363.z = (_364.z>=_365.z);
+                        _363.w = (_364.w>=_365.w);
+                      ushort4 _366;
+                        int4 _367 = make_int4(0, 0, 0, 0);
+                        _366.x = (_336.x>=_367.x);
+                        _366.y = (_336.y>=_367.y);
+                        _366.z = (_336.z>=_367.z);
+                        _366.w = (_336.w>=_367.w);
+                      _362.x = (_363.x&&_366.x);
+                      _362.y = (_363.y&&_366.y);
+                      _362.z = (_363.z&&_366.z);
+                      _362.w = (_363.w&&_366.w);
+                    ushort4 _368;
+                      ushort4 _369;
+                        int4 _370 = make_int4(16, 16, 16, 16);
+                        int4 _371 = make_int4(0, 0, 0, 0);
+                        _369.x = (_370.x<_371.x);
+                        _369.y = (_370.y<_371.y);
+                        _369.z = (_370.z<_371.z);
+                        _369.w = (_370.w<_371.w);
+                      ushort4 _372;
+                        int4 _373 = make_int4(0, 0, 0, 0);
+                        _372.x = (_336.x<=_373.x);
+                        _372.y = (_336.y<=_373.y);
+                        _372.z = (_336.z<=_373.z);
+                        _372.w = (_336.w<=_373.w);
+                      _368.x = (_369.x&&_372.x);
+                      _368.y = (_369.y&&_372.y);
+                      _368.z = (_369.z&&_372.z);
+                      _368.w = (_369.w&&_372.w);
+                    _361.x = (_362.x||_368.x);
+                    _361.y = (_362.y||_368.y);
+                    _361.z = (_362.z||_368.z);
+                    _361.w = (_362.w||_368.w);
+                  int4 _374;
+                    int4 _375 = make_int4(16, 16, 16, 16);
+                    _374.x = (_336.x+_375.x);
+                    _374.y = (_336.y+_375.y);
+                    _374.z = (_336.z+_375.z);
+                    _374.w = (_336.w+_375.w);
+                  _360.x = (bool(_361.x)?_336.x:_374.x);
+                  _360.y = (bool(_361.y)?_336.y:_374.y);
+                  _360.z = (bool(_361.z)?_336.z:_374.z);
+                  _360.w = (bool(_361.w)?_336.w:_374.w);
+                  int4 _376 = make_int4(9, 9, 9, 9);
+                  _335.x = (_360.x*_376.x);
+                  _335.y = (_360.y*_376.y);
+                  _335.z = (_360.z*_376.z);
+                  _335.w = (_360.w*_376.w);
+                _333.x = (_334.x+_335.x);
+                _333.y = (_334.y+_335.y);
+                _333.z = (_334.z+_335.z);
+                _333.w = (_334.w+_335.w);
+              int4 _377 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+              _332.x = (_333.x+_377.x);
+              _332.y = (_333.y+_377.y);
+              _332.z = (_333.z+_377.z);
+              _332.w = (_333.w+_377.w);
+            int4 _378;
+              int4 _379 = make_int4(((((int)threadIdx.x) + 160))+(1*0), ((((int)threadIdx.x) + 160))+(1*1), ((((int)threadIdx.x) + 160))+(1*2), ((((int)threadIdx.x) + 160))+(1*3));
+              int4 _380 = make_int4(3, 3, 3, 3);
+              _378.x = (_379.x%_380.x);
+              _378.y = (_379.y%_380.y);
+              _378.z = (_379.z%_380.z);
+              _378.w = (_379.w%_380.w);
+            int4 _381;
+            ushort4 _382;
+              ushort4 _383;
+                ushort4 _384;
+                  int4 _385 = make_int4(3, 3, 3, 3);
+                  int4 _386 = make_int4(0, 0, 0, 0);
+                  _384.x = (_385.x>=_386.x);
+                  _384.y = (_385.y>=_386.y);
+                  _384.z = (_385.z>=_386.z);
+                  _384.w = (_385.w>=_386.w);
+                ushort4 _387;
+                  int4 _388 = make_int4(0, 0, 0, 0);
+                  _387.x = (_378.x>=_388.x);
+                  _387.y = (_378.y>=_388.y);
+                  _387.z = (_378.z>=_388.z);
+                  _387.w = (_378.w>=_388.w);
+                _383.x = (_384.x&&_387.x);
+                _383.y = (_384.y&&_387.y);
+                _383.z = (_384.z&&_387.z);
+                _383.w = (_384.w&&_387.w);
+              ushort4 _389;
+                ushort4 _390;
+                  int4 _391 = make_int4(3, 3, 3, 3);
+                  int4 _392 = make_int4(0, 0, 0, 0);
+                  _390.x = (_391.x<_392.x);
+                  _390.y = (_391.y<_392.y);
+                  _390.z = (_391.z<_392.z);
+                  _390.w = (_391.w<_392.w);
+                ushort4 _393;
+                  int4 _394 = make_int4(0, 0, 0, 0);
+                  _393.x = (_378.x<=_394.x);
+                  _393.y = (_378.y<=_394.y);
+                  _393.z = (_378.z<=_394.z);
+                  _393.w = (_378.w<=_394.w);
+                _389.x = (_390.x&&_393.x);
+                _389.y = (_390.y&&_393.y);
+                _389.z = (_390.z&&_393.z);
+                _389.w = (_390.w&&_393.w);
+              _382.x = (_383.x||_389.x);
+              _382.y = (_383.y||_389.y);
+              _382.z = (_383.z||_389.z);
+              _382.w = (_383.w||_389.w);
+            int4 _395;
+              int4 _396 = make_int4(3, 3, 3, 3);
+              _395.x = (_378.x+_396.x);
+              _395.y = (_378.y+_396.y);
+              _395.z = (_378.z+_396.z);
+              _395.w = (_378.w+_396.w);
+            _381.x = (bool(_382.x)?_378.x:_395.x);
+            _381.y = (bool(_382.y)?_378.y:_395.y);
+            _381.z = (bool(_382.z)?_378.z:_395.z);
+            _381.w = (bool(_382.w)?_378.w:_395.w);
+            _331.x = (_332.x+_381.x);
+            _331.y = (_332.y+_381.y);
+            _331.z = (_332.z+_381.z);
+            _331.w = (_332.w+_381.w);
+          *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 640)) = make_float4(kernel[_331.x],kernel[_331.y],kernel[_331.z],kernel[_331.w]);
+          int4 _397;
+            int4 _398;
+              int4 _399;
+                int4 _400 = make_int4((((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 73728), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 73728), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 73728), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 73728));
+                int4 _401;
+                  int4 _402;
+                    int4 _403;
+                      int4 _404 = make_int4((((((int)threadIdx.x) * 4) + 768))+(1*0), (((((int)threadIdx.x) * 4) + 768))+(1*1), (((((int)threadIdx.x) * 4) + 768))+(1*2), (((((int)threadIdx.x) * 4) + 768))+(1*3));
+                      int4 _405 = make_int4(3, 3, 3, 3);
+                      _403.x = (_404.x%_405.x);
+                      _403.y = (_404.y%_405.y);
+                      _403.z = (_404.z%_405.z);
+                      _403.w = (_404.w%_405.w);
+                    int4 _406;
+                      int4 _407 = make_int4((((((int)threadIdx.x) * 4) + 768))+(1*0), (((((int)threadIdx.x) * 4) + 768))+(1*1), (((((int)threadIdx.x) * 4) + 768))+(1*2), (((((int)threadIdx.x) * 4) + 768))+(1*3));
+                      int4 _408 = make_int4(3, 3, 3, 3);
+                      _406.x = (_407.x/_408.x);
+                      _406.y = (_407.y/_408.y);
+                      _406.z = (_407.z/_408.z);
+                      _406.w = (_407.w/_408.w);
+                    int4 _409;
+                    ushort4 _410;
+                      ushort4 _411;
+                        ushort4 _412;
+                          int4 _413 = make_int4(3, 3, 3, 3);
+                          int4 _414 = make_int4(0, 0, 0, 0);
+                          _412.x = (_413.x>=_414.x);
+                          _412.y = (_413.y>=_414.y);
+                          _412.z = (_413.z>=_414.z);
+                          _412.w = (_413.w>=_414.w);
+                        ushort4 _415;
+                          int4 _416 = make_int4(0, 0, 0, 0);
+                          _415.x = (_403.x>=_416.x);
+                          _415.y = (_403.y>=_416.y);
+                          _415.z = (_403.z>=_416.z);
+                          _415.w = (_403.w>=_416.w);
+                        _411.x = (_412.x&&_415.x);
+                        _411.y = (_412.y&&_415.y);
+                        _411.z = (_412.z&&_415.z);
+                        _411.w = (_412.w&&_415.w);
+                      ushort4 _417;
+                        ushort4 _418;
+                          int4 _419 = make_int4(3, 3, 3, 3);
+                          int4 _420 = make_int4(0, 0, 0, 0);
+                          _418.x = (_419.x<_420.x);
+                          _418.y = (_419.y<_420.y);
+                          _418.z = (_419.z<_420.z);
+                          _418.w = (_419.w<_420.w);
+                        ushort4 _421;
+                          int4 _422 = make_int4(0, 0, 0, 0);
+                          _421.x = (_403.x<=_422.x);
+                          _421.y = (_403.y<=_422.y);
+                          _421.z = (_403.z<=_422.z);
+                          _421.w = (_403.w<=_422.w);
+                        _417.x = (_418.x&&_421.x);
+                        _417.y = (_418.y&&_421.y);
+                        _417.z = (_418.z&&_421.z);
+                        _417.w = (_418.w&&_421.w);
+                      _410.x = (_411.x||_417.x);
+                      _410.y = (_411.y||_417.y);
+                      _410.z = (_411.z||_417.z);
+                      _410.w = (_411.w||_417.w);
+                    int4 _423;
+                      int4 _424 = make_int4(1, 1, 1, 1);
+                      _423.x = (_406.x-_424.x);
+                      _423.y = (_406.y-_424.y);
+                      _423.z = (_406.z-_424.z);
+                      _423.w = (_406.w-_424.w);
+                    _409.x = (bool(_410.x)?_406.x:_423.x);
+                    _409.y = (bool(_410.y)?_406.y:_423.y);
+                    _409.z = (bool(_410.z)?_406.z:_423.z);
+                    _409.w = (bool(_410.w)?_406.w:_423.w);
+                    int4 _425 = make_int4(16, 16, 16, 16);
+                    _402.x = (_409.x%_425.x);
+                    _402.y = (_409.y%_425.y);
+                    _402.z = (_409.z%_425.z);
+                    _402.w = (_409.w%_425.w);
+                  int4 _426;
+                  ushort4 _427;
+                    ushort4 _428;
+                      ushort4 _429;
+                        int4 _430 = make_int4(16, 16, 16, 16);
+                        int4 _431 = make_int4(0, 0, 0, 0);
+                        _429.x = (_430.x>=_431.x);
+                        _429.y = (_430.y>=_431.y);
+                        _429.z = (_430.z>=_431.z);
+                        _429.w = (_430.w>=_431.w);
+                      ushort4 _432;
+                        int4 _433 = make_int4(0, 0, 0, 0);
+                        _432.x = (_402.x>=_433.x);
+                        _432.y = (_402.y>=_433.y);
+                        _432.z = (_402.z>=_433.z);
+                        _432.w = (_402.w>=_433.w);
+                      _428.x = (_429.x&&_432.x);
+                      _428.y = (_429.y&&_432.y);
+                      _428.z = (_429.z&&_432.z);
+                      _428.w = (_429.w&&_432.w);
+                    ushort4 _434;
+                      ushort4 _435;
+                        int4 _436 = make_int4(16, 16, 16, 16);
+                        int4 _437 = make_int4(0, 0, 0, 0);
+                        _435.x = (_436.x<_437.x);
+                        _435.y = (_436.y<_437.y);
+                        _435.z = (_436.z<_437.z);
+                        _435.w = (_436.w<_437.w);
+                      ushort4 _438;
+                        int4 _439 = make_int4(0, 0, 0, 0);
+                        _438.x = (_402.x<=_439.x);
+                        _438.y = (_402.y<=_439.y);
+                        _438.z = (_402.z<=_439.z);
+                        _438.w = (_402.w<=_439.w);
+                      _434.x = (_435.x&&_438.x);
+                      _434.y = (_435.y&&_438.y);
+                      _434.z = (_435.z&&_438.z);
+                      _434.w = (_435.w&&_438.w);
+                    _427.x = (_428.x||_434.x);
+                    _427.y = (_428.y||_434.y);
+                    _427.z = (_428.z||_434.z);
+                    _427.w = (_428.w||_434.w);
+                  int4 _440;
+                    int4 _441 = make_int4(16, 16, 16, 16);
+                    _440.x = (_402.x+_441.x);
+                    _440.y = (_402.y+_441.y);
+                    _440.z = (_402.z+_441.z);
+                    _440.w = (_402.w+_441.w);
+                  _426.x = (bool(_427.x)?_402.x:_440.x);
+                  _426.y = (bool(_427.y)?_402.y:_440.y);
+                  _426.z = (bool(_427.z)?_402.z:_440.z);
+                  _426.w = (bool(_427.w)?_402.w:_440.w);
+                  int4 _442 = make_int4(9, 9, 9, 9);
+                  _401.x = (_426.x*_442.x);
+                  _401.y = (_426.y*_442.y);
+                  _401.z = (_426.z*_442.z);
+                  _401.w = (_426.w*_442.w);
+                _399.x = (_400.x+_401.x);
+                _399.y = (_400.y+_401.y);
+                _399.z = (_400.z+_401.z);
+                _399.w = (_400.w+_401.w);
+              int4 _443 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+              _398.x = (_399.x+_443.x);
+              _398.y = (_399.y+_443.y);
+              _398.z = (_399.z+_443.z);
+              _398.w = (_399.w+_443.w);
+            int4 _444;
+              int4 _445 = make_int4(((((int)threadIdx.x) + 192))+(1*0), ((((int)threadIdx.x) + 192))+(1*1), ((((int)threadIdx.x) + 192))+(1*2), ((((int)threadIdx.x) + 192))+(1*3));
+              int4 _446 = make_int4(3, 3, 3, 3);
+              _444.x = (_445.x%_446.x);
+              _444.y = (_445.y%_446.y);
+              _444.z = (_445.z%_446.z);
+              _444.w = (_445.w%_446.w);
+            int4 _447;
+            ushort4 _448;
+              ushort4 _449;
+                ushort4 _450;
+                  int4 _451 = make_int4(3, 3, 3, 3);
+                  int4 _452 = make_int4(0, 0, 0, 0);
+                  _450.x = (_451.x>=_452.x);
+                  _450.y = (_451.y>=_452.y);
+                  _450.z = (_451.z>=_452.z);
+                  _450.w = (_451.w>=_452.w);
+                ushort4 _453;
+                  int4 _454 = make_int4(0, 0, 0, 0);
+                  _453.x = (_444.x>=_454.x);
+                  _453.y = (_444.y>=_454.y);
+                  _453.z = (_444.z>=_454.z);
+                  _453.w = (_444.w>=_454.w);
+                _449.x = (_450.x&&_453.x);
+                _449.y = (_450.y&&_453.y);
+                _449.z = (_450.z&&_453.z);
+                _449.w = (_450.w&&_453.w);
+              ushort4 _455;
+                ushort4 _456;
+                  int4 _457 = make_int4(3, 3, 3, 3);
+                  int4 _458 = make_int4(0, 0, 0, 0);
+                  _456.x = (_457.x<_458.x);
+                  _456.y = (_457.y<_458.y);
+                  _456.z = (_457.z<_458.z);
+                  _456.w = (_457.w<_458.w);
+                ushort4 _459;
+                  int4 _460 = make_int4(0, 0, 0, 0);
+                  _459.x = (_444.x<=_460.x);
+                  _459.y = (_444.y<=_460.y);
+                  _459.z = (_444.z<=_460.z);
+                  _459.w = (_444.w<=_460.w);
+                _455.x = (_456.x&&_459.x);
+                _455.y = (_456.y&&_459.y);
+                _455.z = (_456.z&&_459.z);
+                _455.w = (_456.w&&_459.w);
+              _448.x = (_449.x||_455.x);
+              _448.y = (_449.y||_455.y);
+              _448.z = (_449.z||_455.z);
+              _448.w = (_449.w||_455.w);
+            int4 _461;
+              int4 _462 = make_int4(3, 3, 3, 3);
+              _461.x = (_444.x+_462.x);
+              _461.y = (_444.y+_462.y);
+              _461.z = (_444.z+_462.z);
+              _461.w = (_444.w+_462.w);
+            _447.x = (bool(_448.x)?_444.x:_461.x);
+            _447.y = (bool(_448.y)?_444.y:_461.y);
+            _447.z = (bool(_448.z)?_444.z:_461.z);
+            _447.w = (bool(_448.w)?_444.w:_461.w);
+            _397.x = (_398.x+_447.x);
+            _397.y = (_398.y+_447.y);
+            _397.z = (_398.z+_447.z);
+            _397.w = (_398.w+_447.w);
+          *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 768)) = make_float4(kernel[_397.x],kernel[_397.y],kernel[_397.z],kernel[_397.w]);
+          int4 _463;
+            int4 _464;
+              int4 _465;
+                int4 _466 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 896) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 896) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 896) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 896) / 48) * 4608)) + (rc_outer_outer [...]
+                int4 _467;
+                  int4 _468;
+                    int4 _469;
+                      int4 _470 = make_int4((((((int)threadIdx.x) * 4) + 896))+(1*0), (((((int)threadIdx.x) * 4) + 896))+(1*1), (((((int)threadIdx.x) * 4) + 896))+(1*2), (((((int)threadIdx.x) * 4) + 896))+(1*3));
+                      int4 _471 = make_int4(3, 3, 3, 3);
+                      _469.x = (_470.x%_471.x);
+                      _469.y = (_470.y%_471.y);
+                      _469.z = (_470.z%_471.z);
+                      _469.w = (_470.w%_471.w);
+                    int4 _472;
+                      int4 _473 = make_int4((((((int)threadIdx.x) * 4) + 896))+(1*0), (((((int)threadIdx.x) * 4) + 896))+(1*1), (((((int)threadIdx.x) * 4) + 896))+(1*2), (((((int)threadIdx.x) * 4) + 896))+(1*3));
+                      int4 _474 = make_int4(3, 3, 3, 3);
+                      _472.x = (_473.x/_474.x);
+                      _472.y = (_473.y/_474.y);
+                      _472.z = (_473.z/_474.z);
+                      _472.w = (_473.w/_474.w);
+                    int4 _475;
+                    ushort4 _476;
+                      ushort4 _477;
+                        ushort4 _478;
+                          int4 _479 = make_int4(3, 3, 3, 3);
+                          int4 _480 = make_int4(0, 0, 0, 0);
+                          _478.x = (_479.x>=_480.x);
+                          _478.y = (_479.y>=_480.y);
+                          _478.z = (_479.z>=_480.z);
+                          _478.w = (_479.w>=_480.w);
+                        ushort4 _481;
+                          int4 _482 = make_int4(0, 0, 0, 0);
+                          _481.x = (_469.x>=_482.x);
+                          _481.y = (_469.y>=_482.y);
+                          _481.z = (_469.z>=_482.z);
+                          _481.w = (_469.w>=_482.w);
+                        _477.x = (_478.x&&_481.x);
+                        _477.y = (_478.y&&_481.y);
+                        _477.z = (_478.z&&_481.z);
+                        _477.w = (_478.w&&_481.w);
+                      ushort4 _483;
+                        ushort4 _484;
+                          int4 _485 = make_int4(3, 3, 3, 3);
+                          int4 _486 = make_int4(0, 0, 0, 0);
+                          _484.x = (_485.x<_486.x);
+                          _484.y = (_485.y<_486.y);
+                          _484.z = (_485.z<_486.z);
+                          _484.w = (_485.w<_486.w);
+                        ushort4 _487;
+                          int4 _488 = make_int4(0, 0, 0, 0);
+                          _487.x = (_469.x<=_488.x);
+                          _487.y = (_469.y<=_488.y);
+                          _487.z = (_469.z<=_488.z);
+                          _487.w = (_469.w<=_488.w);
+                        _483.x = (_484.x&&_487.x);
+                        _483.y = (_484.y&&_487.y);
+                        _483.z = (_484.z&&_487.z);
+                        _483.w = (_484.w&&_487.w);
+                      _476.x = (_477.x||_483.x);
+                      _476.y = (_477.y||_483.y);
+                      _476.z = (_477.z||_483.z);
+                      _476.w = (_477.w||_483.w);
+                    int4 _489;
+                      int4 _490 = make_int4(1, 1, 1, 1);
+                      _489.x = (_472.x-_490.x);
+                      _489.y = (_472.y-_490.y);
+                      _489.z = (_472.z-_490.z);
+                      _489.w = (_472.w-_490.w);
+                    _475.x = (bool(_476.x)?_472.x:_489.x);
+                    _475.y = (bool(_476.y)?_472.y:_489.y);
+                    _475.z = (bool(_476.z)?_472.z:_489.z);
+                    _475.w = (bool(_476.w)?_472.w:_489.w);
+                    int4 _491 = make_int4(16, 16, 16, 16);
+                    _468.x = (_475.x%_491.x);
+                    _468.y = (_475.y%_491.y);
+                    _468.z = (_475.z%_491.z);
+                    _468.w = (_475.w%_491.w);
+                  int4 _492;
+                  ushort4 _493;
+                    ushort4 _494;
+                      ushort4 _495;
+                        int4 _496 = make_int4(16, 16, 16, 16);
+                        int4 _497 = make_int4(0, 0, 0, 0);
+                        _495.x = (_496.x>=_497.x);
+                        _495.y = (_496.y>=_497.y);
+                        _495.z = (_496.z>=_497.z);
+                        _495.w = (_496.w>=_497.w);
+                      ushort4 _498;
+                        int4 _499 = make_int4(0, 0, 0, 0);
+                        _498.x = (_468.x>=_499.x);
+                        _498.y = (_468.y>=_499.y);
+                        _498.z = (_468.z>=_499.z);
+                        _498.w = (_468.w>=_499.w);
+                      _494.x = (_495.x&&_498.x);
+                      _494.y = (_495.y&&_498.y);
+                      _494.z = (_495.z&&_498.z);
+                      _494.w = (_495.w&&_498.w);
+                    ushort4 _500;
+                      ushort4 _501;
+                        int4 _502 = make_int4(16, 16, 16, 16);
+                        int4 _503 = make_int4(0, 0, 0, 0);
+                        _501.x = (_502.x<_503.x);
+                        _501.y = (_502.y<_503.y);
+                        _501.z = (_502.z<_503.z);
+                        _501.w = (_502.w<_503.w);
+                      ushort4 _504;
+                        int4 _505 = make_int4(0, 0, 0, 0);
+                        _504.x = (_468.x<=_505.x);
+                        _504.y = (_468.y<=_505.y);
+                        _504.z = (_468.z<=_505.z);
+                        _504.w = (_468.w<=_505.w);
+                      _500.x = (_501.x&&_504.x);
+                      _500.y = (_501.y&&_504.y);
+                      _500.z = (_501.z&&_504.z);
+                      _500.w = (_501.w&&_504.w);
+                    _493.x = (_494.x||_500.x);
+                    _493.y = (_494.y||_500.y);
+                    _493.z = (_494.z||_500.z);
+                    _493.w = (_494.w||_500.w);
+                  int4 _506;
+                    int4 _507 = make_int4(16, 16, 16, 16);
+                    _506.x = (_468.x+_507.x);
+                    _506.y = (_468.y+_507.y);
+                    _506.z = (_468.z+_507.z);
+                    _506.w = (_468.w+_507.w);
+                  _492.x = (bool(_493.x)?_468.x:_506.x);
+                  _492.y = (bool(_493.y)?_468.y:_506.y);
+                  _492.z = (bool(_493.z)?_468.z:_506.z);
+                  _492.w = (bool(_493.w)?_468.w:_506.w);
+                  int4 _508 = make_int4(9, 9, 9, 9);
+                  _467.x = (_492.x*_508.x);
+                  _467.y = (_492.y*_508.y);
+                  _467.z = (_492.z*_508.z);
+                  _467.w = (_492.w*_508.w);
+                _465.x = (_466.x+_467.x);
+                _465.y = (_466.y+_467.y);
+                _465.z = (_466.z+_467.z);
+                _465.w = (_466.w+_467.w);
+              int4 _509 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+              _464.x = (_465.x+_509.x);
+              _464.y = (_465.y+_509.y);
+              _464.z = (_465.z+_509.z);
+              _464.w = (_465.w+_509.w);
+            int4 _510;
+              int4 _511 = make_int4(((((int)threadIdx.x) + 224))+(1*0), ((((int)threadIdx.x) + 224))+(1*1), ((((int)threadIdx.x) + 224))+(1*2), ((((int)threadIdx.x) + 224))+(1*3));
+              int4 _512 = make_int4(3, 3, 3, 3);
+              _510.x = (_511.x%_512.x);
+              _510.y = (_511.y%_512.y);
+              _510.z = (_511.z%_512.z);
+              _510.w = (_511.w%_512.w);
+            int4 _513;
+            ushort4 _514;
+              ushort4 _515;
+                ushort4 _516;
+                  int4 _517 = make_int4(3, 3, 3, 3);
+                  int4 _518 = make_int4(0, 0, 0, 0);
+                  _516.x = (_517.x>=_518.x);
+                  _516.y = (_517.y>=_518.y);
+                  _516.z = (_517.z>=_518.z);
+                  _516.w = (_517.w>=_518.w);
+                ushort4 _519;
+                  int4 _520 = make_int4(0, 0, 0, 0);
+                  _519.x = (_510.x>=_520.x);
+                  _519.y = (_510.y>=_520.y);
+                  _519.z = (_510.z>=_520.z);
+                  _519.w = (_510.w>=_520.w);
+                _515.x = (_516.x&&_519.x);
+                _515.y = (_516.y&&_519.y);
+                _515.z = (_516.z&&_519.z);
+                _515.w = (_516.w&&_519.w);
+              ushort4 _521;
+                ushort4 _522;
+                  int4 _523 = make_int4(3, 3, 3, 3);
+                  int4 _524 = make_int4(0, 0, 0, 0);
+                  _522.x = (_523.x<_524.x);
+                  _522.y = (_523.y<_524.y);
+                  _522.z = (_523.z<_524.z);
+                  _522.w = (_523.w<_524.w);
+                ushort4 _525;
+                  int4 _526 = make_int4(0, 0, 0, 0);
+                  _525.x = (_510.x<=_526.x);
+                  _525.y = (_510.y<=_526.y);
+                  _525.z = (_510.z<=_526.z);
+                  _525.w = (_510.w<=_526.w);
+                _521.x = (_522.x&&_525.x);
+                _521.y = (_522.y&&_525.y);
+                _521.z = (_522.z&&_525.z);
+                _521.w = (_522.w&&_525.w);
+              _514.x = (_515.x||_521.x);
+              _514.y = (_515.y||_521.y);
+              _514.z = (_515.z||_521.z);
+              _514.w = (_515.w||_521.w);
+            int4 _527;
+              int4 _528 = make_int4(3, 3, 3, 3);
+              _527.x = (_510.x+_528.x);
+              _527.y = (_510.y+_528.y);
+              _527.z = (_510.z+_528.z);
+              _527.w = (_510.w+_528.w);
+            _513.x = (bool(_514.x)?_510.x:_527.x);
+            _513.y = (bool(_514.y)?_510.y:_527.y);
+            _513.z = (bool(_514.z)?_510.z:_527.z);
+            _513.w = (bool(_514.w)?_510.w:_527.w);
+            _463.x = (_464.x+_513.x);
+            _463.y = (_464.y+_513.y);
+            _463.z = (_464.z+_513.z);
+            _463.w = (_464.w+_513.w);
+          *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 896)) = make_float4(kernel[_463.x],kernel[_463.y],kernel[_463.z],kernel[_463.w]);
+          int4 _529;
+            int4 _530;
+              int4 _531;
+                int4 _532 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1024) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1024) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1024) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1024) / 48) * 4608)) + (rc_outer_o [...]
+                int4 _533;
+                  int4 _534;
+                    int4 _535;
+                      int4 _536 = make_int4((((((int)threadIdx.x) * 4) + 1024))+(1*0), (((((int)threadIdx.x) * 4) + 1024))+(1*1), (((((int)threadIdx.x) * 4) + 1024))+(1*2), (((((int)threadIdx.x) * 4) + 1024))+(1*3));
+                      int4 _537 = make_int4(3, 3, 3, 3);
+                      _535.x = (_536.x%_537.x);
+                      _535.y = (_536.y%_537.y);
+                      _535.z = (_536.z%_537.z);
+                      _535.w = (_536.w%_537.w);
+                    int4 _538;
+                      int4 _539 = make_int4((((((int)threadIdx.x) * 4) + 1024))+(1*0), (((((int)threadIdx.x) * 4) + 1024))+(1*1), (((((int)threadIdx.x) * 4) + 1024))+(1*2), (((((int)threadIdx.x) * 4) + 1024))+(1*3));
+                      int4 _540 = make_int4(3, 3, 3, 3);
+                      _538.x = (_539.x/_540.x);
+                      _538.y = (_539.y/_540.y);
+                      _538.z = (_539.z/_540.z);
+                      _538.w = (_539.w/_540.w);
+                    int4 _541;
+                    ushort4 _542;
+                      ushort4 _543;
+                        ushort4 _544;
+                          int4 _545 = make_int4(3, 3, 3, 3);
+                          int4 _546 = make_int4(0, 0, 0, 0);
+                          _544.x = (_545.x>=_546.x);
+                          _544.y = (_545.y>=_546.y);
+                          _544.z = (_545.z>=_546.z);
+                          _544.w = (_545.w>=_546.w);
+                        ushort4 _547;
+                          int4 _548 = make_int4(0, 0, 0, 0);
+                          _547.x = (_535.x>=_548.x);
+                          _547.y = (_535.y>=_548.y);
+                          _547.z = (_535.z>=_548.z);
+                          _547.w = (_535.w>=_548.w);
+                        _543.x = (_544.x&&_547.x);
+                        _543.y = (_544.y&&_547.y);
+                        _543.z = (_544.z&&_547.z);
+                        _543.w = (_544.w&&_547.w);
+                      ushort4 _549;
+                        ushort4 _550;
+                          int4 _551 = make_int4(3, 3, 3, 3);
+                          int4 _552 = make_int4(0, 0, 0, 0);
+                          _550.x = (_551.x<_552.x);
+                          _550.y = (_551.y<_552.y);
+                          _550.z = (_551.z<_552.z);
+                          _550.w = (_551.w<_552.w);
+                        ushort4 _553;
+                          int4 _554 = make_int4(0, 0, 0, 0);
+                          _553.x = (_535.x<=_554.x);
+                          _553.y = (_535.y<=_554.y);
+                          _553.z = (_535.z<=_554.z);
+                          _553.w = (_535.w<=_554.w);
+                        _549.x = (_550.x&&_553.x);
+                        _549.y = (_550.y&&_553.y);
+                        _549.z = (_550.z&&_553.z);
+                        _549.w = (_550.w&&_553.w);
+                      _542.x = (_543.x||_549.x);
+                      _542.y = (_543.y||_549.y);
+                      _542.z = (_543.z||_549.z);
+                      _542.w = (_543.w||_549.w);
+                    int4 _555;
+                      int4 _556 = make_int4(1, 1, 1, 1);
+                      _555.x = (_538.x-_556.x);
+                      _555.y = (_538.y-_556.y);
+                      _555.z = (_538.z-_556.z);
+                      _555.w = (_538.w-_556.w);
+                    _541.x = (bool(_542.x)?_538.x:_555.x);
+                    _541.y = (bool(_542.y)?_538.y:_555.y);
+                    _541.z = (bool(_542.z)?_538.z:_555.z);
+                    _541.w = (bool(_542.w)?_538.w:_555.w);
+                    int4 _557 = make_int4(16, 16, 16, 16);
+                    _534.x = (_541.x%_557.x);
+                    _534.y = (_541.y%_557.y);
+                    _534.z = (_541.z%_557.z);
+                    _534.w = (_541.w%_557.w);
+                  int4 _558;
+                  ushort4 _559;
+                    ushort4 _560;
+                      ushort4 _561;
+                        int4 _562 = make_int4(16, 16, 16, 16);
+                        int4 _563 = make_int4(0, 0, 0, 0);
+                        _561.x = (_562.x>=_563.x);
+                        _561.y = (_562.y>=_563.y);
+                        _561.z = (_562.z>=_563.z);
+                        _561.w = (_562.w>=_563.w);
+                      ushort4 _564;
+                        int4 _565 = make_int4(0, 0, 0, 0);
+                        _564.x = (_534.x>=_565.x);
+                        _564.y = (_534.y>=_565.y);
+                        _564.z = (_534.z>=_565.z);
+                        _564.w = (_534.w>=_565.w);
+                      _560.x = (_561.x&&_564.x);
+                      _560.y = (_561.y&&_564.y);
+                      _560.z = (_561.z&&_564.z);
+                      _560.w = (_561.w&&_564.w);
+                    ushort4 _566;
+                      ushort4 _567;
+                        int4 _568 = make_int4(16, 16, 16, 16);
+                        int4 _569 = make_int4(0, 0, 0, 0);
+                        _567.x = (_568.x<_569.x);
+                        _567.y = (_568.y<_569.y);
+                        _567.z = (_568.z<_569.z);
+                        _567.w = (_568.w<_569.w);
+                      ushort4 _570;
+                        int4 _571 = make_int4(0, 0, 0, 0);
+                        _570.x = (_534.x<=_571.x);
+                        _570.y = (_534.y<=_571.y);
+                        _570.z = (_534.z<=_571.z);
+                        _570.w = (_534.w<=_571.w);
+                      _566.x = (_567.x&&_570.x);
+                      _566.y = (_567.y&&_570.y);
+                      _566.z = (_567.z&&_570.z);
+                      _566.w = (_567.w&&_570.w);
+                    _559.x = (_560.x||_566.x);
+                    _559.y = (_560.y||_566.y);
+                    _559.z = (_560.z||_566.z);
+                    _559.w = (_560.w||_566.w);
+                  int4 _572;
+                    int4 _573 = make_int4(16, 16, 16, 16);
+                    _572.x = (_534.x+_573.x);
+                    _572.y = (_534.y+_573.y);
+                    _572.z = (_534.z+_573.z);
+                    _572.w = (_534.w+_573.w);
+                  _558.x = (bool(_559.x)?_534.x:_572.x);
+                  _558.y = (bool(_559.y)?_534.y:_572.y);
+                  _558.z = (bool(_559.z)?_534.z:_572.z);
+                  _558.w = (bool(_559.w)?_534.w:_572.w);
+                  int4 _574 = make_int4(9, 9, 9, 9);
+                  _533.x = (_558.x*_574.x);
+                  _533.y = (_558.y*_574.y);
+                  _533.z = (_558.z*_574.z);
+                  _533.w = (_558.w*_574.w);
+                _531.x = (_532.x+_533.x);
+                _531.y = (_532.y+_533.y);
+                _531.z = (_532.z+_533.z);
+                _531.w = (_532.w+_533.w);
+              int4 _575 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+              _530.x = (_531.x+_575.x);
+              _530.y = (_531.y+_575.y);
+              _530.z = (_531.z+_575.z);
+              _530.w = (_531.w+_575.w);
+            int4 _576;
+              int4 _577 = make_int4(((((int)threadIdx.x) + 256))+(1*0), ((((int)threadIdx.x) + 256))+(1*1), ((((int)threadIdx.x) + 256))+(1*2), ((((int)threadIdx.x) + 256))+(1*3));
+              int4 _578 = make_int4(3, 3, 3, 3);
+              _576.x = (_577.x%_578.x);
+              _576.y = (_577.y%_578.y);
+              _576.z = (_577.z%_578.z);
+              _576.w = (_577.w%_578.w);
+            int4 _579;
+            ushort4 _580;
+              ushort4 _581;
+                ushort4 _582;
+                  int4 _583 = make_int4(3, 3, 3, 3);
+                  int4 _584 = make_int4(0, 0, 0, 0);
+                  _582.x = (_583.x>=_584.x);
+                  _582.y = (_583.y>=_584.y);
+                  _582.z = (_583.z>=_584.z);
+                  _582.w = (_583.w>=_584.w);
+                ushort4 _585;
+                  int4 _586 = make_int4(0, 0, 0, 0);
+                  _585.x = (_576.x>=_586.x);
+                  _585.y = (_576.y>=_586.y);
+                  _585.z = (_576.z>=_586.z);
+                  _585.w = (_576.w>=_586.w);
+                _581.x = (_582.x&&_585.x);
+                _581.y = (_582.y&&_585.y);
+                _581.z = (_582.z&&_585.z);
+                _581.w = (_582.w&&_585.w);
+              ushort4 _587;
+                ushort4 _588;
+                  int4 _589 = make_int4(3, 3, 3, 3);
+                  int4 _590 = make_int4(0, 0, 0, 0);
+                  _588.x = (_589.x<_590.x);
+                  _588.y = (_589.y<_590.y);
+                  _588.z = (_589.z<_590.z);
+                  _588.w = (_589.w<_590.w);
+                ushort4 _591;
+                  int4 _592 = make_int4(0, 0, 0, 0);
+                  _591.x = (_576.x<=_592.x);
+                  _591.y = (_576.y<=_592.y);
+                  _591.z = (_576.z<=_592.z);
+                  _591.w = (_576.w<=_592.w);
+                _587.x = (_588.x&&_591.x);
+                _587.y = (_588.y&&_591.y);
+                _587.z = (_588.z&&_591.z);
+                _587.w = (_588.w&&_591.w);
+              _580.x = (_581.x||_587.x);
+              _580.y = (_581.y||_587.y);
+              _580.z = (_581.z||_587.z);
+              _580.w = (_581.w||_587.w);
+            int4 _593;
+              int4 _594 = make_int4(3, 3, 3, 3);
+              _593.x = (_576.x+_594.x);
+              _593.y = (_576.y+_594.y);
+              _593.z = (_576.z+_594.z);
+              _593.w = (_576.w+_594.w);
+            _579.x = (bool(_580.x)?_576.x:_593.x);
+            _579.y = (bool(_580.y)?_576.y:_593.y);
+            _579.z = (bool(_580.z)?_576.z:_593.z);
+            _579.w = (bool(_580.w)?_576.w:_593.w);
+            _529.x = (_530.x+_579.x);
+            _529.y = (_530.y+_579.y);
+            _529.z = (_530.z+_579.z);
+            _529.w = (_530.w+_579.w);
+          *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1024)) = make_float4(kernel[_529.x],kernel[_529.y],kernel[_529.z],kernel[_529.w]);
+          int4 _595;
+            int4 _596;
+              int4 _597;
+                int4 _598 = make_int4((((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 110592), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 110592), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 110592), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 110592));
+                int4 _599;
+                  int4 _600;
+                    int4 _601;
+                      int4 _602 = make_int4((((((int)threadIdx.x) * 4) + 1152))+(1*0), (((((int)threadIdx.x) * 4) + 1152))+(1*1), (((((int)threadIdx.x) * 4) + 1152))+(1*2), (((((int)threadIdx.x) * 4) + 1152))+(1*3));
+                      int4 _603 = make_int4(3, 3, 3, 3);
+                      _601.x = (_602.x%_603.x);
+                      _601.y = (_602.y%_603.y);
+                      _601.z = (_602.z%_603.z);
+                      _601.w = (_602.w%_603.w);
+                    int4 _604;
+                      int4 _605 = make_int4((((((int)threadIdx.x) * 4) + 1152))+(1*0), (((((int)threadIdx.x) * 4) + 1152))+(1*1), (((((int)threadIdx.x) * 4) + 1152))+(1*2), (((((int)threadIdx.x) * 4) + 1152))+(1*3));
+                      int4 _606 = make_int4(3, 3, 3, 3);
+                      _604.x = (_605.x/_606.x);
+                      _604.y = (_605.y/_606.y);
+                      _604.z = (_605.z/_606.z);
+                      _604.w = (_605.w/_606.w);
+                    int4 _607;
+                    ushort4 _608;
+                      ushort4 _609;
+                        ushort4 _610;
+                          int4 _611 = make_int4(3, 3, 3, 3);
+                          int4 _612 = make_int4(0, 0, 0, 0);
+                          _610.x = (_611.x>=_612.x);
+                          _610.y = (_611.y>=_612.y);
+                          _610.z = (_611.z>=_612.z);
+                          _610.w = (_611.w>=_612.w);
+                        ushort4 _613;
+                          int4 _614 = make_int4(0, 0, 0, 0);
+                          _613.x = (_601.x>=_614.x);
+                          _613.y = (_601.y>=_614.y);
+                          _613.z = (_601.z>=_614.z);
+                          _613.w = (_601.w>=_614.w);
+                        _609.x = (_610.x&&_613.x);
+                        _609.y = (_610.y&&_613.y);
+                        _609.z = (_610.z&&_613.z);
+                        _609.w = (_610.w&&_613.w);
+                      ushort4 _615;
+                        ushort4 _616;
+                          int4 _617 = make_int4(3, 3, 3, 3);
+                          int4 _618 = make_int4(0, 0, 0, 0);
+                          _616.x = (_617.x<_618.x);
+                          _616.y = (_617.y<_618.y);
+                          _616.z = (_617.z<_618.z);
+                          _616.w = (_617.w<_618.w);
+                        ushort4 _619;
+                          int4 _620 = make_int4(0, 0, 0, 0);
+                          _619.x = (_601.x<=_620.x);
+                          _619.y = (_601.y<=_620.y);
+                          _619.z = (_601.z<=_620.z);
+                          _619.w = (_601.w<=_620.w);
+                        _615.x = (_616.x&&_619.x);
+                        _615.y = (_616.y&&_619.y);
+                        _615.z = (_616.z&&_619.z);
+                        _615.w = (_616.w&&_619.w);
+                      _608.x = (_609.x||_615.x);
+                      _608.y = (_609.y||_615.y);
+                      _608.z = (_609.z||_615.z);
+                      _608.w = (_609.w||_615.w);
+                    int4 _621;
+                      int4 _622 = make_int4(1, 1, 1, 1);
+                      _621.x = (_604.x-_622.x);
+                      _621.y = (_604.y-_622.y);
+                      _621.z = (_604.z-_622.z);
+                      _621.w = (_604.w-_622.w);
+                    _607.x = (bool(_608.x)?_604.x:_621.x);
+                    _607.y = (bool(_608.y)?_604.y:_621.y);
+                    _607.z = (bool(_608.z)?_604.z:_621.z);
+                    _607.w = (bool(_608.w)?_604.w:_621.w);
+                    int4 _623 = make_int4(16, 16, 16, 16);
+                    _600.x = (_607.x%_623.x);
+                    _600.y = (_607.y%_623.y);
+                    _600.z = (_607.z%_623.z);
+                    _600.w = (_607.w%_623.w);
+                  int4 _624;
+                  ushort4 _625;
+                    ushort4 _626;
+                      ushort4 _627;
+                        int4 _628 = make_int4(16, 16, 16, 16);
+                        int4 _629 = make_int4(0, 0, 0, 0);
+                        _627.x = (_628.x>=_629.x);
+                        _627.y = (_628.y>=_629.y);
+                        _627.z = (_628.z>=_629.z);
+                        _627.w = (_628.w>=_629.w);
+                      ushort4 _630;
+                        int4 _631 = make_int4(0, 0, 0, 0);
+                        _630.x = (_600.x>=_631.x);
+                        _630.y = (_600.y>=_631.y);
+                        _630.z = (_600.z>=_631.z);
+                        _630.w = (_600.w>=_631.w);
+                      _626.x = (_627.x&&_630.x);
+                      _626.y = (_627.y&&_630.y);
+                      _626.z = (_627.z&&_630.z);
+                      _626.w = (_627.w&&_630.w);
+                    ushort4 _632;
+                      ushort4 _633;
+                        int4 _634 = make_int4(16, 16, 16, 16);
+                        int4 _635 = make_int4(0, 0, 0, 0);
+                        _633.x = (_634.x<_635.x);
+                        _633.y = (_634.y<_635.y);
+                        _633.z = (_634.z<_635.z);
+                        _633.w = (_634.w<_635.w);
+                      ushort4 _636;
+                        int4 _637 = make_int4(0, 0, 0, 0);
+                        _636.x = (_600.x<=_637.x);
+                        _636.y = (_600.y<=_637.y);
+                        _636.z = (_600.z<=_637.z);
+                        _636.w = (_600.w<=_637.w);
+                      _632.x = (_633.x&&_636.x);
+                      _632.y = (_633.y&&_636.y);
+                      _632.z = (_633.z&&_636.z);
+                      _632.w = (_633.w&&_636.w);
+                    _625.x = (_626.x||_632.x);
+                    _625.y = (_626.y||_632.y);
+                    _625.z = (_626.z||_632.z);
+                    _625.w = (_626.w||_632.w);
+                  int4 _638;
+                    int4 _639 = make_int4(16, 16, 16, 16);
+                    _638.x = (_600.x+_639.x);
+                    _638.y = (_600.y+_639.y);
+                    _638.z = (_600.z+_639.z);
+                    _638.w = (_600.w+_639.w);
+                  _624.x = (bool(_625.x)?_600.x:_638.x);
+                  _624.y = (bool(_625.y)?_600.y:_638.y);
+                  _624.z = (bool(_625.z)?_600.z:_638.z);
+                  _624.w = (bool(_625.w)?_600.w:_638.w);
+                  int4 _640 = make_int4(9, 9, 9, 9);
+                  _599.x = (_624.x*_640.x);
+                  _599.y = (_624.y*_640.y);
+                  _599.z = (_624.z*_640.z);
+                  _599.w = (_624.w*_640.w);
+                _597.x = (_598.x+_599.x);
+                _597.y = (_598.y+_599.y);
+                _597.z = (_598.z+_599.z);
+                _597.w = (_598.w+_599.w);
+              int4 _641 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+              _596.x = (_597.x+_641.x);
+              _596.y = (_597.y+_641.y);
+              _596.z = (_597.z+_641.z);
+              _596.w = (_597.w+_641.w);
+            int4 _642;
+              int4 _643 = make_int4(((((int)threadIdx.x) + 288))+(1*0), ((((int)threadIdx.x) + 288))+(1*1), ((((int)threadIdx.x) + 288))+(1*2), ((((int)threadIdx.x) + 288))+(1*3));
+              int4 _644 = make_int4(3, 3, 3, 3);
+              _642.x = (_643.x%_644.x);
+              _642.y = (_643.y%_644.y);
+              _642.z = (_643.z%_644.z);
+              _642.w = (_643.w%_644.w);
+            int4 _645;
+            ushort4 _646;
+              ushort4 _647;
+                ushort4 _648;
+                  int4 _649 = make_int4(3, 3, 3, 3);
+                  int4 _650 = make_int4(0, 0, 0, 0);
+                  _648.x = (_649.x>=_650.x);
+                  _648.y = (_649.y>=_650.y);
+                  _648.z = (_649.z>=_650.z);
+                  _648.w = (_649.w>=_650.w);
+                ushort4 _651;
+                  int4 _652 = make_int4(0, 0, 0, 0);
+                  _651.x = (_642.x>=_652.x);
+                  _651.y = (_642.y>=_652.y);
+                  _651.z = (_642.z>=_652.z);
+                  _651.w = (_642.w>=_652.w);
+                _647.x = (_648.x&&_651.x);
+                _647.y = (_648.y&&_651.y);
+                _647.z = (_648.z&&_651.z);
+                _647.w = (_648.w&&_651.w);
+              ushort4 _653;
+                ushort4 _654;
+                  int4 _655 = make_int4(3, 3, 3, 3);
+                  int4 _656 = make_int4(0, 0, 0, 0);
+                  _654.x = (_655.x<_656.x);
+                  _654.y = (_655.y<_656.y);
+                  _654.z = (_655.z<_656.z);
+                  _654.w = (_655.w<_656.w);
+                ushort4 _657;
+                  int4 _658 = make_int4(0, 0, 0, 0);
+                  _657.x = (_642.x<=_658.x);
+                  _657.y = (_642.y<=_658.y);
+                  _657.z = (_642.z<=_658.z);
+                  _657.w = (_642.w<=_658.w);
+                _653.x = (_654.x&&_657.x);
+                _653.y = (_654.y&&_657.y);
+                _653.z = (_654.z&&_657.z);
+                _653.w = (_654.w&&_657.w);
+              _646.x = (_647.x||_653.x);
+              _646.y = (_647.y||_653.y);
+              _646.z = (_647.z||_653.z);
+              _646.w = (_647.w||_653.w);
+            int4 _659;
+              int4 _660 = make_int4(3, 3, 3, 3);
+              _659.x = (_642.x+_660.x);
+              _659.y = (_642.y+_660.y);
+              _659.z = (_642.z+_660.z);
+              _659.w = (_642.w+_660.w);
+            _645.x = (bool(_646.x)?_642.x:_659.x);
+            _645.y = (bool(_646.y)?_642.y:_659.y);
+            _645.z = (bool(_646.z)?_642.z:_659.z);
+            _645.w = (bool(_646.w)?_642.w:_659.w);
+            _595.x = (_596.x+_645.x);
+            _595.y = (_596.y+_645.y);
+            _595.z = (_596.z+_645.z);
+            _595.w = (_596.w+_645.w);
+          *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1152)) = make_float4(kernel[_595.x],kernel[_595.y],kernel[_595.z],kernel[_595.w]);
+          int4 _661;
+            int4 _662;
+              int4 _663;
+                int4 _664 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1280) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1280) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1280) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1280) / 48) * 4608)) + (rc_outer_o [...]
+                int4 _665;
+                  int4 _666;
+                    int4 _667;
+                      int4 _668 = make_int4((((((int)threadIdx.x) * 4) + 1280))+(1*0), (((((int)threadIdx.x) * 4) + 1280))+(1*1), (((((int)threadIdx.x) * 4) + 1280))+(1*2), (((((int)threadIdx.x) * 4) + 1280))+(1*3));
+                      int4 _669 = make_int4(3, 3, 3, 3);
+                      _667.x = (_668.x%_669.x);
+                      _667.y = (_668.y%_669.y);
+                      _667.z = (_668.z%_669.z);
+                      _667.w = (_668.w%_669.w);
+                    int4 _670;
+                      int4 _671 = make_int4((((((int)threadIdx.x) * 4) + 1280))+(1*0), (((((int)threadIdx.x) * 4) + 1280))+(1*1), (((((int)threadIdx.x) * 4) + 1280))+(1*2), (((((int)threadIdx.x) * 4) + 1280))+(1*3));
+                      int4 _672 = make_int4(3, 3, 3, 3);
+                      _670.x = (_671.x/_672.x);
+                      _670.y = (_671.y/_672.y);
+                      _670.z = (_671.z/_672.z);
+                      _670.w = (_671.w/_672.w);
+                    int4 _673;
+                    ushort4 _674;
+                      ushort4 _675;
+                        ushort4 _676;
+                          int4 _677 = make_int4(3, 3, 3, 3);
+                          int4 _678 = make_int4(0, 0, 0, 0);
+                          _676.x = (_677.x>=_678.x);
+                          _676.y = (_677.y>=_678.y);
+                          _676.z = (_677.z>=_678.z);
+                          _676.w = (_677.w>=_678.w);
+                        ushort4 _679;
+                          int4 _680 = make_int4(0, 0, 0, 0);
+                          _679.x = (_667.x>=_680.x);
+                          _679.y = (_667.y>=_680.y);
+                          _679.z = (_667.z>=_680.z);
+                          _679.w = (_667.w>=_680.w);
+                        _675.x = (_676.x&&_679.x);
+                        _675.y = (_676.y&&_679.y);
+                        _675.z = (_676.z&&_679.z);
+                        _675.w = (_676.w&&_679.w);
+                      ushort4 _681;
+                        ushort4 _682;
+                          int4 _683 = make_int4(3, 3, 3, 3);
+                          int4 _684 = make_int4(0, 0, 0, 0);
+                          _682.x = (_683.x<_684.x);
+                          _682.y = (_683.y<_684.y);
+                          _682.z = (_683.z<_684.z);
+                          _682.w = (_683.w<_684.w);
+                        ushort4 _685;
+                          int4 _686 = make_int4(0, 0, 0, 0);
+                          _685.x = (_667.x<=_686.x);
+                          _685.y = (_667.y<=_686.y);
+                          _685.z = (_667.z<=_686.z);
+                          _685.w = (_667.w<=_686.w);
+                        _681.x = (_682.x&&_685.x);
+                        _681.y = (_682.y&&_685.y);
+                        _681.z = (_682.z&&_685.z);
+                        _681.w = (_682.w&&_685.w);
+                      _674.x = (_675.x||_681.x);
+                      _674.y = (_675.y||_681.y);
+                      _674.z = (_675.z||_681.z);
+                      _674.w = (_675.w||_681.w);
+                    int4 _687;
+                      int4 _688 = make_int4(1, 1, 1, 1);
+                      _687.x = (_670.x-_688.x);
+                      _687.y = (_670.y-_688.y);
+                      _687.z = (_670.z-_688.z);
+                      _687.w = (_670.w-_688.w);
+                    _673.x = (bool(_674.x)?_670.x:_687.x);
+                    _673.y = (bool(_674.y)?_670.y:_687.y);
+                    _673.z = (bool(_674.z)?_670.z:_687.z);
+                    _673.w = (bool(_674.w)?_670.w:_687.w);
+                    int4 _689 = make_int4(16, 16, 16, 16);
+                    _666.x = (_673.x%_689.x);
+                    _666.y = (_673.y%_689.y);
+                    _666.z = (_673.z%_689.z);
+                    _666.w = (_673.w%_689.w);
+                  int4 _690;
+                  ushort4 _691;
+                    ushort4 _692;
+                      ushort4 _693;
+                        int4 _694 = make_int4(16, 16, 16, 16);
+                        int4 _695 = make_int4(0, 0, 0, 0);
+                        _693.x = (_694.x>=_695.x);
+                        _693.y = (_694.y>=_695.y);
+                        _693.z = (_694.z>=_695.z);
+                        _693.w = (_694.w>=_695.w);
+                      ushort4 _696;
+                        int4 _697 = make_int4(0, 0, 0, 0);
+                        _696.x = (_666.x>=_697.x);
+                        _696.y = (_666.y>=_697.y);
+                        _696.z = (_666.z>=_697.z);
+                        _696.w = (_666.w>=_697.w);
+                      _692.x = (_693.x&&_696.x);
+                      _692.y = (_693.y&&_696.y);
+                      _692.z = (_693.z&&_696.z);
+                      _692.w = (_693.w&&_696.w);
+                    ushort4 _698;
+                      ushort4 _699;
+                        int4 _700 = make_int4(16, 16, 16, 16);
+                        int4 _701 = make_int4(0, 0, 0, 0);
+                        _699.x = (_700.x<_701.x);
+                        _699.y = (_700.y<_701.y);
+                        _699.z = (_700.z<_701.z);
+                        _699.w = (_700.w<_701.w);
+                      ushort4 _702;
+                        int4 _703 = make_int4(0, 0, 0, 0);
+                        _702.x = (_666.x<=_703.x);
+                        _702.y = (_666.y<=_703.y);
+                        _702.z = (_666.z<=_703.z);
+                        _702.w = (_666.w<=_703.w);
+                      _698.x = (_699.x&&_702.x);
+                      _698.y = (_699.y&&_702.y);
+                      _698.z = (_699.z&&_702.z);
+                      _698.w = (_699.w&&_702.w);
+                    _691.x = (_692.x||_698.x);
+                    _691.y = (_692.y||_698.y);
+                    _691.z = (_692.z||_698.z);
+                    _691.w = (_692.w||_698.w);
+                  int4 _704;
+                    int4 _705 = make_int4(16, 16, 16, 16);
+                    _704.x = (_666.x+_705.x);
+                    _704.y = (_666.y+_705.y);
+                    _704.z = (_666.z+_705.z);
+                    _704.w = (_666.w+_705.w);
+                  _690.x = (bool(_691.x)?_666.x:_704.x);
+                  _690.y = (bool(_691.y)?_666.y:_704.y);
+                  _690.z = (bool(_691.z)?_666.z:_704.z);
+                  _690.w = (bool(_691.w)?_666.w:_704.w);
+                  int4 _706 = make_int4(9, 9, 9, 9);
+                  _665.x = (_690.x*_706.x);
+                  _665.y = (_690.y*_706.y);
+                  _665.z = (_690.z*_706.z);
+                  _665.w = (_690.w*_706.w);
+                _663.x = (_664.x+_665.x);
+                _663.y = (_664.y+_665.y);
+                _663.z = (_664.z+_665.z);
+                _663.w = (_664.w+_665.w);
+              int4 _707 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+              _662.x = (_663.x+_707.x);
+              _662.y = (_663.y+_707.y);
+              _662.z = (_663.z+_707.z);
+              _662.w = (_663.w+_707.w);
+            int4 _708;
+              int4 _709 = make_int4(((((int)threadIdx.x) + 320))+(1*0), ((((int)threadIdx.x) + 320))+(1*1), ((((int)threadIdx.x) + 320))+(1*2), ((((int)threadIdx.x) + 320))+(1*3));
+              int4 _710 = make_int4(3, 3, 3, 3);
+              _708.x = (_709.x%_710.x);
+              _708.y = (_709.y%_710.y);
+              _708.z = (_709.z%_710.z);
+              _708.w = (_709.w%_710.w);
+            int4 _711;
+            ushort4 _712;
+              ushort4 _713;
+                ushort4 _714;
+                  int4 _715 = make_int4(3, 3, 3, 3);
+                  int4 _716 = make_int4(0, 0, 0, 0);
+                  _714.x = (_715.x>=_716.x);
+                  _714.y = (_715.y>=_716.y);
+                  _714.z = (_715.z>=_716.z);
+                  _714.w = (_715.w>=_716.w);
+                ushort4 _717;
+                  int4 _718 = make_int4(0, 0, 0, 0);
+                  _717.x = (_708.x>=_718.x);
+                  _717.y = (_708.y>=_718.y);
+                  _717.z = (_708.z>=_718.z);
+                  _717.w = (_708.w>=_718.w);
+                _713.x = (_714.x&&_717.x);
+                _713.y = (_714.y&&_717.y);
+                _713.z = (_714.z&&_717.z);
+                _713.w = (_714.w&&_717.w);
+              ushort4 _719;
+                ushort4 _720;
+                  int4 _721 = make_int4(3, 3, 3, 3);
+                  int4 _722 = make_int4(0, 0, 0, 0);
+                  _720.x = (_721.x<_722.x);
+                  _720.y = (_721.y<_722.y);
+                  _720.z = (_721.z<_722.z);
+                  _720.w = (_721.w<_722.w);
+                ushort4 _723;
+                  int4 _724 = make_int4(0, 0, 0, 0);
+                  _723.x = (_708.x<=_724.x);
+                  _723.y = (_708.y<=_724.y);
+                  _723.z = (_708.z<=_724.z);
+                  _723.w = (_708.w<=_724.w);
+                _719.x = (_720.x&&_723.x);
+                _719.y = (_720.y&&_723.y);
+                _719.z = (_720.z&&_723.z);
+                _719.w = (_720.w&&_723.w);
+              _712.x = (_713.x||_719.x);
+              _712.y = (_713.y||_719.y);
+              _712.z = (_713.z||_719.z);
+              _712.w = (_713.w||_719.w);
+            int4 _725;
+              int4 _726 = make_int4(3, 3, 3, 3);
+              _725.x = (_708.x+_726.x);
+              _725.y = (_708.y+_726.y);
+              _725.z = (_708.z+_726.z);
+              _725.w = (_708.w+_726.w);
+            _711.x = (bool(_712.x)?_708.x:_725.x);
+            _711.y = (bool(_712.y)?_708.y:_725.y);
+            _711.z = (bool(_712.z)?_708.z:_725.z);
+            _711.w = (bool(_712.w)?_708.w:_725.w);
+            _661.x = (_662.x+_711.x);
+            _661.y = (_662.y+_711.y);
+            _661.z = (_662.z+_711.z);
+            _661.w = (_662.w+_711.w);
+          *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1280)) = make_float4(kernel[_661.x],kernel[_661.y],kernel[_661.z],kernel[_661.w]);
+          int4 _727;
+            int4 _728;
+              int4 _729;
+                int4 _730 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1408) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1408) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1408) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1408) / 48) * 4608)) + (rc_outer_o [...]
+                int4 _731;
+                  int4 _732;
+                    int4 _733;
+                      int4 _734 = make_int4((((((int)threadIdx.x) * 4) + 1408))+(1*0), (((((int)threadIdx.x) * 4) + 1408))+(1*1), (((((int)threadIdx.x) * 4) + 1408))+(1*2), (((((int)threadIdx.x) * 4) + 1408))+(1*3));
+                      int4 _735 = make_int4(3, 3, 3, 3);
+                      _733.x = (_734.x%_735.x);
+                      _733.y = (_734.y%_735.y);
+                      _733.z = (_734.z%_735.z);
+                      _733.w = (_734.w%_735.w);
+                    int4 _736;
+                      int4 _737 = make_int4((((((int)threadIdx.x) * 4) + 1408))+(1*0), (((((int)threadIdx.x) * 4) + 1408))+(1*1), (((((int)threadIdx.x) * 4) + 1408))+(1*2), (((((int)threadIdx.x) * 4) + 1408))+(1*3));
+                      int4 _738 = make_int4(3, 3, 3, 3);
+                      _736.x = (_737.x/_738.x);
+                      _736.y = (_737.y/_738.y);
+                      _736.z = (_737.z/_738.z);
+                      _736.w = (_737.w/_738.w);
+                    int4 _739;
+                    ushort4 _740;
+                      ushort4 _741;
+                        ushort4 _742;
+                          int4 _743 = make_int4(3, 3, 3, 3);
+                          int4 _744 = make_int4(0, 0, 0, 0);
+                          _742.x = (_743.x>=_744.x);
+                          _742.y = (_743.y>=_744.y);
+                          _742.z = (_743.z>=_744.z);
+                          _742.w = (_743.w>=_744.w);
+                        ushort4 _745;
+                          int4 _746 = make_int4(0, 0, 0, 0);
+                          _745.x = (_733.x>=_746.x);
+                          _745.y = (_733.y>=_746.y);
+                          _745.z = (_733.z>=_746.z);
+                          _745.w = (_733.w>=_746.w);
+                        _741.x = (_742.x&&_745.x);
+                        _741.y = (_742.y&&_745.y);
+                        _741.z = (_742.z&&_745.z);
+                        _741.w = (_742.w&&_745.w);
+                      ushort4 _747;
+                        ushort4 _748;
+                          int4 _749 = make_int4(3, 3, 3, 3);
+                          int4 _750 = make_int4(0, 0, 0, 0);
+                          _748.x = (_749.x<_750.x);
+                          _748.y = (_749.y<_750.y);
+                          _748.z = (_749.z<_750.z);
+                          _748.w = (_749.w<_750.w);
+                        ushort4 _751;
+                          int4 _752 = make_int4(0, 0, 0, 0);
+                          _751.x = (_733.x<=_752.x);
+                          _751.y = (_733.y<=_752.y);
+                          _751.z = (_733.z<=_752.z);
+                          _751.w = (_733.w<=_752.w);
+                        _747.x = (_748.x&&_751.x);
+                        _747.y = (_748.y&&_751.y);
+                        _747.z = (_748.z&&_751.z);
+                        _747.w = (_748.w&&_751.w);
+                      _740.x = (_741.x||_747.x);
+                      _740.y = (_741.y||_747.y);
+                      _740.z = (_741.z||_747.z);
+                      _740.w = (_741.w||_747.w);
+                    int4 _753;
+                      int4 _754 = make_int4(1, 1, 1, 1);
+                      _753.x = (_736.x-_754.x);
+                      _753.y = (_736.y-_754.y);
+                      _753.z = (_736.z-_754.z);
+                      _753.w = (_736.w-_754.w);
+                    _739.x = (bool(_740.x)?_736.x:_753.x);
+                    _739.y = (bool(_740.y)?_736.y:_753.y);
+                    _739.z = (bool(_740.z)?_736.z:_753.z);
+                    _739.w = (bool(_740.w)?_736.w:_753.w);
+                    int4 _755 = make_int4(16, 16, 16, 16);
+                    _732.x = (_739.x%_755.x);
+                    _732.y = (_739.y%_755.y);
+                    _732.z = (_739.z%_755.z);
+                    _732.w = (_739.w%_755.w);
+                  int4 _756;
+                  ushort4 _757;
+                    ushort4 _758;
+                      ushort4 _759;
+                        int4 _760 = make_int4(16, 16, 16, 16);
+                        int4 _761 = make_int4(0, 0, 0, 0);
+                        _759.x = (_760.x>=_761.x);
+                        _759.y = (_760.y>=_761.y);
+                        _759.z = (_760.z>=_761.z);
+                        _759.w = (_760.w>=_761.w);
+                      ushort4 _762;
+                        int4 _763 = make_int4(0, 0, 0, 0);
+                        _762.x = (_732.x>=_763.x);
+                        _762.y = (_732.y>=_763.y);
+                        _762.z = (_732.z>=_763.z);
+                        _762.w = (_732.w>=_763.w);
+                      _758.x = (_759.x&&_762.x);
+                      _758.y = (_759.y&&_762.y);
+                      _758.z = (_759.z&&_762.z);
+                      _758.w = (_759.w&&_762.w);
+                    ushort4 _764;
+                      ushort4 _765;
+                        int4 _766 = make_int4(16, 16, 16, 16);
+                        int4 _767 = make_int4(0, 0, 0, 0);
+                        _765.x = (_766.x<_767.x);
+                        _765.y = (_766.y<_767.y);
+                        _765.z = (_766.z<_767.z);
+                        _765.w = (_766.w<_767.w);
+                      ushort4 _768;
+                        int4 _769 = make_int4(0, 0, 0, 0);
+                        _768.x = (_732.x<=_769.x);
+                        _768.y = (_732.y<=_769.y);
+                        _768.z = (_732.z<=_769.z);
+                        _768.w = (_732.w<=_769.w);
+                      _764.x = (_765.x&&_768.x);
+                      _764.y = (_765.y&&_768.y);
+                      _764.z = (_765.z&&_768.z);
+                      _764.w = (_765.w&&_768.w);
+                    _757.x = (_758.x||_764.x);
+                    _757.y = (_758.y||_764.y);
+                    _757.z = (_758.z||_764.z);
+                    _757.w = (_758.w||_764.w);
+                  int4 _770;
+                    int4 _771 = make_int4(16, 16, 16, 16);
+                    _770.x = (_732.x+_771.x);
+                    _770.y = (_732.y+_771.y);
+                    _770.z = (_732.z+_771.z);
+                    _770.w = (_732.w+_771.w);
+                  _756.x = (bool(_757.x)?_732.x:_770.x);
+                  _756.y = (bool(_757.y)?_732.y:_770.y);
+                  _756.z = (bool(_757.z)?_732.z:_770.z);
+                  _756.w = (bool(_757.w)?_732.w:_770.w);
+                  int4 _772 = make_int4(9, 9, 9, 9);
+                  _731.x = (_756.x*_772.x);
+                  _731.y = (_756.y*_772.y);
+                  _731.z = (_756.z*_772.z);
+                  _731.w = (_756.w*_772.w);
+                _729.x = (_730.x+_731.x);
+                _729.y = (_730.y+_731.y);
+                _729.z = (_730.z+_731.z);
+                _729.w = (_730.w+_731.w);
+              int4 _773 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+              _728.x = (_729.x+_773.x);
+              _728.y = (_729.y+_773.y);
+              _728.z = (_729.z+_773.z);
+              _728.w = (_729.w+_773.w);
+            int4 _774;
+              int4 _775 = make_int4(((((int)threadIdx.x) + 352))+(1*0), ((((int)threadIdx.x) + 352))+(1*1), ((((int)threadIdx.x) + 352))+(1*2), ((((int)threadIdx.x) + 352))+(1*3));
+              int4 _776 = make_int4(3, 3, 3, 3);
+              _774.x = (_775.x%_776.x);
+              _774.y = (_775.y%_776.y);
+              _774.z = (_775.z%_776.z);
+              _774.w = (_775.w%_776.w);
+            int4 _777;
+            ushort4 _778;
+              ushort4 _779;
+                ushort4 _780;
+                  int4 _781 = make_int4(3, 3, 3, 3);
+                  int4 _782 = make_int4(0, 0, 0, 0);
+                  _780.x = (_781.x>=_782.x);
+                  _780.y = (_781.y>=_782.y);
+                  _780.z = (_781.z>=_782.z);
+                  _780.w = (_781.w>=_782.w);
+                ushort4 _783;
+                  int4 _784 = make_int4(0, 0, 0, 0);
+                  _783.x = (_774.x>=_784.x);
+                  _783.y = (_774.y>=_784.y);
+                  _783.z = (_774.z>=_784.z);
+                  _783.w = (_774.w>=_784.w);
+                _779.x = (_780.x&&_783.x);
+                _779.y = (_780.y&&_783.y);
+                _779.z = (_780.z&&_783.z);
+                _779.w = (_780.w&&_783.w);
+              ushort4 _785;
+                ushort4 _786;
+                  int4 _787 = make_int4(3, 3, 3, 3);
+                  int4 _788 = make_int4(0, 0, 0, 0);
+                  _786.x = (_787.x<_788.x);
+                  _786.y = (_787.y<_788.y);
+                  _786.z = (_787.z<_788.z);
+                  _786.w = (_787.w<_788.w);
+                ushort4 _789;
+                  int4 _790 = make_int4(0, 0, 0, 0);
+                  _789.x = (_774.x<=_790.x);
+                  _789.y = (_774.y<=_790.y);
+                  _789.z = (_774.z<=_790.z);
+                  _789.w = (_774.w<=_790.w);
+                _785.x = (_786.x&&_789.x);
+                _785.y = (_786.y&&_789.y);
+                _785.z = (_786.z&&_789.z);
+                _785.w = (_786.w&&_789.w);
+              _778.x = (_779.x||_785.x);
+              _778.y = (_779.y||_785.y);
+              _778.z = (_779.z||_785.z);
+              _778.w = (_779.w||_785.w);
+            int4 _791;
+              int4 _792 = make_int4(3, 3, 3, 3);
+              _791.x = (_774.x+_792.x);
+              _791.y = (_774.y+_792.y);
+              _791.z = (_774.z+_792.z);
+              _791.w = (_774.w+_792.w);
+            _777.x = (bool(_778.x)?_774.x:_791.x);
+            _777.y = (bool(_778.y)?_774.y:_791.y);
+            _777.z = (bool(_778.z)?_774.z:_791.z);
+            _777.w = (bool(_778.w)?_774.w:_791.w);
+            _727.x = (_728.x+_777.x);
+            _727.y = (_728.y+_777.y);
+            _727.z = (_728.z+_777.z);
+            _727.w = (_728.w+_777.w);
+          *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1408)) = make_float4(kernel[_727.x],kernel[_727.y],kernel[_727.z],kernel[_727.w]);
           __syncthreads();
-          for (int rc_outer_inner = 0; rc_outer_inner < 16; ++rc_outer_inner) {
-            conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7))] * kernel_shared[(((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6))]));
-            conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7))] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 384)]));
-            conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 1)]));
-            conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 385)]));
-            conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 2)]));
-            conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 386)]));
-            conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 3)]));
-            conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 387)]));
-            conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 4)]));
-            conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 388)]));
-            conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 5)]));
-            conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 389)]));
+          for (int rc_outer_inner = 0; rc_outer_inner < 8; ++rc_outer_inner) {
+            conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(rc_outer_inner * 18)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+            conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 1)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+            conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 2)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+            conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 3)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+            conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 4)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+            conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 5)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+            conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 6)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+            conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 1)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+            conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 2)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+            conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 3)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+            conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 4)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+            conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 5)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+            conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 6)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+            conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 7)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+            conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 2)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+            conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 3)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+            conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 4)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+            conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 5)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+            conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 6)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+            conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 7)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+            conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 8)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+            conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 9)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+            conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 10)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+            conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 11)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+            conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 12)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+            conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 13)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+            conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 14)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+            conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 15)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+            conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 10)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+            conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 11)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+            conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 12)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+            conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 13)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+            conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 14)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+            conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 15)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+            conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 16)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+            conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 11)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+            conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 12)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+            conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 13)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+            conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 14)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+            conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 15)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+            conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 16)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+            conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 17)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
           }
         }
       }
-      compute[((((int)blockIdx.x) * 392) + ((int)threadIdx.x))] = max((conv2d_nchw[0] + bias[((((int)blockIdx.x) * 8) + (((int)threadIdx.x) / 49))]), 0.000000e+00f);
-      compute[(((((int)blockIdx.x) * 392) + ((int)threadIdx.x)) + 196)] = max((conv2d_nchw[1] + bias[(((((int)blockIdx.x) * 8) + (((int)threadIdx.x) / 49)) + 4)]), 0.000000e+00f);
+      for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
+        compute[(((((((int)blockIdx.x) / 7) * 1568) + (((int)threadIdx.x) * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[i3_inner] + bias[(((((int)blockIdx.x) / 7) * 32) + ((int)threadIdx.x))]), 0.000000e+00f);
+      }
     }
 
 
@@ -589,7 +3151,7 @@ In the example below we resume the status and do more 5 trials.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 2 minutes  35.530 seconds)
+   **Total running time of the script:** ( 2 minutes  34.321 seconds)
 
 
 .. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
index 6e750f68b..f9381f273 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
@@ -646,7 +646,7 @@ so we can read the log file and load the best schedules.
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-       9.6964       9.7275       9.7331       9.6288       0.0479   
+       9.8793       9.8894       9.9270       9.8216       0.0436   
                
 
 
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
index f8ab78cf1..1df3ba618 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
@@ -665,7 +665,7 @@ so we can read the log file and load the best schedules.
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      752.6694     752.9576     753.1499     751.9008      0.5491   
+      757.9333     757.6261     759.1621     757.0116      0.9044   
                
 
 
@@ -693,7 +693,7 @@ Other Tips
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  19.801 seconds)
+   **Total running time of the script:** ( 1 minutes  20.685 seconds)
 
 
 .. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_x86.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
index f46917a5b..78a29bc42 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
@@ -396,12 +396,12 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
                  placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [65536], []),
                  compute: Buffer(compute_2: Pointer(float32), float32, [65536], [])}
       buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute}
-      preflattened_buffer_map = {placeholder_5: placeholder_15: Buffer(placeholder_10, float32, [128, 256], []), placeholder_7: placeholder_16: Buffer(placeholder_12, int32, [4916], []), placeholder_8: placeholder_17: Buffer(placeholder_13, int32, [33], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_9: placeholder_18: Buffer(placeholder_14, float32, [128, 512], []), placeholder_6: placeholder_19: Buffer(placeholder_11, float32, [4916, 16, 1], [])} {
+      preflattened_buffer_map = {placeholder_7: placeholder_15: Buffer(placeholder_12, int32, [4916], []), placeholder_6: placeholder_16: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_8: placeholder_17: Buffer(placeholder_13, int32, [33], []), placeholder_5: placeholder_18: Buffer(placeholder_10, float32, [128, 256], []), placeholder_9: placeholder_19: Buffer(placeholder_14, float32, [128, 512], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], [])} {
       for (i0.outer.i1.outer.fused: int32, 0, 32) "parallel" {
         allocate(compute_4: Pointer(global float32), float32, [2048]), storage_scope = global {
-          for (nb_j.inner: int32, 0, 2) {
-            for (i.inner.init: int32, 0, 64) {
-              let cse_var_1: int32 = ((i.inner.init*32) + (nb_j.inner*16))
+          for (i.outer.inner: int32, 0, 4) {
+            for (i.inner.init: int32, 0, 32) {
+              let cse_var_1: int32 = ((i.outer.inner*512) + (i.inner.init*16))
                {
                 compute_5: Buffer(compute_4, float32, [2048], [])[cse_var_1] = 0f32
                 compute_5[(cse_var_1 + 1)] = 0f32
@@ -421,51 +421,78 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
                 compute_5[(cse_var_1 + 15)] = 0f32
               }
             }
-            for (elem_idx: int32, 0, let cse_var_2: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner) in (placeholder_3[(cse_var_2 + 1)] - placeholder_3[cse_var_2])) {
-              for (i.inner: int32, 0, 64) {
-                let cse_var_21: int32 = (elem_idx*16)
-                let cse_var_20: int32 = ((i.inner*32) + (nb_j.inner*16))
-                let cse_var_19: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)
-                let cse_var_18: int32 = ((floordiv(i0.outer.i1.outer.fused, 16)*16384) + (i.inner*256))
-                let cse_var_17: int32 = (cse_var_20 + 9)
-                let cse_var_16: int32 = (cse_var_20 + 8)
-                let cse_var_15: int32 = (cse_var_20 + 7)
-                let cse_var_14: int32 = (cse_var_20 + 6)
-                let cse_var_13: int32 = (cse_var_20 + 5)
-                let cse_var_12: int32 = (cse_var_20 + 4)
-                let cse_var_11: int32 = (cse_var_20 + 3)
-                let cse_var_10: int32 = (cse_var_20 + 2)
-                let cse_var_9: int32 = (cse_var_20 + 15)
-                let cse_var_8: int32 = (cse_var_20 + 14)
-                let cse_var_7: int32 = (cse_var_20 + 13)
-                let cse_var_6: int32 = (cse_var_20 + 12)
-                let cse_var_5: int32 = (cse_var_20 + 11)
-                let cse_var_4: int32 = (cse_var_20 + 10)
-                let cse_var_3: int32 = (cse_var_20 + 1)
-                 {
-                  compute_5[cse_var_20] = (compute_5[cse_var_20] + (placeholder_1[((placeholder_3[cse_var_19]*16) + cse_var_21)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 1)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 2)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 3)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 4)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 5)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 6)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 7)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 8)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 9)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 10)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 11)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 12)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 13)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 14)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 15)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
+            for (elem_idx: int32, 0, (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])) {
+              for (i.inner: int32, 0, 32) {
+                if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+                  let cse_var_2: int32 = ((i.outer.inner*512) + (i.inner*16))
+                  compute_5[cse_var_2] = (compute_5[cse_var_2] + (placeholder_1[((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16))]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+                }
+                if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+                  let cse_var_3: int32 = (((i.outer.inner*512) + (i.inner*16)) + 1)
+                  compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 1)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+                }
+                if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+                  let cse_var_4: int32 = (((i.outer.inner*512) + (i.inner*16)) + 2)
+                  compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 2)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+                }
+                if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+                  let cse_var_5: int32 = (((i.outer.inner*512) + (i.inner*16)) + 3)
+                  compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 3)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+                }
+                if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+                  let cse_var_6: int32 = (((i.outer.inner*512) + (i.inner*16)) + 4)
+                  compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 4)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+                }
+                if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+                  let cse_var_7: int32 = (((i.outer.inner*512) + (i.inner*16)) + 5)
+                  compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 5)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+                }
+                if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+                  let cse_var_8: int32 = (((i.outer.inner*512) + (i.inner*16)) + 6)
+                  compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 6)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+                }
+                if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+                  let cse_var_9: int32 = (((i.outer.inner*512) + (i.inner*16)) + 7)
+                  compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 7)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+                }
+                if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+                  let cse_var_10: int32 = (((i.outer.inner*512) + (i.inner*16)) + 8)
+                  compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 8)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+                }
+                if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+                  let cse_var_11: int32 = (((i.outer.inner*512) + (i.inner*16)) + 9)
+                  compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 9)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+                }
+                if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+                  let cse_var_12: int32 = (((i.outer.inner*512) + (i.inner*16)) + 10)
+                  compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 10)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+                }
+                if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+                  let cse_var_13: int32 = (((i.outer.inner*512) + (i.inner*16)) + 11)
+                  compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 11)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+                }
+                if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+                  let cse_var_14: int32 = (((i.outer.inner*512) + (i.inner*16)) + 12)
+                  compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 12)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+                }
+                if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+                  let cse_var_15: int32 = (((i.outer.inner*512) + (i.inner*16)) + 13)
+                  compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 13)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+                }
+                if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+                  let cse_var_16: int32 = (((i.outer.inner*512) + (i.inner*16)) + 14)
+                  compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 14)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+                }
+                if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+                  let cse_var_17: int32 = (((i.outer.inner*512) + (i.inner*16)) + 15)
+                  compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 15)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
                 }
               }
             }
           }
-          for (i0.inner: int32, 0, 64) {
-            let cse_var_22: int32 = (((floordiv(i0.outer.i1.outer.fused, 16)*32768) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32))
-            compute[ramp(cse_var_22, 1, 32)] = max((compute_5[ramp((i0.inner*32), 1, 32)] + placeholder_4[ramp(cse_var_22, 1, 32)]), broadcast(0f32, 32))
+          for (i0.inner: int32, 0, 128) {
+            let cse_var_18: int32 = ((i0.inner*512) + (i0.outer.i1.outer.fused*16))
+            compute[ramp(cse_var_18, 1, 16)] = max((compute_5[ramp((i0.inner*16), 1, 16)] + placeholder_4[ramp(cse_var_18, 1, 16)]), broadcast(0f32, 16))
           }
         }
       }
@@ -521,7 +548,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 1.855 ms
+    Execution time of this operator: 1.730 ms
 
 
 
diff --git a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
index d6dc25f26..f50bf6718 100644
--- a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
 
 Computation times
 =================
-**00:43.277** total execution time for **how_to_tune_with_autotvm** files:
+**00:43.532** total execution time for **how_to_tune_with_autotvm** files:
 
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``)           | 00:43.248 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``)           | 00:43.499 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``)               | 00:00.015 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``)               | 00:00.019 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``)             | 00:00.005 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
index 5bd98d84b..8d7b4fb1f 100644
--- a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
@@ -879,8 +879,8 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 4, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2885496
-    No: 6   GFLOPS: 110.83/110.83   result: MeasureResult(costs=(0.002088788229166667,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.8140833377838135, timestamp=1655930458.0250723)       [('tile_f', [-1, 1, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3754080
-    No: 7   GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+    No: 6   GFLOPS: 110.46/110.46   result: MeasureResult(costs=(0.00209571425,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.82222318649292, timestamp=1655930909.2240996)        [('tile_f', [-1, 1, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3754080
+    No: 7   GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1003,7 +1003,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 16, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 256, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6225319
-    No: 8   GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+    No: 8   GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1126,7 +1126,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 1, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 64]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,943546
-    No: 9   GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+    No: 9   GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1249,7 +1249,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 16, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2868708
-    No: 10  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+    No: 10  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 142, in build
         res = future.result()
       File "/usr/lib/python3.7/concurrent/futures/_base.py", line 435, in result
@@ -1267,7 +1267,7 @@ for this template
     TimeoutError
 
             [('tile_f', [-1, 32, 2, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4691833
-    No: 11  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+    No: 11  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1390,7 +1390,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 2, 64]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1042124
-    No: 12  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+    No: 12  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1513,7 +1513,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 32, 1, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10013405
-    No: 13  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+    No: 13  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1636,7 +1636,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 8, 8, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6732082
-    No: 14  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+    No: 14  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1759,7 +1759,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 4, 32]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7536735
-    No: 15  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+    No: 15  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1882,7 +1882,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 1, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 128, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,482121
-    No: 16  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+    No: 16  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2005,7 +2005,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 1, 16]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 32, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2824525
-    No: 17  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+    No: 17  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2128,7 +2128,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 64, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4559286
-    No: 18  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+    No: 18  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2251,7 +2251,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 32, 16]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 512]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9677544
-    No: 19  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+    No: 19  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 738, in __call__
         yield remote, remote.load_module(os.path.split(build_result.filename)[1])
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 702, in run_through_rpc
@@ -2339,7 +2339,7 @@ for this template
       15: _PyEval_EvalFrameDefault
       14: 0x0000000000537c30
       13: _PyObject_FastCallKeywords
-      12: 0x00007ffab91b6fa2
+      12: 0x00007f97971cffa2
       11: _ctypes_callproc
       10: ffi_call
       9: ffi_call_unix64
@@ -2404,7 +2404,7 @@ for this template
       21: _PyFunction_FastCallKeywords
       20: _PyEval_EvalFrameDefault
       19: _PyFunction_FastCall      [('tile_f', [-1, 8, 2, 16]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6390073
-    No: 20  GFLOPS: 144.17/144.17   result: MeasureResult(costs=(0.00160570644,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4077048301696777, timestamp=1655930484.4690123)      [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
+    No: 20  GFLOPS: 144.77/144.77   result: MeasureResult(costs=(0.00159906648,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4326999187469482, timestamp=1655930935.759078)       [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
 
 
 
@@ -2461,7 +2461,7 @@ and measure running time.
     Best config:
     [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
     Finish loading 20 records
-    Time cost of this operator: 0.001957
+    Time cost of this operator: 0.002022
 
 
 
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
index 277337fc1..db3b92130 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
@@ -328,10 +328,10 @@ Timing the untuned program
     ########## Build without Autotuning ##########
     Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  
     ---------                                     ---                                           --------  -------  -----              ------  -------  
-    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  318.6     98.76    (1, 2, 10, 10, 3)  2       1        
-    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.076     0.953    (1, 6, 10, 10)     1       1        
-    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.924     0.286    (1, 1, 10, 10, 3)  1       1        
-    Total_time                                    -                                             322.6     -        -                  -       -        
+    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  315.3     98.749   (1, 2, 10, 10, 3)  2       1        
+    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.079     0.964    (1, 6, 10, 10)     1       1        
+    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.916     0.287    (1, 1, 10, 10, 3)  1       1        
+    Total_time                                    -                                             319.295   -        -                  -       -        
 
 
 
@@ -397,10 +397,10 @@ Timing the tuned program
     ########## Build with Autotuning ##########
     Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  
     ---------                                     ---                                           --------  -------  -----              ------  -------  
-    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  190.6     98.591   (1, 1, 10, 10, 6)  2       1        
-    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.902     0.984    (1, 6, 10, 10)     1       1        
-    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.823     0.426    (1, 3, 10, 10, 1)  1       1        
-    Total_time                                    -                                             193.325   -        -                  -       -        
+    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  135.1     98.066   (1, 6, 10, 10, 1)  2       1        
+    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.748     1.269    (1, 6, 10, 10)     1       1        
+    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.916     0.665    (1, 1, 10, 10, 3)  1       1        
+    Total_time                                    -                                             137.765   -        -                  -       -        
 
 
 
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
index 3d4e94f14..af47ad316 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
@@ -225,7 +225,7 @@ take about **2 minutes** to download the Stanford Cars, while COCO 2017 validati
  .. code-block:: none
 
 
-    '/tmp/tmp5adly3xq/images/random'
+    '/tmp/tmptfwkoswh/images/random'
 
 
 
@@ -325,8 +325,8 @@ objects to other stuff? We can display some examples from our datasets using ``m
 
  .. code-block:: none
 
-    /tmp/tmp5adly3xq/images/target contains 8144 images
-    /tmp/tmp5adly3xq/images/random contains 5000 images
+    /tmp/tmptfwkoswh/images/target contains 8144 images
+    /tmp/tmptfwkoswh/images/random contains 5000 images
 
 
 
@@ -501,13 +501,13 @@ the time on our validation set).
  .. code-block:: none
 
     Epoch 1/3
-    328/328 - 55s - loss: 0.2082 - accuracy: 0.9277 - val_loss: 0.1597 - val_accuracy: 0.9535
+    328/328 - 55s - loss: 0.2163 - accuracy: 0.9270 - val_loss: 0.1386 - val_accuracy: 0.9528
     Epoch 2/3
-    328/328 - 52s - loss: 0.0971 - accuracy: 0.9631 - val_loss: 0.1233 - val_accuracy: 0.9630
+    328/328 - 52s - loss: 0.0909 - accuracy: 0.9670 - val_loss: 0.1189 - val_accuracy: 0.9581
     Epoch 3/3
-    328/328 - 52s - loss: 0.0650 - accuracy: 0.9757 - val_loss: 0.1162 - val_accuracy: 0.9600
+    328/328 - 52s - loss: 0.0696 - accuracy: 0.9729 - val_loss: 0.1105 - val_accuracy: 0.9641
 
-    <keras.callbacks.History object at 0x7f7ed75d5e10>
+    <keras.callbacks.History object at 0x7f489d506f10>
 
 
 
@@ -864,7 +864,7 @@ Arduino tutorial for how to do that `on GitHub <https://github.com/guberti/tvm-a
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 8 minutes  7.957 seconds)
+   **Total running time of the script:** ( 10 minutes  12.692 seconds)
 
 
 .. _sphx_glr_download_how_to_work_with_microtvm_micro_train.py:
diff --git a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
index e8d504d06..2d363b24e 100644
--- a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
 
 Computation times
 =================
-**08:52.919** total execution time for **how_to_work_with_microtvm** files:
+**10:58.917** total execution time for **how_to_work_with_microtvm** files:
 
 +---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``)               | 08:07.957 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``)               | 10:12.692 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``)         | 00:41.550 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``)         | 00:42.778 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``)             | 00:03.412 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``)             | 00:03.447 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_work_with_microtvm_micro_ethosu.py` (``micro_ethosu.py``)             | 00:00.000 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
index f6f05fc14..debf159b7 100644
--- a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
 
 Computation times
 =================
-**00:11.361** total execution time for **how_to_work_with_relay** files:
+**00:11.381** total execution time for **how_to_work_with_relay** files:
 
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``) | 00:09.853 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``) | 00:09.879 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``)                   | 00:01.502 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``)                   | 00:01.496 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_work_with_relay_using_relay_viz.py` (``using_relay_viz.py``)       | 00:00.006 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
index 02586ca07..d14bdadc6 100644
--- a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
@@ -259,7 +259,7 @@ The following example customizes CUDA lowering rule for :code:`exp`.
  .. code-block:: none
 
 
-    <function my_cuda_math_rule at 0x7f7e3fb81440>
+    <function my_cuda_math_rule at 0x7f4810e0c950>
 
 
 
diff --git a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
index 98ae84c67..d73fb9320 100644
--- a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
@@ -5,22 +5,22 @@
 
 Computation times
 =================
-**00:04.002** total execution time for **how_to_work_with_schedules** files:
+**00:03.981** total execution time for **how_to_work_with_schedules** files:
 
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``)                 | 00:01.863 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``)                 | 00:01.854 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``)                     | 00:00.949 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``)                     | 00:00.930 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``)                     | 00:00.514 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``)                     | 00:00.522 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``)                               | 00:00.505 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``)                               | 00:00.503 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``)                     | 00:00.099 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``) | 00:00.033 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``)                               | 00:00.026 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``)                               | 00:00.027 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``)               | 00:00.013 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``)               | 00:00.012 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
index 44a9f0b57..733c8c49b 100644
--- a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
@@ -346,7 +346,7 @@ The importing needs to happen before the tensorized GEMV being executed.
                  C: Buffer(C_2: Pointer(float32), float32, [524288], [])}
       buffer_map = {A_1: A, B_1: B, C_1: C}
       preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [1024, 64], []), B_1: B_3: Buffer(B_2, float32, [512, 64], []), C_1: C_3: Buffer(C_2, float32, [1024, 512], [])} {
-      attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpa_jm11ei/input0.cc'\nsource_filename = \"/tmp/tmpa_jm11ei/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = alloca float*, align 8\n  %8 = alloca float*, align 8\n  %9 = alloca floa [...]
+      attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpbs1xqx8t/input0.cc'\nsource_filename = \"/tmp/tmpbs1xqx8t/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = alloca float*, align 8\n  %8 = alloca float*, align 8\n  %9 = alloca floa [...]
       for (i, 0, 1024) {
         for (j.outer: int32, 0, 32) {
           @tir.call_extern("gemv_update", @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_2, ((i*512) + (j.outer*16)), 16, 2, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_2, (i*64), 64, 1, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_2, (j.outer*1024), 1024, 1, dtype=handle), 16, 64, 64, dtype=int32)
diff --git a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
index d4b6d5080..18ff1a40c 100644
--- a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:20.522** total execution time for **topic_vta_tutorials_autotvm** files:
+**00:20.859** total execution time for **topic_vta_tutorials_autotvm** files:
 
 +---------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:20.515 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:20.852 | 0.0 MB |
 +---------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``)     | 00:00.006 | 0.0 MB |
 +---------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
index 05803d903..062b34a16 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
@@ -291,7 +291,7 @@ The compilation steps are:
       DeprecationWarning,
     /workspace/vta/tutorials/frontend/deploy_classification.py:213: DeprecationWarning: legacy graph executor behavior of producing json / lib / params will be removed in the next release. Please see documents of tvm.contrib.graph_executor.GraphModule for the  new recommended usage.
       relay_prog, target=tvm.target.Target(target, host=env.target_host), params=params
-    resnet18_v1 inference graph built in 21.99s!
+    resnet18_v1 inference graph built in 22.46s!
 
 
 
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
index 15d700130..9cd5de20d 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
@@ -335,7 +335,7 @@ The compilation steps are:
       "target_host parameter is going to be deprecated. "
     /workspace/python/tvm/relay/build_module.py:389: DeprecationWarning: Please use input parameter mod (tvm.IRModule) instead of deprecated parameter mod (tvm.relay.function.Function)
       DeprecationWarning,
-    yolov3-tiny inference graph built in 15.41s!
+    yolov3-tiny inference graph built in 15.72s!
 
 
 
diff --git a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
index 216ab2850..be0509189 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**01:29.548** total execution time for **topic_vta_tutorials_frontend** files:
+**01:29.341** total execution time for **topic_vta_tutorials_frontend** files:
 
 +------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``)           | 00:47.517 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``)           | 00:47.066 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:42.031 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:42.275 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
index fe4fe00cf..48b3815be 100644
--- a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:03.245** total execution time for **topic_vta_tutorials_optimize** files:
+**00:03.243** total execution time for **topic_vta_tutorials_optimize** files:
 
 +--------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``)         | 00:02.862 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.383 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.381 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
index 952e0ed5e..dac94a0f1 100644
--- a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:00.709** total execution time for **topic_vta_tutorials** files:
+**00:00.690** total execution time for **topic_vta_tutorials** files:
 
 +---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.383 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.369 | 0.0 MB |
 +---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.326 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.322 | 0.0 MB |
 +---------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
index 8e775daa3..f2ab90a40 100644
--- a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
@@ -204,13 +204,6 @@ trials, we can load the best schedule from the log file and apply it.
 
 
 
-.. rst-class:: sphx-glr-script-out
-
- .. code-block:: none
-
-    *E*E
-
-
 
 
 
@@ -334,7 +327,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 93.585 ms
+    Execution time of this operator: 93.717 ms
 
 
 
@@ -452,7 +445,7 @@ operations.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  32.785 seconds)
+   **Total running time of the script:** ( 1 minutes  13.161 seconds)
 
 
 .. _sphx_glr_download_tutorial_auto_scheduler_matmul_x86.py:
diff --git a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
index 3d5fd78d1..30a8f8096 100644
--- a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
@@ -449,16 +449,16 @@ reduce variance, we take 5 measurements and average them.
     waiting for device...
     device available
     Get devices for measurement successfully!
-    No: 1   GFLOPS: 9.49/9.49       result: MeasureResult(costs=(0.028283824800000003,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5838947296142578, timestamp=1655929265.7724662)       [('tile_y', [-1, 1]), ('tile_x', [-1, 256])],None,80
-    No: 2   GFLOPS: 2.47/9.49       result: MeasureResult(costs=(0.10865412719999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.8847618103027344, timestamp=1655929267.6746712)        [('tile_y', [-1, 4]), ('tile_x', [-1, 8])],None,32
-    No: 3   GFLOPS: 11.79/11.79     result: MeasureResult(costs=(0.022765395,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5643162727355957, timestamp=1655929268.7010412)        [('tile_y', [-1, 64]), ('tile_x', [-1, 32])],None,56
-    No: 4   GFLOPS: 1.48/11.79      result: MeasureResult(costs=(0.18186187920000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.0187907218933105, timestamp=1655929272.2771149)        [('tile_y', [-1, 1]), ('tile_x', [-1, 4])],None,20
-    No: 5   GFLOPS: 3.63/11.79      result: MeasureResult(costs=(0.07400362120000001,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.3209781646728516, timestamp=1655929273.7276344)        [('tile_y', [-1, 256]), ('tile_x', [-1, 16])],None,48
-    No: 6   GFLOPS: 1.77/11.79      result: MeasureResult(costs=(0.151268727,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.6089162826538086, timestamp=1655929276.384843) [('tile_y', [-1, 512]), ('tile_x', [-1, 4])],None,29
-    No: 7   GFLOPS: 0.87/11.79      result: MeasureResult(costs=(0.3093742644,), error_no=MeasureErrorNo.NO_ERROR, all_cost=5.074639320373535, timestamp=1655929281.990054) [('tile_y', [-1, 512]), ('tile_x', [-1, 2])],None,19
-    No: 8   GFLOPS: 10.63/11.79     result: MeasureResult(costs=(0.0252409828,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5472111701965332, timestamp=1655929282.556275)        [('tile_y', [-1, 4]), ('tile_x', [-1, 64])],None,62
-    No: 9   GFLOPS: 1.90/11.79      result: MeasureResult(costs=(0.140921808,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.353353977203369, timestamp=1655929285.0292299) [('tile_y', [-1, 2]), ('tile_x', [-1, 2])],None,11
-    No: 10  GFLOPS: 2.73/11.79      result: MeasureResult(costs=(0.098435465,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.6813583374023438, timestamp=1655929286.7710526)        [('tile_y', [-1, 4]), ('tile_x', [-1, 4])],None,22
+    No: 1   GFLOPS: 9.76/9.76       result: MeasureResult(costs=(0.0275145092,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5797626972198486, timestamp=1655929709.3550327)       [('tile_y', [-1, 1]), ('tile_x', [-1, 256])],None,80
+    No: 2   GFLOPS: 2.69/9.76       result: MeasureResult(costs=(0.0997637056,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.743319034576416, timestamp=1655929711.115279) [('tile_y', [-1, 4]), ('tile_x', [-1, 8])],None,32
+    No: 3   GFLOPS: 11.75/11.75     result: MeasureResult(costs=(0.0228372184,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5632002353668213, timestamp=1655929712.157808)        [('tile_y', [-1, 64]), ('tile_x', [-1, 32])],None,56
+    No: 4   GFLOPS: 1.85/11.75      result: MeasureResult(costs=(0.1453076866,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.452601671218872, timestamp=1655929715.1464045)        [('tile_y', [-1, 1]), ('tile_x', [-1, 4])],None,20
+    No: 5   GFLOPS: 3.60/11.75      result: MeasureResult(costs=(0.07447396660000001,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.3235578536987305, timestamp=1655929716.597595) [('tile_y', [-1, 256]), ('tile_x', [-1, 16])],None,48
+    No: 6   GFLOPS: 1.79/11.75      result: MeasureResult(costs=(0.14989059440000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.566101312637329, timestamp=1655929719.2095697) [('tile_y', [-1, 512]), ('tile_x', [-1, 4])],None,29
+    No: 7   GFLOPS: 0.86/11.75      result: MeasureResult(costs=(0.31043540859999996,), error_no=MeasureErrorNo.NO_ERROR, all_cost=5.087138652801514, timestamp=1655929724.8370092) [('tile_y', [-1, 512]), ('tile_x', [-1, 2])],None,19
+    No: 8   GFLOPS: 10.57/11.75     result: MeasureResult(costs=(0.0253910224,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5524759292602539, timestamp=1655929725.4101107)       [('tile_y', [-1, 4]), ('tile_x', [-1, 64])],None,62
+    No: 9   GFLOPS: 1.84/11.75      result: MeasureResult(costs=(0.145611129,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.4285926818847656, timestamp=1655929727.9581838)        [('tile_y', [-1, 2]), ('tile_x', [-1, 2])],None,11
+    No: 10  GFLOPS: 2.76/11.75      result: MeasureResult(costs=(0.097386445,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.669588565826416, timestamp=1655929729.6872263) [('tile_y', [-1, 4]), ('tile_x', [-1, 4])],None,22
 
 
 
diff --git a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
index 5dbc8538a..97d5e1222 100644
--- a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
@@ -314,7 +314,7 @@ standard deviation.
 
  .. code-block:: none
 
-    {'mean': 494.4699098900002, 'median': 494.50663965000103, 'std': 0.9450088039594712}
+    {'mean': 496.8772450499864, 'median': 496.6672595999853, 'std': 1.4294658444955355}
 
 
 
@@ -550,31 +550,31 @@ the tuning data to.
 
     /workspace/python/tvm/driver/build_module.py:264: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
       "target_host parameter is going to be deprecated. "
-
    [Task  1/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  1/25]  Current/Best:   17.53/  17.53 GFLOPS | Progress: (4/20) | 6.17 s
    [Task  1/25]  Current/Best:    6.16/  17.53 GFLOPS | Progress: (8/20) | 9.11 s
    [Task  1/25]  Current/Best:   11.49/  22.81 GFLOPS | Progress: (12/20) | 11.59 s
    [Task  1/25]  Current/Best:   16.83/  22.81 GFLOPS | Progress: (16/20) | 13.27 s
    [Task  1/25]  Current/Best:   11.54/  23.79 GFLOPS | Progress: (20/20) | 15.00 s Done.
-
    [Task  2/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  2/25]  Current/Best:   12.26/  12.98 GFLOPS | Progress: (4/20) | 3.83 s
    [Task  2/25]  Current/Best:   13.02/  18.40 GFLOPS | Progress: (8/20) | 5.16 s
    [Task  2/25]  Current/Best:   20.95/  20.95 GFLOPS | Progress: (12/20) | 6.46 s
    [Task  2/25]  Current/Best:   12.27/  20.95 GFLOPS | Progress: (16/20) | 7.76 s
    [Task  2/25]  Current/Best:   19.37/  20.95 GFLOPS | Progress: (20/20) | 9.35 s Done.
-
    [Task  3/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  3/25]  Current/Best:    1.63/  10.57 GFLOPS | Progress: (4/20) | 5.80 s
    [Task  3/25]  Current/Best:   15.61/  16.89 GFLOPS | Progress: (8/20) | 7.70 s
    [Task  3/25]  Current/Best:   14.93/  16.89 GFLOPS | Progress: (12/20) | 9.43 s
    [Task  3/25]  Current/Best:    7.20/  23.78 GFLOPS | Progress: (16/20) | 11.37 s
    [Task  3/25]  Current/Best:   12.32/  23.78 GFLOPS | Progress: (20/20) | 15.96 s Done.
-
    [Task  4/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  4/25]  Current/Best:    9.55/  20.52 GFLOPS | Progress: (4/20) | 2.33 s
    [Task  4/25]  Current/Best:    6.75/  20.52 GFLOPS | Progress: (8/20) | 7.03 s
    [Task  4/25]  Current/Best:   22.36/  22.36 GFLOPS | Progress: (12/20) | 12.00 s
    [Task  4/25]  Current/Best:   16.24/  22.36 GFLOPS | Progress: (16/20) | 14.38 s
    [Task  4/25]  Current/Best:   13.35/  22.36 GFLOPS | Progress: (20/20) | 16.34 s Done.
-
    [Task  5/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  5/25]  Current/Best:    9.61/  10.29 GFLOPS | Progress: (4/20) | 2.54 s
    [Task  5/25]  Current/Best:   11.65/  12.31 GFLOPS | Progress: (8/20) | 4.63 s
    [Task  5/25]  Current/Best:   11.81/  18.02 GFLOPS | Progress: (12/20) | 7.79 s
    [Task  5/25]  Current/Best:   11.54/  22.76 GFLOPS | Progress: (16/20) | 9.26 s
    [Task  5/25]  Current/Best:   12.02/  22.76 GFLOPS | Progress: (20/20) | 11.15 s Done.
-
    [Task  6/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  6/25]  Current/Best:   12.14/  20.74 GFLOPS | Progress: (4/20) | 4.05 s
    [Task  6/25]  Current/Best:   18.93/  20.74 GFLOPS | Progress: (8/20) | 5.82 s
    [Task  6/25]  Current/Best:   13.35/  20.74 GFLOPS | Progress: (12/20) | 7.76 s
    [Task  6/25]  Current/Best:   19.98/  20.74 GFLOPS | Progress: (16/20) | 10.04 s
    [Task  6/25]  Current/Best:    3.71/  20.74 GFLOPS | Progress: (20/20) | 12.57 s Done.
-
    [Task  7/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  7/25]  Current/Best:   11.12/  12.99 GFLOPS | Progress: (4/20) | 3.48 s
    [Task  7/25]  Current/Best:   20.22/  21.08 GFLOPS | Progress: (8/20) | 4.99 s
    [Task  7/25]  Current/Best:   16.19/  21.08 GFLOPS | Progress: (12/20) | 6.87 s
    [Task  7/25]  Current/Best:   12.23/  21.08 GFLOPS | Progress: (16/20) | 8.92 s
    [Task  7/25]  Current/Best:    6.38/  21.75 GFLOPS | Progress: (20/20) | 11.40 s Done.
-
    [Task  8/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  8/25]  Current/Best:    9.89/  14.11 GFLOPS | Progress: (4/20) | 2.84 s
    [Task  8/25]  Current/Best:    9.97/  14.11 GFLOPS | Progress: (8/20) | 7.83 s
    [Task  8/25]  Current/Best:   12.59/  14.11 GFLOPS | Progress: (12/20) | 14.31 s
    [Task  8/25]  Current/Best:   18.78/  18.78 GFLOPS | Progress: (16/20) | 16.43 s
    [Task  8/25]  Current/Best:   20.16/  20.16 GFLOPS | Progress: (20/20) | 23.47 s Done.
-
    [Task  9/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  9/25]  Current/Best:   14.34/  15.73 GFLOPS | Progress: (4/20) | 11.87 s
    [Task  9/25]  Current/Best:   23.44/  23.44 GFLOPS | Progress: (8/20) | 13.59 s
    [Task  9/25]  Current/Best:    8.28/  23.44 GFLOPS | Progress: (12/20) | 16.07 s
    [Task  9/25]  Current/Best:   17.85/  23.44 GFLOPS | Progress: (16/20) | 18.93 s
    [Task  9/25]  Current/Best:    9.02/  23.44 GFLOPS | Progress: (20/20) | 27.44 s
    [Task 10/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 10/25]  Current/Best:   18.31/  18.31 GFLOPS | Progress: (4/20) | 2.53 s
    [Task 10/25]  Current/Best:   15.62/  18.31 GFLOPS | Progress: (8/20) | 4.15 s
    [Task 10/25]  Current/Best:   12.42/  18.93 GFLOPS | Progress: (12/20) | 5.68 s
    [Task 10/25]  Current/Best:   19.08/  20.41 GFLOPS | Progress: (16/20) | 6.79 s
    [Task 10/25]  Current/Best:    8.86/  20.41 GFLOPS | Progress: (20/20
 ) | 8.31 s Done.
-
    [Task 11/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 11/25]  Current/Best:   12.33/  18.13 GFLOPS | Progress: (4/20) | 3.28 s
    [Task 11/25]  Current/Best:   16.81/  18.13 GFLOPS | Progress: (8/20) | 6.10 s
    [Task 11/25]  Current/Best:   18.28/  18.28 GFLOPS | Progress: (12/20) | 8.16 s
    [Task 11/25]  Current/Best:   13.54/  21.20 GFLOPS | Progress: (16/20) | 11.10 s
    [Task 11/25]  Current/Best:   19.53/  21.55 GFLOPS | Progress: (20/20) | 13.20 s Done.
-
    [Task 12/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 12/25]  Current/Best:    7.82/  17.96 GFLOPS | Progress: (4/20) | 5.61 s
    [Task 12/25]  Current/Best:    5.20/  17.96 GFLOPS | Progress: (8/20) | 9.51 s
    [Task 12/25]  Current/Best:   18.96/  18.96 GFLOPS | Progress: (12/20) | 11.51 s
    [Task 12/25]  Current/Best:   15.47/  18.96 GFLOPS | Progress: (16/20) | 14.40 s
    [Task 12/25]  Current/Best:   15.06/  18.96 GFLOPS | Progress: (20/20) | 16.31 s Done.
-
    [Task 13/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 13/25]  Current/Best:    8.27/  17.38 GFLOPS | Progress: (4/20) | 3.71 s
    [Task 13/25]  Current/Best:   16.08/  21.03 GFLOPS | Progress: (8/20) | 6.27 s
    [Task 13/25]  Current/Best:   19.63/  21.80 GFLOPS | Progress: (12/20) | 9.32 s
    [Task 13/25]  Current/Best:   12.31/  21.80 GFLOPS | Progress: (16/20) | 12.80 s
    [Task 13/25]  Current/Best:   18.52/  21.80 GFLOPS | Progress: (20/20) | 15.12 s Done.
-
    [Task 14/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 14/25]  Current/Best:   13.58/  13.58 GFLOPS | Progress: (4/20) | 3.36 s
    [Task 14/25]  Current/Best:    6.11/  13.58 GFLOPS | Progress: (8/20) | 5.59 s
    [Task 14/25]  Current/Best:   20.28/  20.28 GFLOPS | Progress: (12/20) | 8.23 s
    [Task 14/25]  Current/Best:   16.09/  20.28 GFLOPS | Progress: (16/20) | 9.90 s Done.
-
    [Task 14/25]  Current/Best:   16.91/  20.28 GFLOPS | Progress: (20/20) | 11.61 s
    [Task 15/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 15/25]  Current/Best:   16.17/  17.64 GFLOPS | Progress: (4/20) | 2.61 s
    [Task 15/25]  Current/Best:   14.33/  17.90 GFLOPS | Progress: (8/20) | 3.94 s
    [Task 15/25]  Current/Best:   10.39/  22.38 GFLOPS | Progress: (12/20) | 6.16 s
    [Task 15/25]  Current/Best:   20.35/  22.38 GFLOPS | Progress: (16/20) | 9.21 s
    [Task 15/25]  Current/Best:    9.66/  22.38 GFLOPS | Progress: (20/20) | 10.23 s
    [Task 16/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 16/25]  Current/Best:   20.34/  20.34 GFLOPS | Progress: (4/20) | 2.84 s
    [Task 16/25]  Current/Best:    3.04/  20.34 GFLOPS | Progress: (8/20) | 4.44 s
    [Task 16/25]  Current/Best:   19.42/  20.34 GFLOPS | Progress: (12/20) | 5.64 s
    [Task 16/25]  Current/Best:   17.71/  20.34 GFLOPS | Progress: (16/20) |
  7.01 s
    [Task 16/25]  Current/Best:   10.00/  22.27 GFLOPS | Progress: (20/20) | 9.15 s Done.
-
    [Task 17/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 17/25]  Current/Best:   12.96/  18.83 GFLOPS | Progress: (4/20) | 4.75 s
    [Task 17/25]  Current/Best:   14.47/  23.41 GFLOPS | Progress: (8/20) | 7.60 s
    [Task 17/25]  Current/Best:   16.99/  23.41 GFLOPS | Progress: (12/20) | 9.64 s
    [Task 17/25]  Current/Best:   16.49/  23.41 GFLOPS | Progress: (16/20) | 11.88 s
    [Task 17/25]  Current/Best:    9.92/  23.41 GFLOPS | Progress: (20/20) | 14.02 s Done.
-
    [Task 18/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 18/25]  Current/Best:   11.24/  17.82 GFLOPS | Progress: (4/20) | 3.72 s
    [Task 18/25]  Current/Best:   10.48/  19.75 GFLOPS | Progress: (8/20) | 7.37 s
    [Task 18/25]  Current/Best:   19.35/  19.75 GFLOPS | Progress: (12/20) | 9.30 s
    [Task 18/25]  Current/Best:   10.09/  19.75 GFLOPS | Progress: (16/20) | 13.10 s
    [Task 18/25]  Current/Best:   20.38/  20.38 GFLOPS | Progress: (20/20) | 14.62 s Done.
-
    [Task 19/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 19/25]  Current/Best:    7.14/  20.44 GFLOPS | Progress: (4/20) | 6.08 s
    [Task 19/25]  Current/Best:    2.61/  20.44 GFLOPS | Progress: (8/20) | 9.43 s
    [Task 19/25]  Current/Best:   20.41/  21.88 GFLOPS | Progress: (12/20) | 12.40 s
    [Task 19/25]  Current/Best:   14.26/  22.24 GFLOPS | Progress: (16/20) | 15.43 s
    [Task 19/25]  Current/Best:    2.70/  23.19 GFLOPS | Progress: (20/20) | 18.24 s Done.
-
    [Task 20/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 20/25]  Current/Best:    9.04/  15.20 GFLOPS | Progress: (4/20) | 3.29 s Done.
+
    [Task  1/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  1/25]  Current/Best:   17.49/  17.49 GFLOPS | Progress: (4/20) | 6.10 s
    [Task  1/25]  Current/Best:    6.16/  17.49 GFLOPS | Progress: (8/20) | 9.04 s
    [Task  1/25]  Current/Best:   11.53/  22.73 GFLOPS | Progress: (12/20) | 11.53 s
    [Task  1/25]  Current/Best:   16.82/  22.82 GFLOPS | Progress: (16/20) | 13.21 s
    [Task  1/25]  Current/Best:   11.58/  23.73 GFLOPS | Progress: (20/20) | 14.94 s Done.
+
    [Task  2/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  2/25]  Current/Best:   12.16/  12.80 GFLOPS | Progress: (4/20) | 3.78 s
    [Task  2/25]  Current/Best:   14.18/  17.42 GFLOPS | Progress: (8/20) | 5.11 s
    [Task  2/25]  Current/Best:   21.42/  21.42 GFLOPS | Progress: (12/20) | 6.42 s
    [Task  2/25]  Current/Best:   12.44/  21.42 GFLOPS | Progress: (16/20) | 7.67 s
    [Task  2/25]  Current/Best:   20.07/  21.42 GFLOPS | Progress: (20/20) | 9.29 s Done.
+
    [Task  3/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  3/25]  Current/Best:    1.63/  10.55 GFLOPS | Progress: (4/20) | 5.82 s
    [Task  3/25]  Current/Best:   15.58/  16.86 GFLOPS | Progress: (8/20) | 7.76 s
    [Task  3/25]  Current/Best:   14.87/  16.86 GFLOPS | Progress: (12/20) | 9.50 s
    [Task  3/25]  Current/Best:    7.20/  23.80 GFLOPS | Progress: (16/20) | 11.41 s
    [Task  3/25]  Current/Best:   12.13/  23.80 GFLOPS | Progress: (20/20) | 15.98 s Done.
+
    [Task  4/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  4/25]  Current/Best:    9.54/  20.11 GFLOPS | Progress: (4/20) | 2.34 s
    [Task  4/25]  Current/Best:    6.61/  20.11 GFLOPS | Progress: (8/20) | 7.05 s
    [Task  4/25]  Current/Best:   21.58/  21.58 GFLOPS | Progress: (12/20) | 11.94 s
    [Task  4/25]  Current/Best:   17.29/  21.58 GFLOPS | Progress: (16/20) | 14.32 s
    [Task  4/25]  Current/Best:   13.28/  21.58 GFLOPS | Progress: (20/20) | 16.38 s Done.
+
    [Task  5/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  5/25]  Current/Best:    9.62/  10.22 GFLOPS | Progress: (4/20) | 2.55 s
    [Task  5/25]  Current/Best:   11.65/  12.71 GFLOPS | Progress: (8/20) | 4.61 s
    [Task  5/25]  Current/Best:   11.16/  18.08 GFLOPS | Progress: (12/20) | 7.66 s
    [Task  5/25]  Current/Best:   11.63/  22.78 GFLOPS | Progress: (16/20) | 9.10 s
    [Task  5/25]  Current/Best:   11.95/  22.78 GFLOPS | Progress: (20/20) | 11.03 s Done.
+
    [Task  6/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  6/25]  Current/Best:   12.21/  20.77 GFLOPS | Progress: (4/20) | 4.07 s
    [Task  6/25]  Current/Best:   18.77/  20.77 GFLOPS | Progress: (8/20) | 5.85 s
    [Task  6/25]  Current/Best:   13.21/  20.77 GFLOPS | Progress: (12/20) | 7.80 s
    [Task  6/25]  Current/Best:   19.93/  20.77 GFLOPS | Progress: (16/20) | 10.06 s
    [Task  6/25]  Current/Best:    3.70/  20.77 GFLOPS | Progress: (20/20) | 12.57 s Done.
+
    [Task  7/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  7/25]  Current/Best:   10.92/  12.28 GFLOPS | Progress: (4/20) | 3.59 s
    [Task  7/25]  Current/Best:   20.22/  20.85 GFLOPS | Progress: (8/20) | 5.10 s
    [Task  7/25]  Current/Best:   15.73/  20.89 GFLOPS | Progress: (12/20) | 7.01 s
    [Task  7/25]  Current/Best:   12.23/  20.89 GFLOPS | Progress: (16/20) | 9.06 s
    [Task  7/25]  Current/Best:    6.39/  21.75 GFLOPS | Progress: (20/20) | 11.51 s Done.
+
    [Task  8/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  8/25]  Current/Best:    9.95/  14.15 GFLOPS | Progress: (4/20) | 2.86 s
    [Task  8/25]  Current/Best:   10.30/  14.15 GFLOPS | Progress: (8/20) | 7.96 s
    [Task  8/25]  Current/Best:   12.64/  14.15 GFLOPS | Progress: (12/20) | 14.54 s
    [Task  8/25]  Current/Best:   18.88/  18.88 GFLOPS | Progress: (16/20) | 16.64 s
    [Task  8/25]  Current/Best:   19.95/  19.95 GFLOPS | Progress: (20/20) | 23.68 s Done.
+
    [Task  9/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  9/25]  Current/Best:   14.13/  14.38 GFLOPS | Progress: (4/20) | 11.90 s
    [Task  9/25]  Current/Best:   23.38/  23.38 GFLOPS | Progress: (8/20) | 13.73 s
    [Task  9/25]  Current/Best:    8.23/  23.38 GFLOPS | Progress: (12/20) | 16.29 s
    [Task  9/25]  Current/Best:   17.99/  23.38 GFLOPS | Progress: (16/20) | 19.13 s
    [Task  9/25]  Current/Best:    9.01/  23.38 GFLOPS | Progress: (20/20) | 27.66 s
    [Task 10/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 10/25]  Current/Best:   18.39/  18.39 GFLOPS | Progress: (4/20) | 2.47 s
    [Task 10/25]  Current/Best:   15.68/  18.39 GFLOPS | Progress: (8/20) | 4.13 s
    [Task 10/25]  Current/Best:   12.56/  19.13 GFLOPS | Progress: (12/20) | 5.68 s
    [Task 10/25]  Current/Best:   19.00/  20.20 GFLOPS | Progress: (16/20) | 6.78 s
    [Task 10/25]  Current/Best:    8.92/  20.20 GFLOPS | Progress: (20/20
 ) | 8.34 s Done.
+
    [Task 11/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 11/25]  Current/Best:   12.17/  18.05 GFLOPS | Progress: (4/20) | 3.31 s
    [Task 11/25]  Current/Best:   16.81/  18.05 GFLOPS | Progress: (8/20) | 6.10 s
    [Task 11/25]  Current/Best:   18.16/  18.16 GFLOPS | Progress: (12/20) | 8.18 s
    [Task 11/25]  Current/Best:   13.40/  21.09 GFLOPS | Progress: (16/20) | 11.16 s
    [Task 11/25]  Current/Best:   19.42/  21.59 GFLOPS | Progress: (20/20) | 13.23 s Done.
+
    [Task 12/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 12/25]  Current/Best:    7.80/  18.14 GFLOPS | Progress: (4/20) | 5.65 s
    [Task 12/25]  Current/Best:    5.23/  18.14 GFLOPS | Progress: (8/20) | 9.58 s
    [Task 12/25]  Current/Best:   18.88/  18.88 GFLOPS | Progress: (12/20) | 11.57 s
    [Task 12/25]  Current/Best:   15.26/  18.88 GFLOPS | Progress: (16/20) | 14.52 s
    [Task 12/25]  Current/Best:   15.13/  18.99 GFLOPS | Progress: (20/20) | 16.44 s Done.
+
    [Task 13/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 13/25]  Current/Best:    8.66/  17.31 GFLOPS | Progress: (4/20) | 3.71 s
    [Task 13/25]  Current/Best:   16.04/  20.93 GFLOPS | Progress: (8/20) | 6.32 s
    [Task 13/25]  Current/Best:   19.45/  21.48 GFLOPS | Progress: (12/20) | 9.42 s
    [Task 13/25]  Current/Best:   12.24/  21.48 GFLOPS | Progress: (16/20) | 12.87 s
    [Task 13/25]  Current/Best:   18.62/  21.48 GFLOPS | Progress: (20/20) | 15.18 s Done.
+
    [Task 14/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 14/25]  Current/Best:   13.56/  13.56 GFLOPS | Progress: (4/20) | 3.28 s
    [Task 14/25]  Current/Best:    6.10/  13.56 GFLOPS | Progress: (8/20) | 5.51 s
    [Task 14/25]  Current/Best:   20.48/  20.48 GFLOPS | Progress: (12/20) | 8.20 s
    [Task 14/25]  Current/Best:   16.65/  20.48 GFLOPS | Progress: (16/20) | 9.84 s Done.
+
    [Task 14/25]  Current/Best:   17.12/  20.48 GFLOPS | Progress: (20/20) | 11.64 s
    [Task 15/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 15/25]  Current/Best:   16.16/  17.66 GFLOPS | Progress: (4/20) | 2.64 s
    [Task 15/25]  Current/Best:   14.28/  18.02 GFLOPS | Progress: (8/20) | 4.00 s
    [Task 15/25]  Current/Best:   10.37/  22.21 GFLOPS | Progress: (12/20) | 6.23 s
    [Task 15/25]  Current/Best:   20.38/  22.21 GFLOPS | Progress: (16/20) | 9.57 s
    [Task 15/25]  Current/Best:    9.70/  22.21 GFLOPS | Progress: (20/20) | 10.59 s
    [Task 16/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 16/25]  Current/Best:   19.70/  19.70 GFLOPS | Progress: (4/20) | 2.88 s
    [Task 16/25]  Current/Best:    3.04/  19.70 GFLOPS | Progress: (8/20) | 4.51 s
    [Task 16/25]  Current/Best:   19.64/  19.70 GFLOPS | Progress: (12/20) | 5.74 s
    [Task 16/25]  Current/Best:   17.98/  19.70 GFLOPS | Progress: (16/20) |
  7.10 s
    [Task 16/25]  Current/Best:   10.01/  19.70 GFLOPS | Progress: (20/20) | 9.27 s Done.
+
    [Task 17/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 17/25]  Current/Best:   13.39/  18.84 GFLOPS | Progress: (4/20) | 4.77 s
    [Task 17/25]  Current/Best:   14.43/  23.23 GFLOPS | Progress: (8/20) | 7.65 s
    [Task 17/25]  Current/Best:   17.08/  23.23 GFLOPS | Progress: (12/20) | 9.71 s
    [Task 17/25]  Current/Best:   16.53/  23.23 GFLOPS | Progress: (16/20) | 11.91 s
    [Task 17/25]  Current/Best:   10.03/  23.23 GFLOPS | Progress: (20/20) | 14.08 s Done.
+
    [Task 18/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 18/25]  Current/Best:   11.33/  17.84 GFLOPS | Progress: (4/20) | 3.77 s
    [Task 18/25]  Current/Best:   10.55/  19.85 GFLOPS | Progress: (8/20) | 7.48 s
    [Task 18/25]  Current/Best:   19.19/  19.85 GFLOPS | Progress: (12/20) | 9.40 s
    [Task 18/25]  Current/Best:   10.05/  19.85 GFLOPS | Progress: (16/20) | 13.23 s
    [Task 18/25]  Current/Best:   20.90/  20.90 GFLOPS | Progress: (20/20) | 14.74 s Done.
+
    [Task 19/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 19/25]  Current/Best:    7.06/  20.32 GFLOPS | Progress: (4/20) | 6.08 s
    [Task 19/25]  Current/Best:    2.60/  20.32 GFLOPS | Progress: (8/20) | 9.45 s
    [Task 19/25]  Current/Best:   19.76/  21.53 GFLOPS | Progress: (12/20) | 12.40 s
    [Task 19/25]  Current/Best:   15.38/  21.64 GFLOPS | Progress: (16/20) | 15.38 s
    [Task 19/25]  Current/Best:    2.70/  23.46 GFLOPS | Progress: (20/20) | 18.19 s Done.
+
    [Task 20/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 20/25]  Current/Best:    9.24/  15.19 GFLOPS | Progress: (4/20) | 3.32 s Done.
      Done.
-
    [Task 20/25]  Current/Best:    9.75/  15.20 GFLOPS | Progress: (8/20) | 6.86 s
    [Task 20/25]  Current/Best:    2.32/  16.72 GFLOPS | Progress: (12/20) | 10.80 s
    [Task 20/25]  Current/Best:   12.28/  16.72 GFLOPS | Progress: (16/20) | 14.59 s
    [Task 20/25]  Current/Best:   13.02/  22.23 GFLOPS | Progress: (20/20) | 16.70 s
    [Task 21/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 21/25]  Current/Best:    6.42/  17.59 GFLOPS | Progress: (4/20) | 3.21 s
    [Task 21/25]  Current/Best:   14.64/  17.59 GFLOPS | Progress: (8/20) | 4.85 s
    [Task 21/25]  Current/Best:    1.61/  17.59 GFLOPS | Progress: (12/20) | 6.94 s
    [Task 21/25]  Current/Best:   18.15/  18.15 GFLOPS | Progress: (16/20) | 10.47 s
    [Task 21/25]  Current/Best:    4.46/  18.15 GFLOPS | Progress: (20/20) | 17.74 s
    [Task 22/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 22/25]  Current/Best:    2.70/  17.06 GFLOPS | Progress: (4/20
 ) | 2.63 s
    [Task 22/25]  Current/Best:    8.75/  21.78 GFLOPS | Progress: (8/20) | 4.70 s
    [Task 22/25]  Current/Best:   20.02/  21.78 GFLOPS | Progress: (12/20) | 7.06 s
    [Task 22/25]  Current/Best:   15.38/  21.78 GFLOPS | Progress: (16/20) | 9.18 s
    [Task 22/25]  Current/Best:   14.29/  21.78 GFLOPS | Progress: (20/20) | 10.92 s Done.
-
    [Task 23/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 23/25]  Current/Best:   17.17/  20.65 GFLOPS | Progress: (4/20) | 3.22 s
    [Task 23/25]  Current/Best:   13.80/  20.65 GFLOPS | Progress: (8/20) | 6.72 s
    [Task 23/25]  Current/Best:   20.95/  21.57 GFLOPS | Progress: (12/20) | 8.62 s
    [Task 23/25]  Current/Best:    6.42/  21.57 GFLOPS | Progress: (16/20) | 15.77 s
    [Task 23/25]  Current/Best:    7.84/  21.57 GFLOPS | Progress: (20/20) | 20.02 s Done.
-
    [Task 24/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 24/25]  Current/Best:    8.38/   8.38 GFLOPS | Progress: (4/20) | 11.75 s
    [Task 24/25]  Current/Best:    2.13/   8.38 GFLOPS | Progress: (8/20) | 22.71 s
    [Task 24/25]  Current/Best:    4.40/   8.38 GFLOPS | Progress: (12/20) | 34.20 s Done.
+
    [Task 20/25]  Current/Best:   10.05/  15.19 GFLOPS | Progress: (8/20) | 6.73 s
    [Task 20/25]  Current/Best:    2.32/  16.76 GFLOPS | Progress: (12/20) | 10.69 s
    [Task 20/25]  Current/Best:   12.52/  16.76 GFLOPS | Progress: (16/20) | 14.65 s
    [Task 20/25]  Current/Best:   13.42/  22.02 GFLOPS | Progress: (20/20) | 16.78 s
    [Task 21/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 21/25]  Current/Best:    6.40/  17.61 GFLOPS | Progress: (4/20) | 3.24 s
    [Task 21/25]  Current/Best:   14.60/  17.61 GFLOPS | Progress: (8/20) | 4.84 s
    [Task 21/25]  Current/Best:    1.61/  17.61 GFLOPS | Progress: (12/20) | 6.99 s
    [Task 21/25]  Current/Best:   18.00/  18.00 GFLOPS | Progress: (16/20) | 10.51 s
    [Task 21/25]  Current/Best:    4.47/  18.00 GFLOPS | Progress: (20/20) | 17.92 s
    [Task 22/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 22/25]  Current/Best:    2.70/  17.02 GFLOPS | Progress: (4/20
 ) | 2.64 s
    [Task 22/25]  Current/Best:    8.91/  21.90 GFLOPS | Progress: (8/20) | 4.70 s
    [Task 22/25]  Current/Best:   19.94/  21.90 GFLOPS | Progress: (12/20) | 7.08 s
    [Task 22/25]  Current/Best:   15.34/  21.90 GFLOPS | Progress: (16/20) | 9.25 s
    [Task 22/25]  Current/Best:   14.68/  21.90 GFLOPS | Progress: (20/20) | 10.99 s Done.
+
    [Task 23/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 23/25]  Current/Best:   17.45/  20.55 GFLOPS | Progress: (4/20) | 3.18 s
    [Task 23/25]  Current/Best:   15.65/  20.55 GFLOPS | Progress: (8/20) | 6.56 s
    [Task 23/25]  Current/Best:   20.84/  21.61 GFLOPS | Progress: (12/20) | 8.43 s
    [Task 23/25]  Current/Best:    6.33/  21.61 GFLOPS | Progress: (16/20) | 15.49 s
    [Task 23/25]  Current/Best:    7.83/  21.61 GFLOPS | Progress: (20/20) | 19.75 s Done.
+
    [Task 24/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 24/25]  Current/Best:    8.21/   8.21 GFLOPS | Progress: (4/20) | 11.76 s
    [Task 24/25]  Current/Best:    3.60/   8.21 GFLOPS | Progress: (8/20) | 22.97 s
    [Task 24/25]  Current/Best:    4.07/   8.21 GFLOPS | Progress: (12/20) | 33.71 s Done.
      Done.
-
    [Task 24/25]  Current/Best:    6.65/   8.60 GFLOPS | Progress: (16/20) | 40.04 s
    [Task 24/25]  Current/Best:    3.39/   8.60 GFLOPS | Progress: (20/20) | 46.07 s Done.
-
    [Task 25/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 25/25]  Current/Best:    1.55/   2.72 GFLOPS | Progress: (4/20) | 11.53 s
    [Task 25/25]  Current/Best:    6.03/   7.79 GFLOPS | Progress: (8/20) | 22.73 s
    [Task 25/25]  Current/Best:    5.94/   7.79 GFLOPS | Progress: (12/20) | 33.99 s
    [Task 25/25]  Current/Best:    5.83/   8.53 GFLOPS | Progress: (16/20) | 35.82 s
    [Task 25/25]  Current/Best:    2.87/   8.70 GFLOPS | Progress: (20/20) | 46.49 s
+
    [Task 24/25]  Current/Best:    6.77/   8.80 GFLOPS | Progress: (16/20) | 39.47 s
    [Task 24/25]  Current/Best:    3.31/   9.11 GFLOPS | Progress: (20/20) | 45.59 s Done.
+
    [Task 25/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 25/25]  Current/Best:    1.55/   2.78 GFLOPS | Progress: (4/20) | 11.55 s
    [Task 25/25]  Current/Best:    5.65/   7.91 GFLOPS | Progress: (8/20) | 22.76 s
    [Task 25/25]  Current/Best:    6.04/   7.91 GFLOPS | Progress: (12/20) | 34.03 s
    [Task 25/25]  Current/Best:    5.83/   9.44 GFLOPS | Progress: (16/20) | 35.75 s
    [Task 25/25]  Current/Best:    2.94/   9.44 GFLOPS | Progress: (20/20) | 46.44 s
 
 
 
@@ -735,8 +735,8 @@ improvement in comparing the optimized model to the unoptimized model.
 
  .. code-block:: none
 
-    optimized: {'mean': 411.37254193999524, 'median': 411.352962549995, 'std': 0.3513869974955923}
-    unoptimized: {'mean': 494.4699098900002, 'median': 494.50663965000103, 'std': 0.9450088039594712}
+    optimized: {'mean': 414.72895694999806, 'median': 414.71009494998725, 'std': 0.2884264773767498}
+    unoptimized: {'mean': 496.8772450499864, 'median': 496.6672595999853, 'std': 1.4294658444955355}
 
 
 
@@ -759,7 +759,7 @@ profiling/benchmarking.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 10 minutes  19.842 seconds)
+   **Total running time of the script:** ( 10 minutes  23.018 seconds)
 
 
 .. _sphx_glr_download_tutorial_autotvm_relay_x86.py:
diff --git a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
index 66e687fb9..baba208db 100644
--- a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
+++ b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
@@ -269,7 +269,7 @@ device and returns the measured cost. Network overhead is excluded.
 
  .. code-block:: none
 
-    1.298e-07 secs/op
+    1.292e-07 secs/op
 
 
 
diff --git a/docs/_sources/tutorial/intro_topi.rst.txt b/docs/_sources/tutorial/intro_topi.rst.txt
index ccbad5116..7f05afcca 100644
--- a/docs/_sources/tutorial/intro_topi.rst.txt
+++ b/docs/_sources/tutorial/intro_topi.rst.txt
@@ -262,7 +262,7 @@ As you can see, scheduled stages of computation have been accumulated and we can
 
  .. code-block:: none
 
-    [stage(a, placeholder(a, 0xce7d210)), stage(b, placeholder(b, 0xbc13d50)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min= [...]
+    [stage(a, placeholder(a, 0x219218b0)), stage(b, placeholder(b, 0xe250130)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min [...]
 
 
 
diff --git a/docs/_sources/tutorial/sg_execution_times.rst.txt b/docs/_sources/tutorial/sg_execution_times.rst.txt
index ec1499958..092ac15e5 100644
--- a/docs/_sources/tutorial/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorial/sg_execution_times.rst.txt
@@ -5,29 +5,29 @@
 
 Computation times
 =================
-**13:47.370** total execution time for **tutorial** files:
+**13:30.797** total execution time for **tutorial** files:
 
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``)                 | 10:19.842 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``)                 | 10:23.018 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 01:32.785 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 01:13.161 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``)     | 01:00.519 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``)     | 01:01.382 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``)                 | 00:27.674 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``)                 | 00:28.210 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``)               | 00:24.343 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``)               | 00:23.705 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``)       | 00:01.404 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``)                               | 00:00.666 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``)                               | 00:00.658 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``)       | 00:00.512 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.144 | 0.0 MB |
-+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``)   | 00:00.000 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.143 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_tutorial_introduction.py` (``introduction.py``)                           | 00:00.000 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_tutorial_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``)   | 00:00.000 | 0.0 MB |
++------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_tutorial_tvmc_python.py` (``tvmc_python.py``)                             | 00:00.000 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_tutorial_install.py` (``install.py``)                                     | 00:00.000 | 0.0 MB |
diff --git a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
index eb22ecf27..b4a8c5f98 100644
--- a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
+++ b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
@@ -288,8 +288,8 @@ helper function to run a profile of the TVM generated code.
 
  .. code-block:: none
 
-    Numpy running time: 0.000009
-    naive: 0.000006
+    Numpy running time: 0.000008
+    naive: 0.000007
 
 
 
@@ -499,10 +499,10 @@ We can now compare the different schedules
  .. code-block:: none
 
                 Operator                  Timing             Performance
-                   numpy    8.620539999810716e-06                    1.0
-                   naive              5.8315e-06      0.6764657434601595
-                parallel              6.9705e-06      0.8085920371755195
-                  vector             2.46388e-05       2.858150417554005
+                   numpy    8.178030002454761e-06                    1.0
+                   naive               6.837e-06      0.8360204105325817
+                parallel    6.962200000000001e-06     0.8513297209609392
+                  vector    2.4714099999999998e-05    3.0220114125995723
 
 
 
@@ -923,7 +923,7 @@ matrix multiplication.
 
  .. code-block:: none
 
-    Numpy running time: 0.018293
+    Numpy running time: 0.018520
 
 
 
@@ -983,7 +983,7 @@ optimizations.
 
     /workspace/python/tvm/driver/build_module.py:264: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
       "target_host parameter is going to be deprecated. "
-    none: 3.389984
+    none: 3.452831
 
 
 
@@ -1088,7 +1088,7 @@ schedule.
 
     /workspace/python/tvm/driver/build_module.py:264: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
       "target_host parameter is going to be deprecated. "
-    blocking: 0.301645
+    blocking: 0.292013
 
 
 
@@ -1186,7 +1186,7 @@ already cache friendly from our previous optimizations.
 
     /workspace/python/tvm/driver/build_module.py:264: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
       "target_host parameter is going to be deprecated. "
-    vectorization: 0.334908
+    vectorization: 0.336959
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1262,7 +1262,7 @@ more cache friendly.
 
     /workspace/python/tvm/driver/build_module.py:264: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
       "target_host parameter is going to be deprecated. "
-    loop permutation: 0.115885
+    loop permutation: 0.121260
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1363,7 +1363,7 @@ optimized schedule.
 
     /workspace/python/tvm/driver/build_module.py:264: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
       "target_host parameter is going to be deprecated. "
-    array packing: 0.108837
+    array packing: 0.111406
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1458,7 +1458,7 @@ to `C` when all the block results are ready.
 
     /workspace/python/tvm/driver/build_module.py:264: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
       "target_host parameter is going to be deprecated. "
-    block caching: 0.111126
+    block caching: 0.111786
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1546,7 +1546,7 @@ of thread-level parallelization.
 
     /workspace/python/tvm/driver/build_module.py:264: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
       "target_host parameter is going to be deprecated. "
-    parallelization: 0.144419
+    parallelization: 0.145750
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1627,13 +1627,13 @@ working, we can compare the results.
  .. code-block:: none
 
                 Operator                  Timing             Performance
-                    none            3.3899836249                     1.0
-                blocking            0.3016445303     0.08898111721967332
-           vectorization            0.3349076508     0.09879329455754504
-        loop permutation     0.11588464980000002     0.03418442760277889
-           array packing            0.1088372546    0.032105539920774855
-           block caching     0.11112594270000001     0.03278067241498197
-         parallelization            0.1444187422     0.04260160466240015
+                    none      3.4528312396000005                     1.0
+                blocking            0.2920132543     0.08457211894718285
+           vectorization     0.33695932559999997     0.09758928317592366
+        loop permutation     0.12125966699999999    0.035118909261851884
+           array packing            0.1114056159     0.03226500462064459
+           block caching            0.1117857104    0.032375086600800684
+         parallelization     0.14575047530000002    0.042211873441253024
 
 
 
@@ -1675,7 +1675,7 @@ the computation for specific platforms.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  0.519 seconds)
+   **Total running time of the script:** ( 1 minutes  1.382 seconds)
 
 
 .. _sphx_glr_download_tutorial_tensor_expr_get_started.py:
diff --git a/docs/commit_hash b/docs/commit_hash
index 2622cd117..1d81f4bcb 100644
--- a/docs/commit_hash
+++ b/docs/commit_hash
@@ -1 +1 @@
-caa0d59c335713d29b1e63714395fc2ba3d979dc
+c334790bf88694db8d748d2299f50f2b04c46486
diff --git a/docs/how_to/compile_models/from_mxnet.html b/docs/how_to/compile_models/from_mxnet.html
index b8ac33426..45d0bdb23 100644
--- a/docs/how_to/compile_models/from_mxnet.html
+++ b/docs/how_to/compile_models/from_mxnet.html
@@ -422,7 +422,7 @@ to download the full example code</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;x&quot;</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#tuple" title="builtins.tuple" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">x</span><span class="o">.</span><span class="n">shape</span></a><span class="p">)</span>
 </pre></div>
 </div>
-<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip812ab175-3adc-45fc-a776-1bd65330f280 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip2cde5cc7-4677-4c09-92ea-a4a047eb10a2 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
 x (1, 3, 224, 224)
 </pre></div>
 </div>
diff --git a/docs/how_to/compile_models/from_oneflow.html b/docs/how_to/compile_models/from_oneflow.html
index 6991f26a1..adb3a72c9 100644
--- a/docs/how_to/compile_models/from_oneflow.html
+++ b/docs/how_to/compile_models/from_oneflow.html
@@ -427,105 +427,143 @@ python3 -m pip install -f https://release.oneflow.info <span class="nv">oneflow<
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: &quot;https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip&quot; to /workspace/.oneflow/flowvision_cache/resnet18.zip
 
   0%|          | 0.00/41.5M [00:00&lt;?, ?B/s]
-  0%|          | 16.0k/41.5M [00:00&lt;07:57, 91.1kB/s]
-  0%|          | 40.0k/41.5M [00:00&lt;06:09, 117kB/s]
-  0%|          | 72.0k/41.5M [00:00&lt;04:56, 147kB/s]
-  0%|          | 96.0k/41.5M [00:00&lt;05:05, 142kB/s]
-  0%|          | 128k/41.5M [00:00&lt;04:37, 156kB/s]
-  0%|          | 160k/41.5M [00:01&lt;04:23, 165kB/s]
-  0%|          | 192k/41.5M [00:01&lt;04:14, 170kB/s]
-  1%|          | 232k/41.5M [00:01&lt;03:50, 188kB/s]
-  1%|          | 264k/41.5M [00:01&lt;03:52, 186kB/s]
-  1%|          | 304k/41.5M [00:01&lt;03:37, 198kB/s]
-  1%|          | 352k/41.5M [00:01&lt;03:15, 221kB/s]
-  1%|          | 392k/41.5M [00:02&lt;03:13, 222kB/s]
-  1%|1         | 440k/41.5M [00:02&lt;03:01, 237kB/s]
-  1%|1         | 488k/41.5M [00:02&lt;02:53, 248kB/s]
-  1%|1         | 536k/41.5M [00:02&lt;02:48, 255kB/s]
-  1%|1         | 584k/41.5M [00:02&lt;02:45, 260kB/s]
-  2%|1         | 640k/41.5M [00:03&lt;02:34, 277kB/s]
-  2%|1         | 696k/41.5M [00:03&lt;02:28, 289kB/s]
-  2%|1         | 760k/41.5M [00:03&lt;02:17, 311kB/s]
-  2%|1         | 824k/41.5M [00:03&lt;02:10, 326kB/s]
-  2%|2         | 888k/41.5M [00:03&lt;02:06, 337kB/s]
-  2%|2         | 960k/41.5M [00:03&lt;01:58, 358kB/s]
-  2%|2         | 1.01M/41.5M [00:04&lt;01:53, 373kB/s]
-  3%|2         | 1.09M/41.5M [00:04&lt;01:46, 397kB/s]
-  3%|2         | 1.16M/41.5M [00:04&lt;01:42, 413kB/s]
-  3%|2         | 1.24M/41.5M [00:04&lt;01:39, 425kB/s]
-  3%|3         | 1.33M/41.5M [00:04&lt;01:34, 447kB/s]
-  3%|3         | 1.42M/41.5M [00:05&lt;01:28, 476kB/s]
-  4%|3         | 1.52M/41.5M [00:05&lt;01:24, 496kB/s]
-  4%|3         | 1.62M/41.5M [00:05&lt;01:19, 524kB/s]
-  4%|4         | 1.73M/41.5M [00:05&lt;01:14, 557kB/s]
-  4%|4         | 1.84M/41.5M [00:05&lt;01:11, 580kB/s]
-  5%|4         | 1.95M/41.5M [00:05&lt;01:08, 610kB/s]
-  5%|5         | 2.09M/41.5M [00:06&lt;01:02, 658kB/s]
-  5%|5         | 2.23M/41.5M [00:06&lt;00:58, 705kB/s]
-  6%|5         | 2.38M/41.5M [00:06&lt;00:54, 751kB/s]
-  6%|6         | 2.54M/41.5M [00:06&lt;00:50, 811kB/s]
-  7%|6         | 2.71M/41.5M [00:06&lt;00:46, 866kB/s]
-  7%|6         | 2.90M/41.5M [00:07&lt;00:43, 932kB/s]
-  7%|7         | 3.09M/41.5M [00:07&lt;00:40, 992kB/s]
-  8%|7         | 3.31M/41.5M [00:07&lt;00:37, 1.07MB/s]
-  9%|8         | 3.54M/41.5M [00:07&lt;00:34, 1.15MB/s]
-  9%|9         | 3.78M/41.5M [00:07&lt;00:32, 1.22MB/s]
- 10%|9         | 4.04M/41.5M [00:07&lt;00:30, 1.30MB/s]
- 10%|#         | 4.31M/41.5M [00:08&lt;00:28, 1.39MB/s]
- 11%|#1        | 4.61M/41.5M [00:08&lt;00:25, 1.49MB/s]
- 12%|#1        | 4.91M/41.5M [00:08&lt;00:22, 1.74MB/s]
- 13%|#2        | 5.24M/41.5M [00:08&lt;00:19, 1.93MB/s]
- 13%|#3        | 5.44M/41.5M [00:08&lt;00:19, 1.95MB/s]
- 14%|#3        | 5.63M/41.5M [00:08&lt;00:22, 1.66MB/s]
- 14%|#4        | 5.98M/41.5M [00:08&lt;00:18, 1.99MB/s]
- 15%|#5        | 6.37M/41.5M [00:09&lt;00:16, 2.27MB/s]
- 16%|#5        | 6.59M/41.5M [00:09&lt;00:16, 2.29MB/s]
- 16%|#6        | 6.82M/41.5M [00:09&lt;00:18, 1.93MB/s]
- 17%|#7        | 7.23M/41.5M [00:09&lt;00:15, 2.36MB/s]
- 19%|#8        | 7.69M/41.5M [00:09&lt;00:12, 2.91MB/s]
- 19%|#9        | 7.99M/41.5M [00:09&lt;00:12, 2.71MB/s]
- 20%|#9        | 8.27M/41.5M [00:09&lt;00:15, 2.32MB/s]
- 21%|##1       | 8.72M/41.5M [00:10&lt;00:12, 2.72MB/s]
- 22%|##2       | 9.26M/41.5M [00:10&lt;00:09, 3.40MB/s]
- 23%|##3       | 9.62M/41.5M [00:10&lt;00:10, 3.16MB/s]
- 24%|##3       | 9.95M/41.5M [00:10&lt;00:12, 2.71MB/s]
- 25%|##5       | 10.5M/41.5M [00:10&lt;00:11, 2.92MB/s]
- 27%|##6       | 11.1M/41.5M [00:10&lt;00:08, 3.61MB/s]
- 28%|##8       | 11.8M/41.5M [00:10&lt;00:07, 4.27MB/s]
- 29%|##9       | 12.2M/41.5M [00:11&lt;00:08, 3.82MB/s]
- 30%|###       | 12.6M/41.5M [00:11&lt;00:09, 3.28MB/s]
- 32%|###1      | 13.2M/41.5M [00:11&lt;00:08, 3.58MB/s]
- 34%|###3      | 14.0M/41.5M [00:11&lt;00:06, 4.40MB/s]
- 36%|###5      | 14.8M/41.5M [00:11&lt;00:05, 5.20MB/s]
- 37%|###7      | 15.4M/41.5M [00:11&lt;00:05, 4.65MB/s]
- 38%|###8      | 15.8M/41.5M [00:11&lt;00:06, 4.01MB/s]
- 40%|###9      | 16.6M/41.5M [00:12&lt;00:05, 4.86MB/s]
- 42%|####2     | 17.5M/41.5M [00:12&lt;00:04, 5.71MB/s]
- 44%|####3     | 18.1M/41.5M [00:12&lt;00:04, 5.20MB/s]
- 45%|####4     | 18.6M/41.5M [00:12&lt;00:05, 4.49MB/s]
- 47%|####7     | 19.5M/41.5M [00:12&lt;00:04, 5.68MB/s]
- 50%|####9     | 20.6M/41.5M [00:12&lt;00:03, 6.64MB/s]
- 51%|#####1    | 21.2M/41.5M [00:12&lt;00:03, 6.02MB/s]
- 53%|#####2    | 21.9M/41.5M [00:13&lt;00:03, 5.19MB/s]
- 55%|#####5    | 23.0M/41.5M [00:13&lt;00:02, 6.58MB/s]
- 58%|#####8    | 24.1M/41.5M [00:13&lt;00:02, 7.67MB/s]
- 60%|######    | 24.9M/41.5M [00:13&lt;00:02, 6.96MB/s]
- 62%|######1   | 25.6M/41.5M [00:13&lt;00:02, 6.02MB/s]
- 65%|######4   | 26.9M/41.5M [00:13&lt;00:02, 7.55MB/s]
- 68%|######8   | 28.2M/41.5M [00:13&lt;00:01, 8.86MB/s]
- 70%|#######   | 29.2M/41.5M [00:13&lt;00:01, 8.01MB/s]
- 72%|#######2  | 30.0M/41.5M [00:14&lt;00:01, 6.94MB/s]
- 75%|#######5  | 31.3M/41.5M [00:14&lt;00:01, 8.35MB/s]
- 79%|#######8  | 32.6M/41.5M [00:14&lt;00:00, 9.59MB/s]
- 81%|########1 | 33.6M/41.5M [00:14&lt;00:00, 8.55MB/s]
- 83%|########3 | 34.5M/41.5M [00:14&lt;00:00, 7.40MB/s]
- 86%|########6 | 35.7M/41.5M [00:14&lt;00:00, 8.45MB/s]
- 89%|########9 | 37.1M/41.5M [00:14&lt;00:00, 9.58MB/s]
- 92%|#########1| 38.0M/41.5M [00:15&lt;00:00, 8.60MB/s]
- 94%|#########3| 38.9M/41.5M [00:15&lt;00:00, 7.43MB/s]
- 97%|#########6| 40.1M/41.5M [00:15&lt;00:00, 8.47MB/s]
-100%|#########9| 41.5M/41.5M [00:15&lt;00:00, 9.60MB/s]
-100%|##########| 41.5M/41.5M [00:15&lt;00:00, 2.82MB/s]
+  0%|          | 16.0k/41.5M [00:00&lt;08:17, 87.5kB/s]
+  0%|          | 32.0k/41.5M [00:00&lt;08:18, 87.2kB/s]
+  0%|          | 48.0k/41.5M [00:00&lt;08:18, 87.1kB/s]
+  0%|          | 64.0k/41.5M [00:00&lt;08:18, 87.1kB/s]
+  0%|          | 80.0k/41.5M [00:00&lt;08:19, 87.0kB/s]
+  0%|          | 96.0k/41.5M [00:01&lt;08:18, 87.0kB/s]
+  0%|          | 112k/41.5M [00:01&lt;08:18, 87.0kB/s]
+  0%|          | 128k/41.5M [00:01&lt;08:18, 87.0kB/s]
+  0%|          | 144k/41.5M [00:01&lt;08:18, 87.0kB/s]
+  0%|          | 168k/41.5M [00:01&lt;07:11, 100kB/s]
+  0%|          | 184k/41.5M [00:02&lt;07:29, 96.3kB/s]
+  0%|          | 208k/41.5M [00:02&lt;06:45, 107kB/s]
+  1%|          | 232k/41.5M [00:02&lt;06:19, 114kB/s]
+  1%|          | 256k/41.5M [00:02&lt;06:03, 119kB/s]
+  1%|          | 280k/41.5M [00:02&lt;05:53, 122kB/s]
+  1%|          | 304k/41.5M [00:03&lt;05:45, 125kB/s]
+  1%|          | 336k/41.5M [00:03&lt;05:09, 140kB/s]
+  1%|          | 368k/41.5M [00:03&lt;04:47, 150kB/s]
+  1%|          | 400k/41.5M [00:03&lt;04:34, 157kB/s]
+  1%|1         | 440k/41.5M [00:03&lt;04:05, 175kB/s]
+  1%|1         | 480k/41.5M [00:03&lt;03:48, 188kB/s]
+  1%|1         | 528k/41.5M [00:04&lt;03:24, 210kB/s]
+  1%|1         | 584k/41.5M [00:04&lt;03:00, 238kB/s]
+  2%|1         | 640k/41.5M [00:04&lt;02:45, 258kB/s]
+  2%|1         | 696k/41.5M [00:04&lt;02:37, 272kB/s]
+  2%|1         | 768k/41.5M [00:04&lt;02:18, 308kB/s]
+  2%|1         | 848k/41.5M [00:05&lt;02:03, 346kB/s]
+  2%|2         | 928k/41.5M [00:05&lt;01:54, 373kB/s]
+  2%|2         | 1.00M/41.5M [00:05&lt;01:41, 418kB/s]
+  3%|2         | 1.10M/41.5M [00:05&lt;01:31, 461kB/s]
+  3%|2         | 1.15M/41.5M [00:06&lt;02:48, 250kB/s]
+  3%|3         | 1.43M/41.5M [00:06&lt;01:19, 529kB/s]
+  4%|3         | 1.52M/41.5M [00:06&lt;01:21, 516kB/s]
+  4%|3         | 1.60M/41.5M [00:06&lt;01:22, 507kB/s]
+  4%|4         | 1.69M/41.5M [00:06&lt;01:23, 500kB/s]
+  4%|4         | 1.79M/41.5M [00:07&lt;01:20, 517kB/s]
+  5%|4         | 1.88M/41.5M [00:07&lt;01:19, 519kB/s]
+  5%|4         | 1.98M/41.5M [00:07&lt;01:17, 533kB/s]
+  5%|5         | 2.09M/41.5M [00:07&lt;01:16, 542kB/s]
+  5%|5         | 2.20M/41.5M [00:07&lt;01:13, 562kB/s]
+  6%|5         | 2.30M/41.5M [00:08&lt;01:13, 563kB/s]
+  6%|5         | 2.41M/41.5M [00:08&lt;01:11, 577kB/s]
+  6%|6         | 2.52M/41.5M [00:08&lt;01:09, 586kB/s]
+  6%|6         | 2.63M/41.5M [00:08&lt;01:07, 606kB/s]
+  7%|6         | 2.74M/41.5M [00:08&lt;01:06, 607kB/s]
+  7%|6         | 2.85M/41.5M [00:09&lt;01:06, 608kB/s]
+  7%|7         | 2.97M/41.5M [00:09&lt;01:05, 621kB/s]
+  7%|7         | 3.09M/41.5M [00:09&lt;01:03, 631kB/s]
+  8%|7         | 3.20M/41.5M [00:09&lt;01:04, 624kB/s]
+  8%|7         | 3.31M/41.5M [00:09&lt;01:03, 633kB/s]
+  8%|8         | 3.43M/41.5M [00:09&lt;01:02, 639kB/s]
+  9%|8         | 3.54M/41.5M [00:10&lt;01:03, 630kB/s]
+  9%|8         | 3.66M/41.5M [00:10&lt;01:02, 637kB/s]
+  9%|9         | 3.77M/41.5M [00:10&lt;01:02, 628kB/s]
+  9%|9         | 3.88M/41.5M [00:10&lt;01:02, 636kB/s]
+ 10%|9         | 3.99M/41.5M [00:10&lt;01:02, 628kB/s]
+ 10%|9         | 4.11M/41.5M [00:11&lt;01:01, 635kB/s]
+ 10%|#         | 4.23M/41.5M [00:11&lt;01:01, 641kB/s]
+ 10%|#         | 4.34M/41.5M [00:11&lt;01:01, 631kB/s]
+ 11%|#         | 4.45M/41.5M [00:11&lt;01:00, 638kB/s]
+ 11%|#1        | 4.57M/41.5M [00:11&lt;01:00, 642kB/s]
+ 11%|#1        | 4.69M/41.5M [00:12&lt;00:59, 645kB/s]
+ 12%|#1        | 4.80M/41.5M [00:12&lt;00:59, 648kB/s]
+ 12%|#1        | 4.92M/41.5M [00:12&lt;00:59, 649kB/s]
+ 12%|#2        | 5.04M/41.5M [00:12&lt;00:58, 650kB/s]
+ 12%|#2        | 5.16M/41.5M [00:12&lt;00:58, 651kB/s]
+ 13%|#2        | 5.28M/41.5M [00:12&lt;00:57, 665kB/s]
+ 13%|#3        | 5.41M/41.5M [00:13&lt;00:56, 674kB/s]
+ 13%|#3        | 5.53M/41.5M [00:13&lt;00:55, 681kB/s]
+ 14%|#3        | 5.66M/41.5M [00:13&lt;00:53, 698kB/s]
+ 14%|#3        | 5.80M/41.5M [00:13&lt;00:52, 711kB/s]
+ 14%|#4        | 5.94M/41.5M [00:13&lt;00:50, 732kB/s]
+ 15%|#4        | 6.08M/41.5M [00:14&lt;00:49, 748kB/s]
+ 15%|#4        | 6.22M/41.5M [00:14&lt;00:48, 758kB/s]
+ 15%|#5        | 6.37M/41.5M [00:14&lt;00:47, 779kB/s]
+ 16%|#5        | 6.52M/41.5M [00:14&lt;00:45, 806kB/s]
+ 16%|#6        | 6.69M/41.5M [00:14&lt;00:43, 839kB/s]
+ 17%|#6        | 6.85M/41.5M [00:15&lt;00:42, 861kB/s]
+ 17%|#6        | 7.03M/41.5M [00:15&lt;00:40, 903kB/s]
+ 17%|#7        | 7.21M/41.5M [00:15&lt;00:38, 932kB/s]
+ 18%|#7        | 7.41M/41.5M [00:15&lt;00:36, 979kB/s]
+ 18%|#8        | 7.60M/41.5M [00:15&lt;00:35, 1.01MB/s]
+ 19%|#8        | 7.81M/41.5M [00:16&lt;00:33, 1.06MB/s]
+ 19%|#9        | 8.02M/41.5M [00:16&lt;00:32, 1.10MB/s]
+ 20%|#9        | 8.26M/41.5M [00:16&lt;00:30, 1.16MB/s]
+ 20%|##        | 8.49M/41.5M [00:16&lt;00:28, 1.20MB/s]
+ 21%|##1       | 8.74M/41.5M [00:16&lt;00:27, 1.26MB/s]
+ 22%|##1       | 9.01M/41.5M [00:16&lt;00:25, 1.33MB/s]
+ 22%|##2       | 9.28M/41.5M [00:17&lt;00:24, 1.38MB/s]
+ 23%|##3       | 9.57M/41.5M [00:17&lt;00:23, 1.45MB/s]
+ 24%|##3       | 9.87M/41.5M [00:17&lt;00:21, 1.51MB/s]
+ 25%|##4       | 10.2M/41.5M [00:17&lt;00:20, 1.58MB/s]
+ 25%|##5       | 10.5M/41.5M [00:17&lt;00:19, 1.67MB/s]
+ 26%|##6       | 10.9M/41.5M [00:18&lt;00:18, 1.77MB/s]
+ 27%|##7       | 11.2M/41.5M [00:18&lt;00:17, 1.85MB/s]
+ 28%|##8       | 11.6M/41.5M [00:18&lt;00:16, 1.95MB/s]
+ 29%|##9       | 12.0M/41.5M [00:18&lt;00:15, 2.06MB/s]
+ 30%|###       | 12.5M/41.5M [00:18&lt;00:14, 2.17MB/s]
+ 31%|###1      | 12.9M/41.5M [00:19&lt;00:13, 2.28MB/s]
+ 32%|###2      | 13.4M/41.5M [00:19&lt;00:12, 2.39MB/s]
+ 34%|###3      | 13.9M/41.5M [00:19&lt;00:11, 2.51MB/s]
+ 35%|###4      | 14.4M/41.5M [00:19&lt;00:10, 2.62MB/s]
+ 36%|###6      | 15.0M/41.5M [00:19&lt;00:10, 2.76MB/s]
+ 38%|###7      | 15.6M/41.5M [00:19&lt;00:09, 2.90MB/s]
+ 39%|###8      | 16.2M/41.5M [00:20&lt;00:08, 3.06MB/s]
+ 41%|####      | 16.8M/41.5M [00:20&lt;00:08, 3.21MB/s]
+ 42%|####2     | 17.5M/41.5M [00:20&lt;00:07, 3.37MB/s]
+ 44%|####3     | 18.2M/41.5M [00:20&lt;00:06, 3.53MB/s]
+ 46%|####5     | 18.9M/41.5M [00:20&lt;00:06, 3.70MB/s]
+ 47%|####7     | 19.7M/41.5M [00:21&lt;00:05, 3.87MB/s]
+ 49%|####9     | 20.5M/41.5M [00:21&lt;00:04, 4.42MB/s]
+ 51%|#####1    | 21.2M/41.5M [00:21&lt;00:04, 5.06MB/s]
+ 52%|#####2    | 21.7M/41.5M [00:21&lt;00:04, 4.66MB/s]
+ 54%|#####3    | 22.2M/41.5M [00:21&lt;00:05, 3.98MB/s]
+ 56%|#####5    | 23.0M/41.5M [00:21&lt;00:04, 4.70MB/s]
+ 58%|#####7    | 23.9M/41.5M [00:21&lt;00:03, 5.57MB/s]
+ 59%|#####8    | 24.5M/41.5M [00:22&lt;00:03, 5.12MB/s]
+ 60%|######    | 25.0M/41.5M [00:22&lt;00:03, 4.91MB/s]
+ 62%|######2   | 25.9M/41.5M [00:22&lt;00:02, 6.01MB/s]
+ 64%|######3   | 26.5M/41.5M [00:22&lt;00:02, 5.46MB/s]
+ 65%|######5   | 27.1M/41.5M [00:22&lt;00:02, 5.30MB/s]
+ 68%|######7   | 28.1M/41.5M [00:22&lt;00:02, 6.58MB/s]
+ 69%|######9   | 28.8M/41.5M [00:22&lt;00:02, 5.94MB/s]
+ 71%|#######   | 29.4M/41.5M [00:22&lt;00:02, 5.76MB/s]
+ 74%|#######3  | 30.5M/41.5M [00:23&lt;00:01, 7.16MB/s]
+ 75%|#######5  | 31.3M/41.5M [00:23&lt;00:01, 6.45MB/s]
+ 77%|#######7  | 32.0M/41.5M [00:23&lt;00:01, 6.23MB/s]
+ 80%|#######9  | 33.1M/41.5M [00:23&lt;00:01, 7.76MB/s]
+ 82%|########1 | 33.9M/41.5M [00:23&lt;00:01, 6.96MB/s]
+ 84%|########3 | 34.7M/41.5M [00:23&lt;00:01, 6.73MB/s]
+ 87%|########6 | 35.9M/41.5M [00:23&lt;00:00, 8.30MB/s]
+ 89%|########8 | 36.8M/41.5M [00:23&lt;00:00, 7.44MB/s]
+ 90%|######### | 37.5M/41.5M [00:24&lt;00:00, 7.20MB/s]
+ 94%|#########3| 38.9M/41.5M [00:24&lt;00:00, 8.84MB/s]
+ 96%|#########5| 39.8M/41.5M [00:24&lt;00:00, 7.91MB/s]
+ 98%|#########7| 40.6M/41.5M [00:24&lt;00:00, 6.69MB/s]
+100%|##########| 41.5M/41.5M [00:24&lt;00:00, 1.78MB/s]
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/compile_models/from_paddle.html b/docs/how_to/compile_models/from_paddle.html
index 359517caa..3fbc8a5a5 100644
--- a/docs/how_to/compile_models/from_paddle.html
+++ b/docs/how_to/compile_models/from_paddle.html
@@ -488,7 +488,7 @@ A quick solution is</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>TVM prediction top-1 id: 282, class name:  282: &#39;tiger cat&#39;,
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  7.006 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  7.747 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-paddle-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/16269b77359771348d507395692524cf/from_paddle.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_paddle.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/from_pytorch.html b/docs/how_to/compile_models/from_pytorch.html
index 8c131dbdf..12766057a 100644
--- a/docs/how_to/compile_models/from_pytorch.html
+++ b/docs/how_to/compile_models/from_pytorch.html
@@ -409,16 +409,20 @@ be unstable.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: &quot;https://download.pytorch.org/models/resnet18-f37072fd.pth&quot; to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
 
   0%|          | 0.00/44.7M [00:00&lt;?, ?B/s]
-  6%|5         | 2.66M/44.7M [00:00&lt;00:01, 27.8MB/s]
- 12%|#2        | 5.52M/44.7M [00:00&lt;00:01, 28.9MB/s]
- 23%|##2       | 10.3M/44.7M [00:00&lt;00:00, 38.3MB/s]
- 35%|###4      | 15.5M/44.7M [00:00&lt;00:00, 44.5MB/s]
- 45%|####5     | 20.2M/44.7M [00:00&lt;00:00, 46.4MB/s]
- 58%|#####7    | 25.8M/44.7M [00:00&lt;00:00, 49.5MB/s]
- 70%|######9   | 31.1M/44.7M [00:00&lt;00:00, 51.7MB/s]
- 82%|########1 | 36.5M/44.7M [00:00&lt;00:00, 53.2MB/s]
- 94%|#########3| 41.9M/44.7M [00:00&lt;00:00, 54.1MB/s]
-100%|##########| 44.7M/44.7M [00:00&lt;00:00, 49.1MB/s]
+  7%|7         | 3.23M/44.7M [00:00&lt;00:01, 33.9MB/s]
+ 14%|#4        | 6.47M/44.7M [00:00&lt;00:01, 33.3MB/s]
+ 22%|##1       | 9.75M/44.7M [00:00&lt;00:01, 33.7MB/s]
+ 29%|##9       | 13.0M/44.7M [00:00&lt;00:01, 31.9MB/s]
+ 38%|###8      | 17.1M/44.7M [00:00&lt;00:00, 35.3MB/s]
+ 46%|####5     | 20.5M/44.7M [00:00&lt;00:00, 34.8MB/s]
+ 53%|#####3    | 23.8M/44.7M [00:00&lt;00:01, 21.6MB/s]
+ 59%|#####9    | 26.5M/44.7M [00:01&lt;00:00, 21.8MB/s]
+ 67%|######6   | 29.8M/44.7M [00:01&lt;00:00, 24.9MB/s]
+ 75%|#######4  | 33.3M/44.7M [00:01&lt;00:00, 27.8MB/s]
+ 83%|########2 | 37.0M/44.7M [00:01&lt;00:00, 29.9MB/s]
+ 90%|########9 | 40.1M/44.7M [00:01&lt;00:00, 28.3MB/s]
+ 96%|#########6| 43.0M/44.7M [00:01&lt;00:00, 28.8MB/s]
+100%|##########| 44.7M/44.7M [00:01&lt;00:00, 28.6MB/s]
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/compile_models/from_tensorflow.html b/docs/how_to/compile_models/from_tensorflow.html
index e3f9d3502..6a1dcc90a 100644
--- a/docs/how_to/compile_models/from_tensorflow.html
+++ b/docs/how_to/compile_models/from_tensorflow.html
@@ -631,7 +631,7 @@ banana (score = 0.00022)
 desk (score = 0.00019)
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  4.119 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  1.170 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-tensorflow-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7f1d3d1b878694c201c614c807cdebc8/from_tensorflow.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_tensorflow.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/sg_execution_times.html b/docs/how_to/compile_models/sg_execution_times.html
index c73c688af..f5be286bf 100644
--- a/docs/how_to/compile_models/sg_execution_times.html
+++ b/docs/how_to/compile_models/sg_execution_times.html
@@ -322,7 +322,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-compile-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>05:47.852</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
+<p><strong>05:55.609</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 81%" />
@@ -331,43 +331,43 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_paddle.html#sphx-glr-how-to-compile-models-from-paddle-py"><span class="std std-ref">Compile PaddlePaddle Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_paddle.py</span></code>)</p></td>
-<td><p>01:07.006</p></td>
+<td><p>01:07.747</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_tensorflow.html#sphx-glr-how-to-compile-models-from-tensorflow-py"><span class="std std-ref">Compile Tensorflow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tensorflow.py</span></code>)</p></td>
-<td><p>01:04.119</p></td>
+<td><p>01:01.170</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_darknet.html#sphx-glr-how-to-compile-models-from-darknet-py"><span class="std std-ref">Compile YOLO-V2 and YOLO-V3 in DarkNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_darknet.py</span></code>)</p></td>
-<td><p>00:56.601</p></td>
+<td><p>00:57.992</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_oneflow.html#sphx-glr-how-to-compile-models-from-oneflow-py"><span class="std std-ref">Compile OneFlow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_oneflow.py</span></code>)</p></td>
-<td><p>00:40.653</p></td>
+<td><p>00:50.009</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_tflite.html#sphx-glr-how-to-compile-models-from-tflite-py"><span class="std std-ref">Compile TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tflite.py</span></code>)</p></td>
-<td><p>00:38.824</p></td>
+<td><p>00:36.725</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></td>
-<td><p>00:22.452</p></td>
+<td><p>00:22.695</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_coreml.html#sphx-glr-how-to-compile-models-from-coreml-py"><span class="std std-ref">Compile CoreML Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_coreml.py</span></code>)</p></td>
-<td><p>00:21.643</p></td>
+<td><p>00:21.516</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_pytorch.html#sphx-glr-how-to-compile-models-from-pytorch-py"><span class="std std-ref">Compile PyTorch Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_pytorch.py</span></code>)</p></td>
-<td><p>00:19.948</p></td>
+<td><p>00:20.885</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_keras.html#sphx-glr-how-to-compile-models-from-keras-py"><span class="std std-ref">Compile Keras Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_keras.py</span></code>)</p></td>
-<td><p>00:14.254</p></td>
+<td><p>00:14.522</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_onnx.html#sphx-glr-how-to-compile-models-from-onnx-py"><span class="std std-ref">Compile ONNX Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_onnx.py</span></code>)</p></td>
-<td><p>00:02.351</p></td>
+<td><p>00:02.349</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/deploy_models/deploy_model_on_android.html b/docs/how_to/deploy_models/deploy_model_on_android.html
index b838d0eb4..4790bbf8d 100644
--- a/docs/how_to/deploy_models/deploy_model_on_android.html
+++ b/docs/how_to/deploy_models/deploy_model_on_android.html
@@ -648,7 +648,7 @@ to the remote android device.</p>
 Evaluate inference time cost...
 Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  16.0557      15.8693      16.5941      15.7658       0.3119
+  15.9465      15.9620      16.0734      15.7781       0.0909
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
index 2cffae506..c9ce36593 100644
--- a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
+++ b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
@@ -431,17 +431,15 @@ be unstable.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: &quot;https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth&quot; to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
 
   0%|          | 0.00/170M [00:00&lt;?, ?B/s]
-  6%|5         | 9.44M/170M [00:00&lt;00:01, 98.6MB/s]
- 15%|#5        | 25.9M/170M [00:00&lt;00:01, 142MB/s]
- 25%|##5       | 42.5M/170M [00:00&lt;00:00, 156MB/s]
- 35%|###4      | 58.8M/170M [00:00&lt;00:00, 162MB/s]
- 44%|####4     | 75.3M/170M [00:00&lt;00:00, 166MB/s]
- 54%|#####4    | 91.9M/170M [00:00&lt;00:00, 169MB/s]
- 64%|######3   | 108M/170M [00:00&lt;00:00, 170MB/s]
- 74%|#######3  | 125M/170M [00:00&lt;00:00, 171MB/s]
- 83%|########3 | 141M/170M [00:00&lt;00:00, 171MB/s]
- 93%|#########2| 158M/170M [00:01&lt;00:00, 172MB/s]
-100%|##########| 170M/170M [00:01&lt;00:00, 166MB/s]
+  8%|8         | 14.2M/170M [00:00&lt;00:01, 148MB/s]
+ 21%|##        | 35.6M/170M [00:00&lt;00:00, 193MB/s]
+ 34%|###3      | 56.9M/170M [00:00&lt;00:00, 207MB/s]
+ 46%|####6     | 78.3M/170M [00:00&lt;00:00, 213MB/s]
+ 59%|#####8    | 99.7M/170M [00:00&lt;00:00, 217MB/s]
+ 71%|#######1  | 121M/170M [00:00&lt;00:00, 218MB/s]
+ 84%|########3 | 142M/170M [00:00&lt;00:00, 219MB/s]
+ 96%|#########6| 163M/170M [00:00&lt;00:00, 221MB/s]
+100%|##########| 170M/170M [00:00&lt;00:00, 214MB/s]
 /usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:3878: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
   for i in range(dim)
 /usr/local/lib/python3.7/dist-packages/torchvision/models/detection/anchor_utils.py:127: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the &#39;trunc&#39; function NOT &#39;floor&#39;). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode=&#39;trunc&#39;), or for actual floor division, use torch.div(a, b, rounding_mode=&#39;floor&#39;).
@@ -536,7 +534,7 @@ torchvision rcnn models.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Get 9 valid boxes
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  51.510 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  55.836 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-object-detection-pytorch-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7795da4b258c8feff986668b95ef57ad/deploy_object_detection_pytorch.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_object_detection_pytorch.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized.html b/docs/how_to/deploy_models/deploy_prequantized.html
index a831892ea..add09ffc8 100644
--- a/docs/how_to/deploy_models/deploy_prequantized.html
+++ b/docs/how_to/deploy_models/deploy_prequantized.html
@@ -472,7 +472,8 @@ training. Other models require a full post training calibration.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: &quot;https://download.pytorch.org/models/mobilenet_v2-b0353104.pth&quot; to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
 
   0%|          | 0.00/13.6M [00:00&lt;?, ?B/s]
-100%|##########| 13.6M/13.6M [00:00&lt;00:00, 153MB/s]
+ 68%|######7   | 9.20M/13.6M [00:00&lt;00:00, 95.6MB/s]
+100%|##########| 13.6M/13.6M [00:00&lt;00:00, 110MB/s]
 </pre></div>
 </div>
 </div>
@@ -561,7 +562,7 @@ output values are identical out of 1000 outputs from mobilenet v2.</p>
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  90.5119      90.4735      93.1524      90.1430       0.3397
+  90.3645      90.2495      96.8307      90.1843       0.6696
 </pre></div>
 </div>
 <div class="admonition note">
@@ -600,7 +601,7 @@ This includes support for the VNNI 8 bit dot product instruction (CascadeLake or
 <div class="section" id="deploy-a-quantized-tflite-model">
 <h2>Deploy a quantized TFLite Model<a class="headerlink" href="#deploy-a-quantized-tflite-model" title="Permalink to this headline">¶</a></h2>
 <p>TODO</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  5.729 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  7.076 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/fb8217c13f4351224c6cf3aacf1a87fc/deploy_prequantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized_tflite.html b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
index 959b110fe..f299b9d09 100644
--- a/docs/how_to/deploy_models/deploy_prequantized_tflite.html
+++ b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
@@ -565,7 +565,7 @@ TFLite Top-5 labels: [387 102 386 341 349]
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  118.8315     118.7852     125.2959     117.9721      0.7412
+  119.8498     119.8288     121.2662     119.0162      0.3249
 </pre></div>
 </div>
 <div class="admonition note">
@@ -593,7 +593,7 @@ network for ARM CPU</span></a>.</p></li>
 </ul>
 </div></blockquote>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  9.625 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  3.701 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-tflite-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/56691c7a27d45da61d112276334640d3/deploy_prequantized_tflite.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized_tflite.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_quantized.html b/docs/how_to/deploy_models/deploy_quantized.html
index 244f0c0b3..e30c094dc 100644
--- a/docs/how_to/deploy_models/deploy_quantized.html
+++ b/docs/how_to/deploy_models/deploy_quantized.html
@@ -504,7 +504,7 @@ for calibration. But the accuracy might be impacted.</p>
   DeprecationWarning,
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  14.689 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  32.299 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-quantized-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7810ecf51bfc05f7d5e8a400ac3e815d/deploy_quantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_quantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
index c583e9586..99ebcf575 100644
--- a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
+++ b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
@@ -436,24 +436,23 @@ to your device.</p>
 Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
 
   0%|          | 0/132723 [00:00&lt;?, ?KB/s]
-  2%|1         | 2047/132723 [00:00&lt;00:06, 20406.67KB/s]
-  5%|4         | 6578/132723 [00:00&lt;00:03, 35035.25KB/s]
- 11%|#         | 14180/132723 [00:00&lt;00:02, 53734.70KB/s]
- 17%|#6        | 22154/132723 [00:00&lt;00:01, 63994.50KB/s]
- 23%|##2       | 30073/132723 [00:00&lt;00:01, 69470.33KB/s]
- 29%|##8       | 38102/132723 [00:00&lt;00:01, 73147.09KB/s]
- 35%|###4      | 46097/132723 [00:00&lt;00:01, 75365.68KB/s]
- 41%|####      | 54077/132723 [00:00&lt;00:01, 76774.94KB/s]
- 47%|####6     | 62103/132723 [00:00&lt;00:00, 77862.69KB/s]
- 53%|#####2    | 70157/132723 [00:01&lt;00:00, 78685.87KB/s]
- 59%|#####8    | 78228/132723 [00:01&lt;00:00, 79301.38KB/s]
- 65%|######5   | 86317/132723 [00:01&lt;00:00, 79781.27KB/s]
- 71%|#######1  | 94447/132723 [00:01&lt;00:00, 80239.53KB/s]
- 77%|#######7  | 102596/132723 [00:01&lt;00:00, 80616.06KB/s]
- 83%|########3 | 110731/132723 [00:01&lt;00:00, 80836.08KB/s]
- 90%|########9 | 118922/132723 [00:01&lt;00:00, 81158.05KB/s]
- 96%|#########5| 127088/132723 [00:01&lt;00:00, 81306.03KB/s]
-100%|##########| 132723/132723 [00:01&lt;00:00, 74969.03KB/s]
+  2%|2         | 3124/132723 [00:00&lt;00:04, 31238.61KB/s]
+  8%|7         | 10039/132723 [00:00&lt;00:02, 53537.03KB/s]
+ 14%|#3        | 18569/132723 [00:00&lt;00:01, 68032.11KB/s]
+ 21%|##        | 27284/132723 [00:00&lt;00:01, 75576.21KB/s]
+ 27%|##7       | 36034/132723 [00:00&lt;00:01, 79872.26KB/s]
+ 34%|###3      | 44669/132723 [00:00&lt;00:01, 82069.42KB/s]
+ 40%|####      | 53394/132723 [00:00&lt;00:00, 83759.28KB/s]
+ 47%|####6     | 62119/132723 [00:00&lt;00:00, 84868.29KB/s]
+ 53%|#####3    | 70833/132723 [00:00&lt;00:00, 85572.69KB/s]
+ 60%|#####9    | 79608/132723 [00:01&lt;00:00, 86241.46KB/s]
+ 67%|######6   | 88369/132723 [00:01&lt;00:00, 86655.55KB/s]
+ 73%|#######3  | 97076/132723 [00:01&lt;00:00, 86778.49KB/s]
+ 80%|#######9  | 105828/132723 [00:01&lt;00:00, 87001.18KB/s]
+ 86%|########6 | 114574/132723 [00:01&lt;00:00, 87137.46KB/s]
+ 93%|#########2| 123289/132723 [00:01&lt;00:00, 87138.26KB/s]
+ 99%|#########9| 132003/132723 [00:01&lt;00:00, 87121.86KB/s]
+100%|##########| 132723/132723 [00:01&lt;00:00, 82401.45KB/s]
 </pre></div>
 </div>
 <p>Create TVM runtime and do inference
@@ -496,7 +495,7 @@ Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from h
 <span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
 </pre></div>
 </div>
-<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  15.415 seconds)</p>
+<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  17.409 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-ssd-gluoncv-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/cccb17d28e5e8b2e94ea8cd5ec59f6ed/deploy_ssd_gluoncv.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_ssd_gluoncv.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/sg_execution_times.html b/docs/how_to/deploy_models/sg_execution_times.html
index 586333aba..ead859a5d 100644
--- a/docs/how_to/deploy_models/sg_execution_times.html
+++ b/docs/how_to/deploy_models/sg_execution_times.html
@@ -322,7 +322,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-deploy-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>10:27.491</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
+<p><strong>10:47.671</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 86%" />
@@ -331,31 +331,31 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_object_detection_pytorch.html#sphx-glr-how-to-deploy-models-deploy-object-detection-pytorch-py"><span class="std std-ref">Compile PyTorch Object Detection Models</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_object_detection_pytorch.py</span></code>)</p></td>
-<td><p>02:51.510</p></td>
+<td><p>02:55.836</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_ssd_gluoncv.html#sphx-glr-how-to-deploy-models-deploy-ssd-gluoncv-py"><span class="std std-ref">Deploy Single Shot Multibox Detector(SSD) model</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_ssd_gluoncv.py</span></code>)</p></td>
-<td><p>02:15.415</p></td>
+<td><p>02:17.409</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized_tflite.html#sphx-glr-how-to-deploy-models-deploy-prequantized-tflite-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite)</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized_tflite.py</span></code>)</p></td>
-<td><p>02:09.625</p></td>
+<td><p>02:03.701</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_quantized.html#sphx-glr-how-to-deploy-models-deploy-quantized-py"><span class="std std-ref">Deploy a Quantized Model on Cuda</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_quantized.py</span></code>)</p></td>
-<td><p>01:14.689</p></td>
+<td><p>01:32.299</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized.html#sphx-glr-how-to-deploy-models-deploy-prequantized-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized.py</span></code>)</p></td>
-<td><p>01:05.729</p></td>
+<td><p>01:07.076</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_android.html#sphx-glr-how-to-deploy-models-deploy-model-on-android-py"><span class="std std-ref">Deploy the Pretrained Model on Android</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_android.py</span></code>)</p></td>
-<td><p>00:28.241</p></td>
+<td><p>00:28.877</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_model_on_rasp.html#sphx-glr-how-to-deploy-models-deploy-model-on-rasp-py"><span class="std std-ref">Deploy the Pretrained Model on Raspberry Pi</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_rasp.py</span></code>)</p></td>
-<td><p>00:22.277</p></td>
+<td><p>00:22.467</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_sparse.html#sphx-glr-how-to-deploy-models-deploy-sparse-py"><span class="std std-ref">Deploy a Hugging Face Pruned Model on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_sparse.py</span></code>)</p></td>
diff --git a/docs/how_to/extend_tvm/bring_your_own_datatypes.html b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
index 842dd5a60..141021fdf 100644
--- a/docs/how_to/extend_tvm/bring_your_own_datatypes.html
+++ b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
@@ -604,7 +604,7 @@ In this alpha state of the Bring Your Own Datatypes framework, we have not imple
 <span class="n">module</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#dict" title="builtins.dict" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">params</span></a> <span class="o">=</span> <span class="n">get_mobilenet</span><span class="p">()</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip28234e9f-61c7-4f97-854a-564afd9983fc from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip8ca90524-9141-4870-9ca3-1f55665bdfc5 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
 </pre></div>
 </div>
 <p>It’s easy to execute MobileNet with native TVM:</p>
diff --git a/docs/how_to/extend_tvm/sg_execution_times.html b/docs/how_to/extend_tvm/sg_execution_times.html
index ca5877a1d..bd634d8cc 100644
--- a/docs/how_to/extend_tvm/sg_execution_times.html
+++ b/docs/how_to/extend_tvm/sg_execution_times.html
@@ -322,7 +322,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-extend-tvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:41.355</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
+<p><strong>00:40.008</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 84%" />
@@ -331,15 +331,15 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="bring_your_own_datatypes.html#sphx-glr-how-to-extend-tvm-bring-your-own-datatypes-py"><span class="std std-ref">Bring Your Own Datatypes to TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">bring_your_own_datatypes.py</span></code>)</p></td>
-<td><p>00:38.244</p></td>
+<td><p>00:36.865</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="use_pass_instrument.html#sphx-glr-how-to-extend-tvm-use-pass-instrument-py"><span class="std std-ref">How to Use TVM Pass Instrument</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_instrument.py</span></code>)</p></td>
-<td><p>00:02.187</p></td>
+<td><p>00:02.214</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="use_pass_infra.html#sphx-glr-how-to-extend-tvm-use-pass-infra-py"><span class="std std-ref">How to Use TVM Pass Infra</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_infra.py</span></code>)</p></td>
-<td><p>00:00.917</p></td>
+<td><p>00:00.923</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="low_level_custom_pass.html#sphx-glr-how-to-extend-tvm-low-level-custom-pass-py"><span class="std std-ref">Writing a Customized Pass</span></a> (<code class="docutils literal notranslate"><span class="pre">low_level_custom_pass.py</span></code>)</p></td>
diff --git a/docs/how_to/extend_tvm/use_pass_instrument.html b/docs/how_to/extend_tvm/use_pass_instrument.html
index 60e9646cf..86c6369e5 100644
--- a/docs/how_to/extend_tvm/use_pass_instrument.html
+++ b/docs/how_to/extend_tvm/use_pass_instrument.html
@@ -507,10 +507,10 @@ profile the execution time of each passes.</p>
 </pre></div>
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 6848us [6848us] (45.94%; 45.94%)
-FoldScaleAxis: 8058us [6us] (54.06%; 54.06%)
-        FoldConstant: 8051us [1612us] (54.02%; 99.92%)
-                InferType: 6439us [6439us] (43.20%; 79.98%)
+InferType: 6849us [6849us] (46.58%; 46.58%)
+FoldScaleAxis: 7855us [6us] (53.42%; 53.42%)
+        FoldConstant: 7849us [1571us] (53.38%; 99.92%)
+                InferType: 6278us [6278us] (42.70%; 79.99%)
 </pre></div>
 </div>
 </div>
@@ -532,10 +532,10 @@ Refer to following sections and <a class="reference internal" href="../../refere
 </pre></div>
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 6432us [6432us] (44.63%; 44.63%)
-FoldScaleAxis: 7981us [6us] (55.37%; 55.37%)
-        FoldConstant: 7975us [1667us] (55.33%; 99.92%)
-                InferType: 6308us [6308us] (43.77%; 79.10%)
+InferType: 6312us [6312us] (44.55%; 44.55%)
+FoldScaleAxis: 7857us [5us] (55.45%; 55.45%)
+        FoldConstant: 7852us [1593us] (55.41%; 99.93%)
+                InferType: 6259us [6259us] (44.17%; 79.72%)
 </pre></div>
 </div>
 <p>Register empty list to clear existing instruments.</p>
diff --git a/docs/how_to/optimize_operators/opt_conv_cuda.html b/docs/how_to/optimize_operators/opt_conv_cuda.html
index 31ad7a377..bfaf6e4c7 100644
--- a/docs/how_to/optimize_operators/opt_conv_cuda.html
+++ b/docs/how_to/optimize_operators/opt_conv_cuda.html
@@ -556,7 +556,7 @@ latency of convolution.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Convolution: </span><span class="si">%f</span><span class="s2"> ms&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">b</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">*</span> <span cl [...]
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 54.167125 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 43.185978 ms
 </pre></div>
 </div>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-optimize-operators-opt-conv-cuda-py">
diff --git a/docs/how_to/optimize_operators/opt_conv_tensorcore.html b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
index ff4289265..4e9695122 100644
--- a/docs/how_to/optimize_operators/opt_conv_tensorcore.html
+++ b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
@@ -898,7 +898,7 @@ be able to run on our build server</p>
     <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;conv2d with tensor core: </span><span class="si">%f</span><span class="s2"> ms&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">* [...]
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 6.873242 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 11.878459 ms
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/optimize_operators/opt_gemm.html b/docs/how_to/optimize_operators/opt_gemm.html
index 440b044a2..888820480 100644
--- a/docs/how_to/optimize_operators/opt_gemm.html
+++ b/docs/how_to/optimize_operators/opt_gemm.html
@@ -453,8 +453,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Baseline: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018172
-Baseline: 3.395604
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018950
+Baseline: 3.338291
 </pre></div>
 </div>
 <p>In TVM, we can always inspect lower level IR to debug or optimize our schedule.
@@ -514,7 +514,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt1: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.300426
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.308090
 </pre></div>
 </div>
 <p>Here is the generated IR after blocking.</p>
@@ -581,7 +581,7 @@ vastly.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt2: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.332632
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.331753
 </pre></div>
 </div>
 <p>Here is the generated IR after vectorization.</p>
@@ -642,7 +642,7 @@ the access pattern for A matrix is more cache friendly.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt3: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.117883
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.121683
 </pre></div>
 </div>
 <p>Here is the generated IR after loop permutation.</p>
@@ -725,7 +725,7 @@ flattening.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt4: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.110434
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.111281
 </pre></div>
 </div>
 <p>Here is the generated IR after array packing.</p>
@@ -811,7 +811,7 @@ write to C when all the block results are ready.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt5: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.111427
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.111284
 </pre></div>
 </div>
 <p>Here is the generated IR after blocking.</p>
@@ -901,7 +901,7 @@ write to C when all the block results are ready.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt6: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">opt6_time</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.145331
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.145534
 </pre></div>
 </div>
 <p>Here is the generated IR after parallelization.</p>
diff --git a/docs/how_to/optimize_operators/sg_execution_times.html b/docs/how_to/optimize_operators/sg_execution_times.html
index 872171522..f621f57ec 100644
--- a/docs/how_to/optimize_operators/sg_execution_times.html
+++ b/docs/how_to/optimize_operators/sg_execution_times.html
@@ -322,7 +322,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-optimize-operators-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:34.322</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
+<p><strong>00:34.419</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 83%" />
@@ -331,15 +331,15 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="opt_gemm.html#sphx-glr-how-to-optimize-operators-opt-gemm-py"><span class="std std-ref">How to optimize GEMM on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_gemm.py</span></code>)</p></td>
-<td><p>00:32.069</p></td>
+<td><p>00:32.140</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="opt_conv_tensorcore.html#sphx-glr-how-to-optimize-operators-opt-conv-tensorcore-py"><span class="std std-ref">How to optimize convolution using TensorCores</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_tensorcore.py</span></code>)</p></td>
-<td><p>00:01.234</p></td>
+<td><p>00:01.283</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="opt_conv_cuda.html#sphx-glr-how-to-optimize-operators-opt-conv-cuda-py"><span class="std std-ref">How to optimize convolution on GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_cuda.py</span></code>)</p></td>
-<td><p>00:01.020</p></td>
+<td><p>00:00.996</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
index 149a24b1e..5cc0ef3d3 100644
--- a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
+++ b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
@@ -322,7 +322,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-tune-with-autoscheduler-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>05:12.442</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
+<p><strong>05:12.594</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 85%" />
@@ -331,27 +331,27 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_layer_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py"><span class="std std-ref">Auto-scheduling a Convolution Layer for GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_layer_cuda.py</span></code>)</p></td>
-<td><p>02:35.530</p></td>
+<td><p>02:34.321</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tune_network_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-x86-py"><span class="std std-ref">Auto-scheduling a Neural Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_x86.py</span></code>)</p></td>
-<td><p>01:19.801</p></td>
+<td><p>01:20.685</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_network_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-cuda-py"><span class="std std-ref">Auto-scheduling a Neural Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_cuda.py</span></code>)</p></td>
-<td><p>00:42.629</p></td>
+<td><p>00:43.097</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tune_sparse_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-sparse-x86-py"><span class="std std-ref">Auto-scheduling Sparse Matrix Multiplication on CPU with Custom Sketch Rule</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_sparse_x86.py</span></code>)</p></td>
-<td><p>00:17.693</p></td>
+<td><p>00:17.412</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_network_mali.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-mali-py"><span class="std std-ref">Auto-scheduling a Neural Network for mali GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_mali.py</span></code>)</p></td>
-<td><p>00:08.451</p></td>
+<td><p>00:08.696</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tune_network_arm.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-arm-py"><span class="std std-ref">Auto-scheduling a Neural Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_arm.py</span></code>)</p></td>
-<td><p>00:08.338</p></td>
+<td><p>00:08.384</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
index 5f628f38c..07fe78b8a 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
@@ -486,72 +486,128 @@ cooperative fetching, unrolling and operator fusion.</p>
              compute: Buffer(compute_2: Pointer(float32), float32, [25088], [])}
   buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute}
   preflattened_buffer_map = {data_1: data_3: Buffer(data_2, float32, [1, 512, 7, 7], []), kernel_1: kernel_3: Buffer(kernel_2, float32, [512, 512, 3, 3], []), bias_1: bias_3: Buffer(bias_2, float32, [1, 512, 1, 1], []), compute_1: compute_3: Buffer(compute_2, float32, [1, 512, 7, 7], [])} {
-  attr [IterVar(blockIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;blockIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-  allocate(conv2d_nchw: Pointer(local float32), float32, [2]), storage_scope = local;
-  allocate(pad_temp.shared: Pointer(shared float32), float32, [2016]), storage_scope = shared;
-  allocate(kernel.shared: Pointer(shared float32), float32, [768]), storage_scope = shared;
-  attr [IterVar(threadIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 196 {
-    conv2d_nchw_1: Buffer(conv2d_nchw, float32, [1], [], scope=&quot;local&quot;, align=4)[0] = 0f32
+  attr [IterVar(blockIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;blockIdx.x&quot;)] &quot;thread_extent&quot; = 112;
+  allocate(conv2d_nchw: Pointer(local float32), float32, [7]), storage_scope = local;
+  allocate(pad_temp.shared: Pointer(shared float32), float32, [144]), storage_scope = shared;
+  allocate(kernel.shared: Pointer(shared float32), float32, [1536]), storage_scope = shared;
+  attr [IterVar(threadIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32 {
+    conv2d_nchw_1: Buffer(conv2d_nchw, float32, [7], [], scope=&quot;local&quot;, align=16)[0] = 0f32
     conv2d_nchw_1[1] = 0f32
-    for (rc.outer.outer: int32, 0, 16) {
+    conv2d_nchw_1[2] = 0f32
+    conv2d_nchw_1[3] = 0f32
+    conv2d_nchw_1[4] = 0f32
+    conv2d_nchw_1[5] = 0f32
+    conv2d_nchw_1[6] = 0f32
+    for (rc.outer.outer: int32, 0, 32) {
       for (ry.outer.outer: int32, 0, 3) {
-        let cse_var_2: int32 = (rc.outer.outer*1568)
-        let cse_var_1: int32 = (ry.outer.outer*7)
+        let cse_var_4: int32 = (rc.outer.outer*784)
+        let cse_var_3: int32 = (ry.outer.outer*7)
+        let cse_var_2: int32 = (rc.outer.outer*144)
+        let cse_var_1: int32 = (ry.outer.outer*3)
          {
-          attr [IterVar(threadIdx.x_1: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 196;
-          pad_temp.shared_1: Buffer(pad_temp.shared, float32, [2016], [], scope=&quot;shared&quot;)[threadIdx.x_1] = @tir.if_then_else(((((1 &lt;= (floordiv(floormod(threadIdx.x_1, 63), 9) + ry.outer.outer)) &amp;&amp; ((floordiv(floormod(threadIdx.x_1, 63), 9) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 9))) &amp;&amp; (floormod(threadIdx.x_1, 9) &lt; 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 9)*7)) + cse_var_1) + floormod(threadIdx.x_1, 9)) - 8)], 0f [...]
-          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 196;
-          pad_temp.shared_1[(threadIdx.x_1 + 196)] = @tir.if_then_else(((((1 &lt;= (floordiv(floormod((threadIdx.x_1 + 196), 63), 9) + ry.outer.outer)) &amp;&amp; ((floordiv(floormod((threadIdx.x_1 + 196), 63), 9) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod((threadIdx.x_1 + 7), 9))) &amp;&amp; (floormod((threadIdx.x_1 + 7), 9) &lt; 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 196), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 7), 9)) - 8)], 0f32, dtype=float32)
-          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 196;
-          pad_temp.shared_1[(threadIdx.x_1 + 392)] = @tir.if_then_else(((((1 &lt;= (floordiv(floormod((threadIdx.x_1 + 392), 63), 9) + ry.outer.outer)) &amp;&amp; ((floordiv(floormod((threadIdx.x_1 + 392), 63), 9) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod((threadIdx.x_1 + 5), 9))) &amp;&amp; (floormod((threadIdx.x_1 + 5), 9) &lt; 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 392), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 5), 9)) - 8)], 0f32, dtype=float32)
-          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 196;
-          pad_temp.shared_1[(threadIdx.x_1 + 588)] = @tir.if_then_else(((((1 &lt;= (floordiv(floormod((threadIdx.x_1 + 588), 63), 9) + ry.outer.outer)) &amp;&amp; ((floordiv(floormod((threadIdx.x_1 + 588), 63), 9) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod((threadIdx.x_1 + 3), 9))) &amp;&amp; (floormod((threadIdx.x_1 + 3), 9) &lt; 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 588), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 3), 9)) - 8)], 0f32, dtype=float32)
-          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 196;
-          pad_temp.shared_1[(threadIdx.x_1 + 784)] = @tir.if_then_else(((((1 &lt;= (floordiv(floormod((threadIdx.x_1 + 784), 63), 9) + ry.outer.outer)) &amp;&amp; ((floordiv(floormod((threadIdx.x_1 + 784), 63), 9) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod((threadIdx.x_1 + 1), 9))) &amp;&amp; (floormod((threadIdx.x_1 + 1), 9) &lt; 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 784), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 1), 9)) - 8)], 0f32, dtype=float32)
-          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 196;
-          pad_temp.shared_1[(threadIdx.x_1 + 980)] = @tir.if_then_else(((((1 &lt;= (floordiv(floormod((threadIdx.x_1 + 980), 63), 9) + ry.outer.outer)) &amp;&amp; ((floordiv(floormod((threadIdx.x_1 + 980), 63), 9) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod((threadIdx.x_1 + 8), 9))) &amp;&amp; (floormod((threadIdx.x_1 + 8), 9) &lt; 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 980), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 8), 9)) - 8)], 0f32, dtype=float32)
-          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 196;
-          pad_temp.shared_1[(threadIdx.x_1 + 1176)] = @tir.if_then_else(((((1 &lt;= (floordiv(floormod((threadIdx.x_1 + 1176), 63), 9) + ry.outer.outer)) &amp;&amp; ((floordiv(floormod((threadIdx.x_1 + 1176), 63), 9) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod((threadIdx.x_1 + 6), 9))) &amp;&amp; (floormod((threadIdx.x_1 + 6), 9) &lt; 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1176), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 6), 9)) - 8)], 0f32, dtype=float32)
-          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 196;
-          pad_temp.shared_1[(threadIdx.x_1 + 1372)] = @tir.if_then_else(((((1 &lt;= (floordiv(floormod((threadIdx.x_1 + 1372), 63), 9) + ry.outer.outer)) &amp;&amp; ((floordiv(floormod((threadIdx.x_1 + 1372), 63), 9) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod((threadIdx.x_1 + 4), 9))) &amp;&amp; (floormod((threadIdx.x_1 + 4), 9) &lt; 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1372), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 4), 9)) - 8)], 0f32, dtype=float32)
-          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 196;
-          pad_temp.shared_1[(threadIdx.x_1 + 1568)] = @tir.if_then_else(((((1 &lt;= (floordiv(floormod((threadIdx.x_1 + 1568), 63), 9) + ry.outer.outer)) &amp;&amp; ((floordiv(floormod((threadIdx.x_1 + 1568), 63), 9) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod((threadIdx.x_1 + 2), 9))) &amp;&amp; (floormod((threadIdx.x_1 + 2), 9) &lt; 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1568), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 2), 9)) - 8)], 0f32, dtype=float32)
-          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 196;
-          pad_temp.shared_1[(threadIdx.x_1 + 1764)] = @tir.if_then_else(((((1 &lt;= (floordiv(floormod(threadIdx.x_1, 63), 9) + ry.outer.outer)) &amp;&amp; ((floordiv(floormod(threadIdx.x_1, 63), 9) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 9))) &amp;&amp; (floormod(threadIdx.x_1, 9) &lt; 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 9)*7)) + cse_var_1) + floormod(threadIdx.x_1, 9)) + 1364)], 0f32, dtype=float32)
-          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 196;
-          if @tir.likely((threadIdx.x_1 &lt; 56), dtype=bool) {
-            pad_temp.shared_1[(threadIdx.x_1 + 1960)] = @tir.if_then_else(((((1 &lt;= (floordiv(floormod((threadIdx.x_1 + 1960), 63), 9) + ry.outer.outer)) &amp;&amp; ((floordiv(floormod((threadIdx.x_1 + 1960), 63), 9) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod((threadIdx.x_1 + 7), 9))) &amp;&amp; (floormod((threadIdx.x_1 + 7), 9) &lt; 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1960), 9)*7)) + cse_var_1) + floormod((threadIdx.x_1 + 7), 9)) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32;
+          pad_temp.shared_1: Buffer(pad_temp.shared, float32, [144], [], scope=&quot;shared&quot;)[threadIdx.x_1] = @tir.if_then_else(((((1 &lt;= (ry.outer.outer + floormod(blockIdx.x, 7))) &amp;&amp; ((ry.outer.outer + floormod(blockIdx.x, 7)) &lt; 8)) &amp;&amp; (1 &lt;= floormod(threadIdx.x_1, 9))) &amp;&amp; (floormod(threadIdx.x_1, 9) &lt; 8)), data[(((((cse_var_4 + (floordiv(threadIdx.x_1, 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod(threadIdx.x_1, 9)) - 8)], 0f32 [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32;
+          pad_temp.shared_1[(threadIdx.x_1 + 32)] = @tir.if_then_else(((((1 &lt;= (ry.outer.outer + floormod(blockIdx.x, 7))) &amp;&amp; ((ry.outer.outer + floormod(blockIdx.x, 7)) &lt; 8)) &amp;&amp; (1 &lt;= floormod((threadIdx.x_1 + 5), 9))) &amp;&amp; (floormod((threadIdx.x_1 + 5), 9) &lt; 8)), data[(((((cse_var_4 + (floordiv((threadIdx.x_1 + 32), 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1 + 5), 9)) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32;
+          pad_temp.shared_1[(threadIdx.x_1 + 64)] = @tir.if_then_else(((((1 &lt;= (ry.outer.outer + floormod(blockIdx.x, 7))) &amp;&amp; ((ry.outer.outer + floormod(blockIdx.x, 7)) &lt; 8)) &amp;&amp; (1 &lt;= floormod((threadIdx.x_1 + 1), 9))) &amp;&amp; (floormod((threadIdx.x_1 + 1), 9) &lt; 8)), data[(((((cse_var_4 + (floordiv((threadIdx.x_1 + 64), 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1 + 1), 9)) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32;
+          pad_temp.shared_1[(threadIdx.x_1 + 96)] = @tir.if_then_else(((((1 &lt;= (ry.outer.outer + floormod(blockIdx.x, 7))) &amp;&amp; ((ry.outer.outer + floormod(blockIdx.x, 7)) &lt; 8)) &amp;&amp; (1 &lt;= floormod((threadIdx.x_1 + 6), 9))) &amp;&amp; (floormod((threadIdx.x_1 + 6), 9) &lt; 8)), data[(((((cse_var_4 + (floordiv((threadIdx.x_1 + 96), 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1 + 6), 9)) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32;
+          if @tir.likely((threadIdx.x_1 &lt; 16), dtype=bool) {
+            pad_temp.shared_1[(threadIdx.x_1 + 128)] = @tir.if_then_else(((((1 &lt;= (ry.outer.outer + floormod(blockIdx.x, 7))) &amp;&amp; ((ry.outer.outer + floormod(blockIdx.x, 7)) &lt; 8)) &amp;&amp; (1 &lt;= floormod((threadIdx.x_1 + 2), 9))) &amp;&amp; (floormod((threadIdx.x_1 + 2), 9) &lt; 8)), data[(((((cse_var_4 + (floordiv((threadIdx.x_1 + 128), 9)*49)) + cse_var_3) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1 + 2), 9)) - 8)], 0f32, dtype=float32)
           }
-          attr [IterVar(threadIdx.x_2: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 196 {
-            kernel.shared_1: Buffer(kernel.shared, float32, [768], [], scope=&quot;shared&quot;)[(threadIdx.x_2*3)] = kernel[(((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 32)*4608)) + (rc.outer.outer*288)) + (floormod(threadIdx.x_2, 32)*9)) + (ry.outer.outer*3))]
-            kernel.shared_1[((threadIdx.x_2*3) + 1)] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 32)*4608)) + (rc.outer.outer*288)) + (floormod(threadIdx.x_2, 32)*9)) + (ry.outer.outer*3)) + 1)]
-            kernel.shared_1[((threadIdx.x_2*3) + 2)] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 32)*4608)) + (rc.outer.outer*288)) + (floormod(threadIdx.x_2, 32)*9)) + (ry.outer.outer*3)) + 2)]
-          }
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 196;
-          if @tir.likely((threadIdx.x_2 &lt; 60), dtype=bool) {
-            kernel.shared_1[((threadIdx.x_2*3) + 588)] = kernel[(((((blockIdx.x*36864) + (floordiv((floordiv(threadIdx.x_2, 4) + 49), 8)*4608)) + (rc.outer.outer*288)) + (floormod((threadIdx.x_2 + 4), 32)*9)) + (ry.outer.outer*3))]
-            kernel.shared_1[((threadIdx.x_2*3) + 589)] = kernel[((((((blockIdx.x*36864) + (floordiv((floordiv(threadIdx.x_2, 4) + 49), 8)*4608)) + (rc.outer.outer*288)) + (floormod((threadIdx.x_2 + 4), 32)*9)) + (ry.outer.outer*3)) + 1)]
-            kernel.shared_1[((threadIdx.x_2*3) + 590)] = kernel[((((((blockIdx.x*36864) + (floordiv((floordiv(threadIdx.x_2, 4) + 49), 8)*4608)) + (rc.outer.outer*288)) + (floormod((threadIdx.x_2 + 4), 32)*9)) + (ry.outer.outer*3)) + 2)]
-          }
-          for (rc.outer.inner: int32, 0, 16) {
-            conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7))]*kernel.shared_1[((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6))]))
-            conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7))]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 384)]))
-            conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 1)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 1)]))
-            conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 1)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 385)]))
-            conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 2)]))
-            conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 386)]))
-            conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 3)]))
-            conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 387)]))
-            conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 64)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 4)]))
-            conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 64)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 388)]))
-            conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 65)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 5)]))
-            conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + floormod(threadIdx.x, 7)) + 65)]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*6)) + 389)]))
+          attr [IterVar(threadIdx.x_2: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32;
+          kernel.shared_1: Buffer(kernel.shared, float32, [1536], [], scope=&quot;shared&quot;)[ramp((threadIdx.x_2*4), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(threadIdx.x_2, 12)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp((threadIdx.x_2*4), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp(threadIdx.x_2, 1, 4), broadcast(3, 4)))]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32;
+          kernel.shared_1[ramp(((threadIdx.x_2*4) + 128), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 128), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 128), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 32), 1, 4), broadcast(3, 4)))]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32;
+          kernel.shared_1[ramp(((threadIdx.x_2*4) + 256), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 256), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 256), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 64), 1, 4), broadcast(3, 4)))]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32;
+          kernel.shared_1[ramp(((threadIdx.x_2*4) + 384), 1, 4)] = kernel[(((broadcast(((((floordiv(blockIdx.x, 7)*147456) + (floordiv(threadIdx.x_2, 12)*4608)) + cse_var_2) + 36864), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 384), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 96), 1, 4), broadcast(3, 4)))]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32;
+          kernel.shared_1[ramp(((threadIdx.x_2*4) + 512), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 512), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 512), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 128), 1, 4), broadcast(3, 4)))]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32;
+          kernel.shared_1[ramp(((threadIdx.x_2*4) + 640), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 640), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 640), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 160), 1, 4), broadcast(3, 4)))]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32;
+          kernel.shared_1[ramp(((threadIdx.x_2*4) + 768), 1, 4)] = kernel[(((broadcast(((((floordiv(blockIdx.x, 7)*147456) + (floordiv(threadIdx.x_2, 12)*4608)) + cse_var_2) + 73728), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 768), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 192), 1, 4), broadcast(3, 4)))]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32;
+          kernel.shared_1[ramp(((threadIdx.x_2*4) + 896), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 896), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 896), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 224), 1, 4), broadcast(3, 4)))]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32;
+          kernel.shared_1[ramp(((threadIdx.x_2*4) + 1024), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 1024), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1024), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 256), 1, 4), broadcast(3, 4)))]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32;
+          kernel.shared_1[ramp(((threadIdx.x_2*4) + 1152), 1, 4)] = kernel[(((broadcast(((((floordiv(blockIdx.x, 7)*147456) + (floordiv(threadIdx.x_2, 12)*4608)) + cse_var_2) + 110592), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1152), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 288), 1, 4), broadcast(3, 4)))]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32;
+          kernel.shared_1[ramp(((threadIdx.x_2*4) + 1280), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 1280), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1280), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 320), 1, 4), broadcast(3, 4)))]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32;
+          kernel.shared_1[ramp(((threadIdx.x_2*4) + 1408), 1, 4)] = kernel[(((broadcast((((floordiv(blockIdx.x, 7)*147456) + (floordiv(((threadIdx.x_2*4) + 1408), 48)*4608)) + cse_var_2), 4) + (floormod(floordiv(ramp(((threadIdx.x_2*4) + 1408), 1, 4), broadcast(3, 4)), broadcast(16, 4))*broadcast(9, 4))) + broadcast(cse_var_1, 4)) + floormod(ramp((threadIdx.x_2 + 352), 1, 4), broadcast(3, 4)))]
+          for (rc.outer.inner: int32, 0, 8) {
+            let cse_var_19: int32 = (rc.outer.inner*18)
+            let cse_var_18: int32 = (cse_var_19 + 7)
+            let cse_var_17: int32 = (cse_var_19 + 6)
+            let cse_var_16: int32 = (cse_var_19 + 5)
+            let cse_var_15: int32 = (cse_var_19 + 4)
+            let cse_var_14: int32 = (cse_var_19 + 3)
+            let cse_var_13: int32 = (cse_var_19 + 2)
+            let cse_var_12: int32 = (cse_var_19 + 16)
+            let cse_var_11: int32 = (cse_var_19 + 15)
+            let cse_var_10: int32 = (cse_var_19 + 14)
+            let cse_var_9: int32 = (cse_var_19 + 13)
+            let cse_var_8: int32 = (cse_var_19 + 12)
+            let cse_var_7: int32 = (cse_var_19 + 11)
+            let cse_var_6: int32 = (cse_var_19 + 10)
+            let cse_var_5: int32 = (cse_var_19 + 1)
+             {
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_19]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_5]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_13]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_14]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_15]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_16]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_17]*kernel.shared_1[((threadIdx.x*48) + (rc.outer.inner*6))]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_5]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_13]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_14]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_15]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_16]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_17]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_18]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 1)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_13]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_14]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_15]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_16]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_17]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_18]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(cse_var_19 + 8)]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 2)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(cse_var_19 + 9)]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_6]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_7]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_8]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_9]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_10]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_11]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 3)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_6]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_7]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_8]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_9]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_10]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_11]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[cse_var_12]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 4)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[cse_var_7]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[cse_var_8]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[cse_var_9]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[cse_var_10]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[cse_var_11]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[cse_var_12]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(cse_var_19 + 17)]*kernel.shared_1[(((threadIdx.x*48) + (rc.outer.inner*6)) + 5)]))
+            }
           }
         }
       }
     }
-    compute[((blockIdx.x*392) + threadIdx.x)] = max((conv2d_nchw_1[0] + bias[((blockIdx.x*8) + floordiv(threadIdx.x, 49))]), 0f32)
-    compute[(((blockIdx.x*392) + threadIdx.x) + 196)] = max((conv2d_nchw_1[1] + bias[(((blockIdx.x*8) + floordiv(threadIdx.x, 49)) + 4)]), 0f32)
+    for (i3.inner: int32, 0, 7) {
+      compute[((((floordiv(blockIdx.x, 7)*1568) + (threadIdx.x*49)) + (floormod(blockIdx.x, 7)*7)) + i3.inner)] = max((conv2d_nchw_1[i3.inner] + bias[((floordiv(blockIdx.x, 7)*32) + threadIdx.x)]), 0f32)
+    }
   }
 }
 </pre></div>
@@ -587,7 +643,7 @@ cooperative fetching, unrolling and operator fusion.</p>
 <span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.321 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.322 ms
 </pre></div>
 </div>
 </div>
@@ -618,18 +674,18 @@ conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_
 conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
 conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
 conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
-conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=4)
-conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=2)
+conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=32)
+conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
 conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
 conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
-conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7)
+conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
 conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
-conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
+conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=7)
 conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
-conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
+conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
 conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
 conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
-conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=16)
+conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=8)
 conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
 conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
 conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=3)
@@ -639,13 +695,13 @@ compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
 compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
 compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
 compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=1)
-compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=4)
-compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=2)
+compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=32)
+compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
 compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
-compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7)
+compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
 compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
-compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
-compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
+compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
+compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
 compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
 s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
 s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -663,14 +719,14 @@ s[compute].bind(compute_i0_o_o_i_i1_o_o_i_fused_i2_o_o_i_fused_i3_o_o_i_fused, t
 compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused = s[compute].fuse(compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i)
 s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread_axis(&quot;threadIdx.x&quot;))
 kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
-kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=3)
+kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
 s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=196)
+kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=32)
 s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis(&quot;threadIdx.x&quot;))
 pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
 pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
 s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=196)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=32)
 s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis(&quot;threadIdx.x&quot;))
 s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, &quot;auto_unroll_max_step&quot;, 64)
 s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, &quot;unroll_explicit&quot;, True)
@@ -690,55 +746,2561 @@ CUDA source code:
   #define int64_t long long
   #define uint64_t unsigned long long
 #endif
-extern &quot;C&quot; __global__ void __launch_bounds__(196) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
-  float conv2d_nchw[2];
-  __shared__ float pad_temp_shared[2016];
-  __shared__ float kernel_shared[768];
+extern &quot;C&quot; __global__ void __launch_bounds__(32) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+  float conv2d_nchw[7];
+  __shared__ float pad_temp_shared[144];
+  __shared__ float kernel_shared[1536];
   conv2d_nchw[0] = 0.000000e+00f;
   conv2d_nchw[1] = 0.000000e+00f;
-  for (int rc_outer_outer = 0; rc_outer_outer &lt; 16; ++rc_outer_outer) {
+  conv2d_nchw[2] = 0.000000e+00f;
+  conv2d_nchw[3] = 0.000000e+00f;
+  conv2d_nchw[4] = 0.000000e+00f;
+  conv2d_nchw[5] = 0.000000e+00f;
+  conv2d_nchw[6] = 0.000000e+00f;
+  for (int rc_outer_outer = 0; rc_outer_outer &lt; 32; ++rc_outer_outer) {
     for (int ry_outer_outer = 0; ry_outer_outer &lt; 3; ++ry_outer_outer) {
       __syncthreads();
-      pad_temp_shared[((int)threadIdx.x)] = (((((1 &lt;= (((((int)threadIdx.x) % 63) / 9) + ry_outer_outer)) &amp;&amp; ((((((int)threadIdx.x) % 63) / 9) + ry_outer_outer) &lt; 8)) &amp;&amp; (1 &lt;= (((int)threadIdx.x) % 9))) &amp;&amp; ((((int)threadIdx.x) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + ((((int)threadIdx.x) / 9) * 7)) + (ry_outer_outer * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
-      pad_temp_shared[(((int)threadIdx.x) + 196)] = (((((1 &lt;= ((((((int)threadIdx.x) + 7) % 63) / 9) + ry_outer_outer)) &amp;&amp; (((((((int)threadIdx.x) + 7) % 63) / 9) + ry_outer_outer) &lt; 8)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 7) % 9))) &amp;&amp; (((((int)threadIdx.x) + 7) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 196) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
-      pad_temp_shared[(((int)threadIdx.x) + 392)] = (((((1 &lt;= ((((((int)threadIdx.x) + 14) % 63) / 9) + ry_outer_outer)) &amp;&amp; (((((((int)threadIdx.x) + 14) % 63) / 9) + ry_outer_outer) &lt; 8)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 5) % 9))) &amp;&amp; (((((int)threadIdx.x) + 5) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 392) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
-      pad_temp_shared[(((int)threadIdx.x) + 588)] = (((((1 &lt;= ((((((int)threadIdx.x) + 21) % 63) / 9) + ry_outer_outer)) &amp;&amp; (((((((int)threadIdx.x) + 21) % 63) / 9) + ry_outer_outer) &lt; 8)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 3) % 9))) &amp;&amp; (((((int)threadIdx.x) + 3) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 588) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 3) % 9)) - 8)] : 0.000000e+00f);
-      pad_temp_shared[(((int)threadIdx.x) + 784)] = (((((1 &lt;= ((((((int)threadIdx.x) + 28) % 63) / 9) + ry_outer_outer)) &amp;&amp; (((((((int)threadIdx.x) + 28) % 63) / 9) + ry_outer_outer) &lt; 8)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 1) % 9))) &amp;&amp; (((((int)threadIdx.x) + 1) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 784) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8)] : 0.000000e+00f);
-      pad_temp_shared[(((int)threadIdx.x) + 980)] = (((((1 &lt;= ((((((int)threadIdx.x) + 35) % 63) / 9) + ry_outer_outer)) &amp;&amp; (((((((int)threadIdx.x) + 35) % 63) / 9) + ry_outer_outer) &lt; 8)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 8) % 9))) &amp;&amp; (((((int)threadIdx.x) + 8) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 980) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
-      pad_temp_shared[(((int)threadIdx.x) + 1176)] = (((((1 &lt;= ((((((int)threadIdx.x) + 42) % 63) / 9) + ry_outer_outer)) &amp;&amp; (((((((int)threadIdx.x) + 42) % 63) / 9) + ry_outer_outer) &lt; 8)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 6) % 9))) &amp;&amp; (((((int)threadIdx.x) + 6) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1176) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
-      pad_temp_shared[(((int)threadIdx.x) + 1372)] = (((((1 &lt;= ((((((int)threadIdx.x) + 49) % 63) / 9) + ry_outer_outer)) &amp;&amp; (((((((int)threadIdx.x) + 49) % 63) / 9) + ry_outer_outer) &lt; 8)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 4) % 9))) &amp;&amp; (((((int)threadIdx.x) + 4) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1372) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
-      pad_temp_shared[(((int)threadIdx.x) + 1568)] = (((((1 &lt;= ((((((int)threadIdx.x) + 56) % 63) / 9) + ry_outer_outer)) &amp;&amp; (((((((int)threadIdx.x) + 56) % 63) / 9) + ry_outer_outer) &lt; 8)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 2) % 9))) &amp;&amp; (((((int)threadIdx.x) + 2) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1568) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
-      pad_temp_shared[(((int)threadIdx.x) + 1764)] = (((((1 &lt;= (((((int)threadIdx.x) % 63) / 9) + ry_outer_outer)) &amp;&amp; ((((((int)threadIdx.x) % 63) / 9) + ry_outer_outer) &lt; 8)) &amp;&amp; (1 &lt;= (((int)threadIdx.x) % 9))) &amp;&amp; ((((int)threadIdx.x) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + ((((int)threadIdx.x) / 9) * 7)) + (ry_outer_outer * 7)) + (((int)threadIdx.x) % 9)) + 1364)] : 0.000000e+00f);
-      if (((int)threadIdx.x) &lt; 56) {
-        pad_temp_shared[(((int)threadIdx.x) + 1960)] = (((((1 &lt;= ((((((int)threadIdx.x) + 7) % 63) / 9) + ry_outer_outer)) &amp;&amp; (((((((int)threadIdx.x) + 7) % 63) / 9) + ry_outer_outer) &lt; 8)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 7) % 9))) &amp;&amp; (((((int)threadIdx.x) + 7) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1960) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
-      }
-      kernel_shared[(((int)threadIdx.x) * 3)] = kernel[(((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) &gt;&gt; 5) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) &amp; 31) * 9)) + (ry_outer_outer * 3))];
-      kernel_shared[((((int)threadIdx.x) * 3) + 1)] = kernel[((((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) &gt;&gt; 5) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) &amp; 31) * 9)) + (ry_outer_outer * 3)) + 1)];
-      kernel_shared[((((int)threadIdx.x) * 3) + 2)] = kernel[((((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) &gt;&gt; 5) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) &amp; 31) * 9)) + (ry_outer_outer * 3)) + 2)];
-      if (((int)threadIdx.x) &lt; 60) {
-        kernel_shared[((((int)threadIdx.x) * 3) + 588)] = kernel[(((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) &gt;&gt; 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 4) &amp; 31) * 9)) + (ry_outer_outer * 3))];
-        kernel_shared[((((int)threadIdx.x) * 3) + 589)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) &gt;&gt; 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 4) &amp; 31) * 9)) + (ry_outer_outer * 3)) + 1)];
-        kernel_shared[((((int)threadIdx.x) * 3) + 590)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) &gt;&gt; 5) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 4) &amp; 31) * 9)) + (ry_outer_outer * 3)) + 2)];
+      pad_temp_shared[((int)threadIdx.x)] = (((((1 &lt;= (ry_outer_outer + (((int)blockIdx.x) % 7))) &amp;&amp; ((ry_outer_outer + (((int)blockIdx.x) % 7)) &lt; 8)) &amp;&amp; (1 &lt;= (((int)threadIdx.x) % 9))) &amp;&amp; ((((int)threadIdx.x) % 9) &lt; 8)) ? data[((((((rc_outer_outer * 784) + ((((int)threadIdx.x) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 32)] = (((((1 &lt;= (ry_outer_outer + (((int)blockIdx.x) % 7))) &amp;&amp; ((ry_outer_outer + (((int)blockIdx.x) % 7)) &lt; 8)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 5) % 9))) &amp;&amp; (((((int)threadIdx.x) + 5) % 9) &lt; 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 32) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 64)] = (((((1 &lt;= (ry_outer_outer + (((int)blockIdx.x) % 7))) &amp;&amp; ((ry_outer_outer + (((int)blockIdx.x) % 7)) &lt; 8)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 1) % 9))) &amp;&amp; (((((int)threadIdx.x) + 1) % 9) &lt; 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 64) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 96)] = (((((1 &lt;= (ry_outer_outer + (((int)blockIdx.x) % 7))) &amp;&amp; ((ry_outer_outer + (((int)blockIdx.x) % 7)) &lt; 8)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 6) % 9))) &amp;&amp; (((((int)threadIdx.x) + 6) % 9) &lt; 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 96) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
+      if (((int)threadIdx.x) &lt; 16) {
+        pad_temp_shared[(((int)threadIdx.x) + 128)] = (((((1 &lt;= (ry_outer_outer + (((int)blockIdx.x) % 7))) &amp;&amp; ((ry_outer_outer + (((int)blockIdx.x) % 7)) &lt; 8)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 2) % 9))) &amp;&amp; (((((int)threadIdx.x) + 2) % 9) &lt; 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 128) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
       }
+      int4 _1;
+        int4 _2;
+          int4 _3;
+            int4 _4 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)));
+            int4 _5;
+              int4 _6;
+                int4 _7;
+                  int4 _8 = make_int4(((((int)threadIdx.x) * 4))+(1*0), ((((int)threadIdx.x) * 4))+(1*1), ((((int)threadIdx.x) * 4))+(1*2), ((((int)threadIdx.x) * 4))+(1*3));
+                  int4 _9 = make_int4(3, 3, 3, 3);
+                  _7.x = (_8.x%_9.x);
+                  _7.y = (_8.y%_9.y);
+                  _7.z = (_8.z%_9.z);
+                  _7.w = (_8.w%_9.w);
+                int4 _10;
+                  int4 _11 = make_int4(((((int)threadIdx.x) * 4))+(1*0), ((((int)threadIdx.x) * 4))+(1*1), ((((int)threadIdx.x) * 4))+(1*2), ((((int)threadIdx.x) * 4))+(1*3));
+                  int4 _12 = make_int4(3, 3, 3, 3);
+                  _10.x = (_11.x/_12.x);
+                  _10.y = (_11.y/_12.y);
+                  _10.z = (_11.z/_12.z);
+                  _10.w = (_11.w/_12.w);
+                int4 _13;
+                ushort4 _14;
+                  ushort4 _15;
+                    ushort4 _16;
+                      int4 _17 = make_int4(3, 3, 3, 3);
+                      int4 _18 = make_int4(0, 0, 0, 0);
+                      _16.x = (_17.x&gt;=_18.x);
+                      _16.y = (_17.y&gt;=_18.y);
+                      _16.z = (_17.z&gt;=_18.z);
+                      _16.w = (_17.w&gt;=_18.w);
+                    ushort4 _19;
+                      int4 _20 = make_int4(0, 0, 0, 0);
+                      _19.x = (_7.x&gt;=_20.x);
+                      _19.y = (_7.y&gt;=_20.y);
+                      _19.z = (_7.z&gt;=_20.z);
+                      _19.w = (_7.w&gt;=_20.w);
+                    _15.x = (_16.x&amp;&amp;_19.x);
+                    _15.y = (_16.y&amp;&amp;_19.y);
+                    _15.z = (_16.z&amp;&amp;_19.z);
+                    _15.w = (_16.w&amp;&amp;_19.w);
+                  ushort4 _21;
+                    ushort4 _22;
+                      int4 _23 = make_int4(3, 3, 3, 3);
+                      int4 _24 = make_int4(0, 0, 0, 0);
+                      _22.x = (_23.x&lt;_24.x);
+                      _22.y = (_23.y&lt;_24.y);
+                      _22.z = (_23.z&lt;_24.z);
+                      _22.w = (_23.w&lt;_24.w);
+                    ushort4 _25;
+                      int4 _26 = make_int4(0, 0, 0, 0);
+                      _25.x = (_7.x&lt;=_26.x);
+                      _25.y = (_7.y&lt;=_26.y);
+                      _25.z = (_7.z&lt;=_26.z);
+                      _25.w = (_7.w&lt;=_26.w);
+                    _21.x = (_22.x&amp;&amp;_25.x);
+                    _21.y = (_22.y&amp;&amp;_25.y);
+                    _21.z = (_22.z&amp;&amp;_25.z);
+                    _21.w = (_22.w&amp;&amp;_25.w);
+                  _14.x = (_15.x||_21.x);
+                  _14.y = (_15.y||_21.y);
+                  _14.z = (_15.z||_21.z);
+                  _14.w = (_15.w||_21.w);
+                int4 _27;
+                  int4 _28 = make_int4(1, 1, 1, 1);
+                  _27.x = (_10.x-_28.x);
+                  _27.y = (_10.y-_28.y);
+                  _27.z = (_10.z-_28.z);
+                  _27.w = (_10.w-_28.w);
+                _13.x = (bool(_14.x)?_10.x:_27.x);
+                _13.y = (bool(_14.y)?_10.y:_27.y);
+                _13.z = (bool(_14.z)?_10.z:_27.z);
+                _13.w = (bool(_14.w)?_10.w:_27.w);
+                int4 _29 = make_int4(16, 16, 16, 16);
+                _6.x = (_13.x%_29.x);
+                _6.y = (_13.y%_29.y);
+                _6.z = (_13.z%_29.z);
+                _6.w = (_13.w%_29.w);
+              int4 _30;
+              ushort4 _31;
+                ushort4 _32;
+                  ushort4 _33;
+                    int4 _34 = make_int4(16, 16, 16, 16);
+                    int4 _35 = make_int4(0, 0, 0, 0);
+                    _33.x = (_34.x&gt;=_35.x);
+                    _33.y = (_34.y&gt;=_35.y);
+                    _33.z = (_34.z&gt;=_35.z);
+                    _33.w = (_34.w&gt;=_35.w);
+                  ushort4 _36;
+                    int4 _37 = make_int4(0, 0, 0, 0);
+                    _36.x = (_6.x&gt;=_37.x);
+                    _36.y = (_6.y&gt;=_37.y);
+                    _36.z = (_6.z&gt;=_37.z);
+                    _36.w = (_6.w&gt;=_37.w);
+                  _32.x = (_33.x&amp;&amp;_36.x);
+                  _32.y = (_33.y&amp;&amp;_36.y);
+                  _32.z = (_33.z&amp;&amp;_36.z);
+                  _32.w = (_33.w&amp;&amp;_36.w);
+                ushort4 _38;
+                  ushort4 _39;
+                    int4 _40 = make_int4(16, 16, 16, 16);
+                    int4 _41 = make_int4(0, 0, 0, 0);
+                    _39.x = (_40.x&lt;_41.x);
+                    _39.y = (_40.y&lt;_41.y);
+                    _39.z = (_40.z&lt;_41.z);
+                    _39.w = (_40.w&lt;_41.w);
+                  ushort4 _42;
+                    int4 _43 = make_int4(0, 0, 0, 0);
+                    _42.x = (_6.x&lt;=_43.x);
+                    _42.y = (_6.y&lt;=_43.y);
+                    _42.z = (_6.z&lt;=_43.z);
+                    _42.w = (_6.w&lt;=_43.w);
+                  _38.x = (_39.x&amp;&amp;_42.x);
+                  _38.y = (_39.y&amp;&amp;_42.y);
+                  _38.z = (_39.z&amp;&amp;_42.z);
+                  _38.w = (_39.w&amp;&amp;_42.w);
+                _31.x = (_32.x||_38.x);
+                _31.y = (_32.y||_38.y);
+                _31.z = (_32.z||_38.z);
+                _31.w = (_32.w||_38.w);
+              int4 _44;
+                int4 _45 = make_int4(16, 16, 16, 16);
+                _44.x = (_6.x+_45.x);
+                _44.y = (_6.y+_45.y);
+                _44.z = (_6.z+_45.z);
+                _44.w = (_6.w+_45.w);
+              _30.x = (bool(_31.x)?_6.x:_44.x);
+              _30.y = (bool(_31.y)?_6.y:_44.y);
+              _30.z = (bool(_31.z)?_6.z:_44.z);
+              _30.w = (bool(_31.w)?_6.w:_44.w);
+              int4 _46 = make_int4(9, 9, 9, 9);
+              _5.x = (_30.x*_46.x);
+              _5.y = (_30.y*_46.y);
+              _5.z = (_30.z*_46.z);
+              _5.w = (_30.w*_46.w);
+            _3.x = (_4.x+_5.x);
+            _3.y = (_4.y+_5.y);
+            _3.z = (_4.z+_5.z);
+            _3.w = (_4.w+_5.w);
+          int4 _47 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+          _2.x = (_3.x+_47.x);
+          _2.y = (_3.y+_47.y);
+          _2.z = (_3.z+_47.z);
+          _2.w = (_3.w+_47.w);
+        int4 _48;
+          int4 _49 = make_int4((((int)threadIdx.x))+(1*0), (((int)threadIdx.x))+(1*1), (((int)threadIdx.x))+(1*2), (((int)threadIdx.x))+(1*3));
+          int4 _50 = make_int4(3, 3, 3, 3);
+          _48.x = (_49.x%_50.x);
+          _48.y = (_49.y%_50.y);
+          _48.z = (_49.z%_50.z);
+          _48.w = (_49.w%_50.w);
+        int4 _51;
+        ushort4 _52;
+          ushort4 _53;
+            ushort4 _54;
+              int4 _55 = make_int4(3, 3, 3, 3);
+              int4 _56 = make_int4(0, 0, 0, 0);
+              _54.x = (_55.x&gt;=_56.x);
+              _54.y = (_55.y&gt;=_56.y);
+              _54.z = (_55.z&gt;=_56.z);
+              _54.w = (_55.w&gt;=_56.w);
+            ushort4 _57;
+              int4 _58 = make_int4(0, 0, 0, 0);
+              _57.x = (_48.x&gt;=_58.x);
+              _57.y = (_48.y&gt;=_58.y);
+              _57.z = (_48.z&gt;=_58.z);
+              _57.w = (_48.w&gt;=_58.w);
+            _53.x = (_54.x&amp;&amp;_57.x);
+            _53.y = (_54.y&amp;&amp;_57.y);
+            _53.z = (_54.z&amp;&amp;_57.z);
+            _53.w = (_54.w&amp;&amp;_57.w);
+          ushort4 _59;
+            ushort4 _60;
+              int4 _61 = make_int4(3, 3, 3, 3);
+              int4 _62 = make_int4(0, 0, 0, 0);
+              _60.x = (_61.x&lt;_62.x);
+              _60.y = (_61.y&lt;_62.y);
+              _60.z = (_61.z&lt;_62.z);
+              _60.w = (_61.w&lt;_62.w);
+            ushort4 _63;
+              int4 _64 = make_int4(0, 0, 0, 0);
+              _63.x = (_48.x&lt;=_64.x);
+              _63.y = (_48.y&lt;=_64.y);
+              _63.z = (_48.z&lt;=_64.z);
+              _63.w = (_48.w&lt;=_64.w);
+            _59.x = (_60.x&amp;&amp;_63.x);
+            _59.y = (_60.y&amp;&amp;_63.y);
+            _59.z = (_60.z&amp;&amp;_63.z);
+            _59.w = (_60.w&amp;&amp;_63.w);
+          _52.x = (_53.x||_59.x);
+          _52.y = (_53.y||_59.y);
+          _52.z = (_53.z||_59.z);
+          _52.w = (_53.w||_59.w);
+        int4 _65;
+          int4 _66 = make_int4(3, 3, 3, 3);
+          _65.x = (_48.x+_66.x);
+          _65.y = (_48.y+_66.y);
+          _65.z = (_48.z+_66.z);
+          _65.w = (_48.w+_66.w);
+        _51.x = (bool(_52.x)?_48.x:_65.x);
+        _51.y = (bool(_52.y)?_48.y:_65.y);
+        _51.z = (bool(_52.z)?_48.z:_65.z);
+        _51.w = (bool(_52.w)?_48.w:_65.w);
+        _1.x = (_2.x+_51.x);
+        _1.y = (_2.y+_51.y);
+        _1.z = (_2.z+_51.z);
+        _1.w = (_2.w+_51.w);
+      *(float4*)(kernel_shared + (((int)threadIdx.x) * 4)) = make_float4(kernel[_1.x],kernel[_1.y],kernel[_1.z],kernel[_1.w]);
+      int4 _67;
+        int4 _68;
+          int4 _69;
+            int4 _70 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 128) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 128) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 128) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 128) / 48) * 4608)) + (rc_outer_outer * 144)));
+            int4 _71;
+              int4 _72;
+                int4 _73;
+                  int4 _74 = make_int4((((((int)threadIdx.x) * 4) + 128))+(1*0), (((((int)threadIdx.x) * 4) + 128))+(1*1), (((((int)threadIdx.x) * 4) + 128))+(1*2), (((((int)threadIdx.x) * 4) + 128))+(1*3));
+                  int4 _75 = make_int4(3, 3, 3, 3);
+                  _73.x = (_74.x%_75.x);
+                  _73.y = (_74.y%_75.y);
+                  _73.z = (_74.z%_75.z);
+                  _73.w = (_74.w%_75.w);
+                int4 _76;
+                  int4 _77 = make_int4((((((int)threadIdx.x) * 4) + 128))+(1*0), (((((int)threadIdx.x) * 4) + 128))+(1*1), (((((int)threadIdx.x) * 4) + 128))+(1*2), (((((int)threadIdx.x) * 4) + 128))+(1*3));
+                  int4 _78 = make_int4(3, 3, 3, 3);
+                  _76.x = (_77.x/_78.x);
+                  _76.y = (_77.y/_78.y);
+                  _76.z = (_77.z/_78.z);
+                  _76.w = (_77.w/_78.w);
+                int4 _79;
+                ushort4 _80;
+                  ushort4 _81;
+                    ushort4 _82;
+                      int4 _83 = make_int4(3, 3, 3, 3);
+                      int4 _84 = make_int4(0, 0, 0, 0);
+                      _82.x = (_83.x&gt;=_84.x);
+                      _82.y = (_83.y&gt;=_84.y);
+                      _82.z = (_83.z&gt;=_84.z);
+                      _82.w = (_83.w&gt;=_84.w);
+                    ushort4 _85;
+                      int4 _86 = make_int4(0, 0, 0, 0);
+                      _85.x = (_73.x&gt;=_86.x);
+                      _85.y = (_73.y&gt;=_86.y);
+                      _85.z = (_73.z&gt;=_86.z);
+                      _85.w = (_73.w&gt;=_86.w);
+                    _81.x = (_82.x&amp;&amp;_85.x);
+                    _81.y = (_82.y&amp;&amp;_85.y);
+                    _81.z = (_82.z&amp;&amp;_85.z);
+                    _81.w = (_82.w&amp;&amp;_85.w);
+                  ushort4 _87;
+                    ushort4 _88;
+                      int4 _89 = make_int4(3, 3, 3, 3);
+                      int4 _90 = make_int4(0, 0, 0, 0);
+                      _88.x = (_89.x&lt;_90.x);
+                      _88.y = (_89.y&lt;_90.y);
+                      _88.z = (_89.z&lt;_90.z);
+                      _88.w = (_89.w&lt;_90.w);
+                    ushort4 _91;
+                      int4 _92 = make_int4(0, 0, 0, 0);
+                      _91.x = (_73.x&lt;=_92.x);
+                      _91.y = (_73.y&lt;=_92.y);
+                      _91.z = (_73.z&lt;=_92.z);
+                      _91.w = (_73.w&lt;=_92.w);
+                    _87.x = (_88.x&amp;&amp;_91.x);
+                    _87.y = (_88.y&amp;&amp;_91.y);
+                    _87.z = (_88.z&amp;&amp;_91.z);
+                    _87.w = (_88.w&amp;&amp;_91.w);
+                  _80.x = (_81.x||_87.x);
+                  _80.y = (_81.y||_87.y);
+                  _80.z = (_81.z||_87.z);
+                  _80.w = (_81.w||_87.w);
+                int4 _93;
+                  int4 _94 = make_int4(1, 1, 1, 1);
+                  _93.x = (_76.x-_94.x);
+                  _93.y = (_76.y-_94.y);
+                  _93.z = (_76.z-_94.z);
+                  _93.w = (_76.w-_94.w);
+                _79.x = (bool(_80.x)?_76.x:_93.x);
+                _79.y = (bool(_80.y)?_76.y:_93.y);
+                _79.z = (bool(_80.z)?_76.z:_93.z);
+                _79.w = (bool(_80.w)?_76.w:_93.w);
+                int4 _95 = make_int4(16, 16, 16, 16);
+                _72.x = (_79.x%_95.x);
+                _72.y = (_79.y%_95.y);
+                _72.z = (_79.z%_95.z);
+                _72.w = (_79.w%_95.w);
+              int4 _96;
+              ushort4 _97;
+                ushort4 _98;
+                  ushort4 _99;
+                    int4 _100 = make_int4(16, 16, 16, 16);
+                    int4 _101 = make_int4(0, 0, 0, 0);
+                    _99.x = (_100.x&gt;=_101.x);
+                    _99.y = (_100.y&gt;=_101.y);
+                    _99.z = (_100.z&gt;=_101.z);
+                    _99.w = (_100.w&gt;=_101.w);
+                  ushort4 _102;
+                    int4 _103 = make_int4(0, 0, 0, 0);
+                    _102.x = (_72.x&gt;=_103.x);
+                    _102.y = (_72.y&gt;=_103.y);
+                    _102.z = (_72.z&gt;=_103.z);
+                    _102.w = (_72.w&gt;=_103.w);
+                  _98.x = (_99.x&amp;&amp;_102.x);
+                  _98.y = (_99.y&amp;&amp;_102.y);
+                  _98.z = (_99.z&amp;&amp;_102.z);
+                  _98.w = (_99.w&amp;&amp;_102.w);
+                ushort4 _104;
+                  ushort4 _105;
+                    int4 _106 = make_int4(16, 16, 16, 16);
+                    int4 _107 = make_int4(0, 0, 0, 0);
+                    _105.x = (_106.x&lt;_107.x);
+                    _105.y = (_106.y&lt;_107.y);
+                    _105.z = (_106.z&lt;_107.z);
+                    _105.w = (_106.w&lt;_107.w);
+                  ushort4 _108;
+                    int4 _109 = make_int4(0, 0, 0, 0);
+                    _108.x = (_72.x&lt;=_109.x);
+                    _108.y = (_72.y&lt;=_109.y);
+                    _108.z = (_72.z&lt;=_109.z);
+                    _108.w = (_72.w&lt;=_109.w);
+                  _104.x = (_105.x&amp;&amp;_108.x);
+                  _104.y = (_105.y&amp;&amp;_108.y);
+                  _104.z = (_105.z&amp;&amp;_108.z);
+                  _104.w = (_105.w&amp;&amp;_108.w);
+                _97.x = (_98.x||_104.x);
+                _97.y = (_98.y||_104.y);
+                _97.z = (_98.z||_104.z);
+                _97.w = (_98.w||_104.w);
+              int4 _110;
+                int4 _111 = make_int4(16, 16, 16, 16);
+                _110.x = (_72.x+_111.x);
+                _110.y = (_72.y+_111.y);
+                _110.z = (_72.z+_111.z);
+                _110.w = (_72.w+_111.w);
+              _96.x = (bool(_97.x)?_72.x:_110.x);
+              _96.y = (bool(_97.y)?_72.y:_110.y);
+              _96.z = (bool(_97.z)?_72.z:_110.z);
+              _96.w = (bool(_97.w)?_72.w:_110.w);
+              int4 _112 = make_int4(9, 9, 9, 9);
+              _71.x = (_96.x*_112.x);
+              _71.y = (_96.y*_112.y);
+              _71.z = (_96.z*_112.z);
+              _71.w = (_96.w*_112.w);
+            _69.x = (_70.x+_71.x);
+            _69.y = (_70.y+_71.y);
+            _69.z = (_70.z+_71.z);
+            _69.w = (_70.w+_71.w);
+          int4 _113 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+          _68.x = (_69.x+_113.x);
+          _68.y = (_69.y+_113.y);
+          _68.z = (_69.z+_113.z);
+          _68.w = (_69.w+_113.w);
+        int4 _114;
+          int4 _115 = make_int4(((((int)threadIdx.x) + 32))+(1*0), ((((int)threadIdx.x) + 32))+(1*1), ((((int)threadIdx.x) + 32))+(1*2), ((((int)threadIdx.x) + 32))+(1*3));
+          int4 _116 = make_int4(3, 3, 3, 3);
+          _114.x = (_115.x%_116.x);
+          _114.y = (_115.y%_116.y);
+          _114.z = (_115.z%_116.z);
+          _114.w = (_115.w%_116.w);
+        int4 _117;
+        ushort4 _118;
+          ushort4 _119;
+            ushort4 _120;
+              int4 _121 = make_int4(3, 3, 3, 3);
+              int4 _122 = make_int4(0, 0, 0, 0);
+              _120.x = (_121.x&gt;=_122.x);
+              _120.y = (_121.y&gt;=_122.y);
+              _120.z = (_121.z&gt;=_122.z);
+              _120.w = (_121.w&gt;=_122.w);
+            ushort4 _123;
+              int4 _124 = make_int4(0, 0, 0, 0);
+              _123.x = (_114.x&gt;=_124.x);
+              _123.y = (_114.y&gt;=_124.y);
+              _123.z = (_114.z&gt;=_124.z);
+              _123.w = (_114.w&gt;=_124.w);
+            _119.x = (_120.x&amp;&amp;_123.x);
+            _119.y = (_120.y&amp;&amp;_123.y);
+            _119.z = (_120.z&amp;&amp;_123.z);
+            _119.w = (_120.w&amp;&amp;_123.w);
+          ushort4 _125;
+            ushort4 _126;
+              int4 _127 = make_int4(3, 3, 3, 3);
+              int4 _128 = make_int4(0, 0, 0, 0);
+              _126.x = (_127.x&lt;_128.x);
+              _126.y = (_127.y&lt;_128.y);
+              _126.z = (_127.z&lt;_128.z);
+              _126.w = (_127.w&lt;_128.w);
+            ushort4 _129;
+              int4 _130 = make_int4(0, 0, 0, 0);
+              _129.x = (_114.x&lt;=_130.x);
+              _129.y = (_114.y&lt;=_130.y);
+              _129.z = (_114.z&lt;=_130.z);
+              _129.w = (_114.w&lt;=_130.w);
+            _125.x = (_126.x&amp;&amp;_129.x);
+            _125.y = (_126.y&amp;&amp;_129.y);
+            _125.z = (_126.z&amp;&amp;_129.z);
+            _125.w = (_126.w&amp;&amp;_129.w);
+          _118.x = (_119.x||_125.x);
+          _118.y = (_119.y||_125.y);
+          _118.z = (_119.z||_125.z);
+          _118.w = (_119.w||_125.w);
+        int4 _131;
+          int4 _132 = make_int4(3, 3, 3, 3);
+          _131.x = (_114.x+_132.x);
+          _131.y = (_114.y+_132.y);
+          _131.z = (_114.z+_132.z);
+          _131.w = (_114.w+_132.w);
+        _117.x = (bool(_118.x)?_114.x:_131.x);
+        _117.y = (bool(_118.y)?_114.y:_131.y);
+        _117.z = (bool(_118.z)?_114.z:_131.z);
+        _117.w = (bool(_118.w)?_114.w:_131.w);
+        _67.x = (_68.x+_117.x);
+        _67.y = (_68.y+_117.y);
+        _67.z = (_68.z+_117.z);
+        _67.w = (_68.w+_117.w);
+      *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 128)) = make_float4(kernel[_67.x],kernel[_67.y],kernel[_67.z],kernel[_67.w]);
+      int4 _133;
+        int4 _134;
+          int4 _135;
+            int4 _136 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 256) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 256) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 256) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 256) / 48) * 4608)) + (rc_outer_outer * 144)));
+            int4 _137;
+              int4 _138;
+                int4 _139;
+                  int4 _140 = make_int4((((((int)threadIdx.x) * 4) + 256))+(1*0), (((((int)threadIdx.x) * 4) + 256))+(1*1), (((((int)threadIdx.x) * 4) + 256))+(1*2), (((((int)threadIdx.x) * 4) + 256))+(1*3));
+                  int4 _141 = make_int4(3, 3, 3, 3);
+                  _139.x = (_140.x%_141.x);
+                  _139.y = (_140.y%_141.y);
+                  _139.z = (_140.z%_141.z);
+                  _139.w = (_140.w%_141.w);
+                int4 _142;
+                  int4 _143 = make_int4((((((int)threadIdx.x) * 4) + 256))+(1*0), (((((int)threadIdx.x) * 4) + 256))+(1*1), (((((int)threadIdx.x) * 4) + 256))+(1*2), (((((int)threadIdx.x) * 4) + 256))+(1*3));
+                  int4 _144 = make_int4(3, 3, 3, 3);
+                  _142.x = (_143.x/_144.x);
+                  _142.y = (_143.y/_144.y);
+                  _142.z = (_143.z/_144.z);
+                  _142.w = (_143.w/_144.w);
+                int4 _145;
+                ushort4 _146;
+                  ushort4 _147;
+                    ushort4 _148;
+                      int4 _149 = make_int4(3, 3, 3, 3);
+                      int4 _150 = make_int4(0, 0, 0, 0);
+                      _148.x = (_149.x&gt;=_150.x);
+                      _148.y = (_149.y&gt;=_150.y);
+                      _148.z = (_149.z&gt;=_150.z);
+                      _148.w = (_149.w&gt;=_150.w);
+                    ushort4 _151;
+                      int4 _152 = make_int4(0, 0, 0, 0);
+                      _151.x = (_139.x&gt;=_152.x);
+                      _151.y = (_139.y&gt;=_152.y);
+                      _151.z = (_139.z&gt;=_152.z);
+                      _151.w = (_139.w&gt;=_152.w);
+                    _147.x = (_148.x&amp;&amp;_151.x);
+                    _147.y = (_148.y&amp;&amp;_151.y);
+                    _147.z = (_148.z&amp;&amp;_151.z);
+                    _147.w = (_148.w&amp;&amp;_151.w);
+                  ushort4 _153;
+                    ushort4 _154;
+                      int4 _155 = make_int4(3, 3, 3, 3);
+                      int4 _156 = make_int4(0, 0, 0, 0);
+                      _154.x = (_155.x&lt;_156.x);
+                      _154.y = (_155.y&lt;_156.y);
+                      _154.z = (_155.z&lt;_156.z);
+                      _154.w = (_155.w&lt;_156.w);
+                    ushort4 _157;
+                      int4 _158 = make_int4(0, 0, 0, 0);
+                      _157.x = (_139.x&lt;=_158.x);
+                      _157.y = (_139.y&lt;=_158.y);
+                      _157.z = (_139.z&lt;=_158.z);
+                      _157.w = (_139.w&lt;=_158.w);
+                    _153.x = (_154.x&amp;&amp;_157.x);
+                    _153.y = (_154.y&amp;&amp;_157.y);
+                    _153.z = (_154.z&amp;&amp;_157.z);
+                    _153.w = (_154.w&amp;&amp;_157.w);
+                  _146.x = (_147.x||_153.x);
+                  _146.y = (_147.y||_153.y);
+                  _146.z = (_147.z||_153.z);
+                  _146.w = (_147.w||_153.w);
+                int4 _159;
+                  int4 _160 = make_int4(1, 1, 1, 1);
+                  _159.x = (_142.x-_160.x);
+                  _159.y = (_142.y-_160.y);
+                  _159.z = (_142.z-_160.z);
+                  _159.w = (_142.w-_160.w);
+                _145.x = (bool(_146.x)?_142.x:_159.x);
+                _145.y = (bool(_146.y)?_142.y:_159.y);
+                _145.z = (bool(_146.z)?_142.z:_159.z);
+                _145.w = (bool(_146.w)?_142.w:_159.w);
+                int4 _161 = make_int4(16, 16, 16, 16);
+                _138.x = (_145.x%_161.x);
+                _138.y = (_145.y%_161.y);
+                _138.z = (_145.z%_161.z);
+                _138.w = (_145.w%_161.w);
+              int4 _162;
+              ushort4 _163;
+                ushort4 _164;
+                  ushort4 _165;
+                    int4 _166 = make_int4(16, 16, 16, 16);
+                    int4 _167 = make_int4(0, 0, 0, 0);
+                    _165.x = (_166.x&gt;=_167.x);
+                    _165.y = (_166.y&gt;=_167.y);
+                    _165.z = (_166.z&gt;=_167.z);
+                    _165.w = (_166.w&gt;=_167.w);
+                  ushort4 _168;
+                    int4 _169 = make_int4(0, 0, 0, 0);
+                    _168.x = (_138.x&gt;=_169.x);
+                    _168.y = (_138.y&gt;=_169.y);
+                    _168.z = (_138.z&gt;=_169.z);
+                    _168.w = (_138.w&gt;=_169.w);
+                  _164.x = (_165.x&amp;&amp;_168.x);
+                  _164.y = (_165.y&amp;&amp;_168.y);
+                  _164.z = (_165.z&amp;&amp;_168.z);
+                  _164.w = (_165.w&amp;&amp;_168.w);
+                ushort4 _170;
+                  ushort4 _171;
+                    int4 _172 = make_int4(16, 16, 16, 16);
+                    int4 _173 = make_int4(0, 0, 0, 0);
+                    _171.x = (_172.x&lt;_173.x);
+                    _171.y = (_172.y&lt;_173.y);
+                    _171.z = (_172.z&lt;_173.z);
+                    _171.w = (_172.w&lt;_173.w);
+                  ushort4 _174;
+                    int4 _175 = make_int4(0, 0, 0, 0);
+                    _174.x = (_138.x&lt;=_175.x);
+                    _174.y = (_138.y&lt;=_175.y);
+                    _174.z = (_138.z&lt;=_175.z);
+                    _174.w = (_138.w&lt;=_175.w);
+                  _170.x = (_171.x&amp;&amp;_174.x);
+                  _170.y = (_171.y&amp;&amp;_174.y);
+                  _170.z = (_171.z&amp;&amp;_174.z);
+                  _170.w = (_171.w&amp;&amp;_174.w);
+                _163.x = (_164.x||_170.x);
+                _163.y = (_164.y||_170.y);
+                _163.z = (_164.z||_170.z);
+                _163.w = (_164.w||_170.w);
+              int4 _176;
+                int4 _177 = make_int4(16, 16, 16, 16);
+                _176.x = (_138.x+_177.x);
+                _176.y = (_138.y+_177.y);
+                _176.z = (_138.z+_177.z);
+                _176.w = (_138.w+_177.w);
+              _162.x = (bool(_163.x)?_138.x:_176.x);
+              _162.y = (bool(_163.y)?_138.y:_176.y);
+              _162.z = (bool(_163.z)?_138.z:_176.z);
+              _162.w = (bool(_163.w)?_138.w:_176.w);
+              int4 _178 = make_int4(9, 9, 9, 9);
+              _137.x = (_162.x*_178.x);
+              _137.y = (_162.y*_178.y);
+              _137.z = (_162.z*_178.z);
+              _137.w = (_162.w*_178.w);
+            _135.x = (_136.x+_137.x);
+            _135.y = (_136.y+_137.y);
+            _135.z = (_136.z+_137.z);
+            _135.w = (_136.w+_137.w);
+          int4 _179 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+          _134.x = (_135.x+_179.x);
+          _134.y = (_135.y+_179.y);
+          _134.z = (_135.z+_179.z);
+          _134.w = (_135.w+_179.w);
+        int4 _180;
+          int4 _181 = make_int4(((((int)threadIdx.x) + 64))+(1*0), ((((int)threadIdx.x) + 64))+(1*1), ((((int)threadIdx.x) + 64))+(1*2), ((((int)threadIdx.x) + 64))+(1*3));
+          int4 _182 = make_int4(3, 3, 3, 3);
+          _180.x = (_181.x%_182.x);
+          _180.y = (_181.y%_182.y);
+          _180.z = (_181.z%_182.z);
+          _180.w = (_181.w%_182.w);
+        int4 _183;
+        ushort4 _184;
+          ushort4 _185;
+            ushort4 _186;
+              int4 _187 = make_int4(3, 3, 3, 3);
+              int4 _188 = make_int4(0, 0, 0, 0);
+              _186.x = (_187.x&gt;=_188.x);
+              _186.y = (_187.y&gt;=_188.y);
+              _186.z = (_187.z&gt;=_188.z);
+              _186.w = (_187.w&gt;=_188.w);
+            ushort4 _189;
+              int4 _190 = make_int4(0, 0, 0, 0);
+              _189.x = (_180.x&gt;=_190.x);
+              _189.y = (_180.y&gt;=_190.y);
+              _189.z = (_180.z&gt;=_190.z);
+              _189.w = (_180.w&gt;=_190.w);
+            _185.x = (_186.x&amp;&amp;_189.x);
+            _185.y = (_186.y&amp;&amp;_189.y);
+            _185.z = (_186.z&amp;&amp;_189.z);
+            _185.w = (_186.w&amp;&amp;_189.w);
+          ushort4 _191;
+            ushort4 _192;
+              int4 _193 = make_int4(3, 3, 3, 3);
+              int4 _194 = make_int4(0, 0, 0, 0);
+              _192.x = (_193.x&lt;_194.x);
+              _192.y = (_193.y&lt;_194.y);
+              _192.z = (_193.z&lt;_194.z);
+              _192.w = (_193.w&lt;_194.w);
+            ushort4 _195;
+              int4 _196 = make_int4(0, 0, 0, 0);
+              _195.x = (_180.x&lt;=_196.x);
+              _195.y = (_180.y&lt;=_196.y);
+              _195.z = (_180.z&lt;=_196.z);
+              _195.w = (_180.w&lt;=_196.w);
+            _191.x = (_192.x&amp;&amp;_195.x);
+            _191.y = (_192.y&amp;&amp;_195.y);
+            _191.z = (_192.z&amp;&amp;_195.z);
+            _191.w = (_192.w&amp;&amp;_195.w);
+          _184.x = (_185.x||_191.x);
+          _184.y = (_185.y||_191.y);
+          _184.z = (_185.z||_191.z);
+          _184.w = (_185.w||_191.w);
+        int4 _197;
+          int4 _198 = make_int4(3, 3, 3, 3);
+          _197.x = (_180.x+_198.x);
+          _197.y = (_180.y+_198.y);
+          _197.z = (_180.z+_198.z);
+          _197.w = (_180.w+_198.w);
+        _183.x = (bool(_184.x)?_180.x:_197.x);
+        _183.y = (bool(_184.y)?_180.y:_197.y);
+        _183.z = (bool(_184.z)?_180.z:_197.z);
+        _183.w = (bool(_184.w)?_180.w:_197.w);
+        _133.x = (_134.x+_183.x);
+        _133.y = (_134.y+_183.y);
+        _133.z = (_134.z+_183.z);
+        _133.w = (_134.w+_183.w);
+      *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 256)) = make_float4(kernel[_133.x],kernel[_133.y],kernel[_133.z],kernel[_133.w]);
+      int4 _199;
+        int4 _200;
+          int4 _201;
+            int4 _202 = make_int4((((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 36864), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 36864), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 36864), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 36864));
+            int4 _203;
+              int4 _204;
+                int4 _205;
+                  int4 _206 = make_int4((((((int)threadIdx.x) * 4) + 384))+(1*0), (((((int)threadIdx.x) * 4) + 384))+(1*1), (((((int)threadIdx.x) * 4) + 384))+(1*2), (((((int)threadIdx.x) * 4) + 384))+(1*3));
+                  int4 _207 = make_int4(3, 3, 3, 3);
+                  _205.x = (_206.x%_207.x);
+                  _205.y = (_206.y%_207.y);
+                  _205.z = (_206.z%_207.z);
+                  _205.w = (_206.w%_207.w);
+                int4 _208;
+                  int4 _209 = make_int4((((((int)threadIdx.x) * 4) + 384))+(1*0), (((((int)threadIdx.x) * 4) + 384))+(1*1), (((((int)threadIdx.x) * 4) + 384))+(1*2), (((((int)threadIdx.x) * 4) + 384))+(1*3));
+                  int4 _210 = make_int4(3, 3, 3, 3);
+                  _208.x = (_209.x/_210.x);
+                  _208.y = (_209.y/_210.y);
+                  _208.z = (_209.z/_210.z);
+                  _208.w = (_209.w/_210.w);
+                int4 _211;
+                ushort4 _212;
+                  ushort4 _213;
+                    ushort4 _214;
+                      int4 _215 = make_int4(3, 3, 3, 3);
+                      int4 _216 = make_int4(0, 0, 0, 0);
+                      _214.x = (_215.x&gt;=_216.x);
+                      _214.y = (_215.y&gt;=_216.y);
+                      _214.z = (_215.z&gt;=_216.z);
+                      _214.w = (_215.w&gt;=_216.w);
+                    ushort4 _217;
+                      int4 _218 = make_int4(0, 0, 0, 0);
+                      _217.x = (_205.x&gt;=_218.x);
+                      _217.y = (_205.y&gt;=_218.y);
+                      _217.z = (_205.z&gt;=_218.z);
+                      _217.w = (_205.w&gt;=_218.w);
+                    _213.x = (_214.x&amp;&amp;_217.x);
+                    _213.y = (_214.y&amp;&amp;_217.y);
+                    _213.z = (_214.z&amp;&amp;_217.z);
+                    _213.w = (_214.w&amp;&amp;_217.w);
+                  ushort4 _219;
+                    ushort4 _220;
+                      int4 _221 = make_int4(3, 3, 3, 3);
+                      int4 _222 = make_int4(0, 0, 0, 0);
+                      _220.x = (_221.x&lt;_222.x);
+                      _220.y = (_221.y&lt;_222.y);
+                      _220.z = (_221.z&lt;_222.z);
+                      _220.w = (_221.w&lt;_222.w);
+                    ushort4 _223;
+                      int4 _224 = make_int4(0, 0, 0, 0);
+                      _223.x = (_205.x&lt;=_224.x);
+                      _223.y = (_205.y&lt;=_224.y);
+                      _223.z = (_205.z&lt;=_224.z);
+                      _223.w = (_205.w&lt;=_224.w);
+                    _219.x = (_220.x&amp;&amp;_223.x);
+                    _219.y = (_220.y&amp;&amp;_223.y);
+                    _219.z = (_220.z&amp;&amp;_223.z);
+                    _219.w = (_220.w&amp;&amp;_223.w);
+                  _212.x = (_213.x||_219.x);
+                  _212.y = (_213.y||_219.y);
+                  _212.z = (_213.z||_219.z);
+                  _212.w = (_213.w||_219.w);
+                int4 _225;
+                  int4 _226 = make_int4(1, 1, 1, 1);
+                  _225.x = (_208.x-_226.x);
+                  _225.y = (_208.y-_226.y);
+                  _225.z = (_208.z-_226.z);
+                  _225.w = (_208.w-_226.w);
+                _211.x = (bool(_212.x)?_208.x:_225.x);
+                _211.y = (bool(_212.y)?_208.y:_225.y);
+                _211.z = (bool(_212.z)?_208.z:_225.z);
+                _211.w = (bool(_212.w)?_208.w:_225.w);
+                int4 _227 = make_int4(16, 16, 16, 16);
+                _204.x = (_211.x%_227.x);
+                _204.y = (_211.y%_227.y);
+                _204.z = (_211.z%_227.z);
+                _204.w = (_211.w%_227.w);
+              int4 _228;
+              ushort4 _229;
+                ushort4 _230;
+                  ushort4 _231;
+                    int4 _232 = make_int4(16, 16, 16, 16);
+                    int4 _233 = make_int4(0, 0, 0, 0);
+                    _231.x = (_232.x&gt;=_233.x);
+                    _231.y = (_232.y&gt;=_233.y);
+                    _231.z = (_232.z&gt;=_233.z);
+                    _231.w = (_232.w&gt;=_233.w);
+                  ushort4 _234;
+                    int4 _235 = make_int4(0, 0, 0, 0);
+                    _234.x = (_204.x&gt;=_235.x);
+                    _234.y = (_204.y&gt;=_235.y);
+                    _234.z = (_204.z&gt;=_235.z);
+                    _234.w = (_204.w&gt;=_235.w);
+                  _230.x = (_231.x&amp;&amp;_234.x);
+                  _230.y = (_231.y&amp;&amp;_234.y);
+                  _230.z = (_231.z&amp;&amp;_234.z);
+                  _230.w = (_231.w&amp;&amp;_234.w);
+                ushort4 _236;
+                  ushort4 _237;
+                    int4 _238 = make_int4(16, 16, 16, 16);
+                    int4 _239 = make_int4(0, 0, 0, 0);
+                    _237.x = (_238.x&lt;_239.x);
+                    _237.y = (_238.y&lt;_239.y);
+                    _237.z = (_238.z&lt;_239.z);
+                    _237.w = (_238.w&lt;_239.w);
+                  ushort4 _240;
+                    int4 _241 = make_int4(0, 0, 0, 0);
+                    _240.x = (_204.x&lt;=_241.x);
+                    _240.y = (_204.y&lt;=_241.y);
+                    _240.z = (_204.z&lt;=_241.z);
+                    _240.w = (_204.w&lt;=_241.w);
+                  _236.x = (_237.x&amp;&amp;_240.x);
+                  _236.y = (_237.y&amp;&amp;_240.y);
+                  _236.z = (_237.z&amp;&amp;_240.z);
+                  _236.w = (_237.w&amp;&amp;_240.w);
+                _229.x = (_230.x||_236.x);
+                _229.y = (_230.y||_236.y);
+                _229.z = (_230.z||_236.z);
+                _229.w = (_230.w||_236.w);
+              int4 _242;
+                int4 _243 = make_int4(16, 16, 16, 16);
+                _242.x = (_204.x+_243.x);
+                _242.y = (_204.y+_243.y);
+                _242.z = (_204.z+_243.z);
+                _242.w = (_204.w+_243.w);
+              _228.x = (bool(_229.x)?_204.x:_242.x);
+              _228.y = (bool(_229.y)?_204.y:_242.y);
+              _228.z = (bool(_229.z)?_204.z:_242.z);
+              _228.w = (bool(_229.w)?_204.w:_242.w);
+              int4 _244 = make_int4(9, 9, 9, 9);
+              _203.x = (_228.x*_244.x);
+              _203.y = (_228.y*_244.y);
+              _203.z = (_228.z*_244.z);
+              _203.w = (_228.w*_244.w);
+            _201.x = (_202.x+_203.x);
+            _201.y = (_202.y+_203.y);
+            _201.z = (_202.z+_203.z);
+            _201.w = (_202.w+_203.w);
+          int4 _245 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+          _200.x = (_201.x+_245.x);
+          _200.y = (_201.y+_245.y);
+          _200.z = (_201.z+_245.z);
+          _200.w = (_201.w+_245.w);
+        int4 _246;
+          int4 _247 = make_int4(((((int)threadIdx.x) + 96))+(1*0), ((((int)threadIdx.x) + 96))+(1*1), ((((int)threadIdx.x) + 96))+(1*2), ((((int)threadIdx.x) + 96))+(1*3));
+          int4 _248 = make_int4(3, 3, 3, 3);
+          _246.x = (_247.x%_248.x);
+          _246.y = (_247.y%_248.y);
+          _246.z = (_247.z%_248.z);
+          _246.w = (_247.w%_248.w);
+        int4 _249;
+        ushort4 _250;
+          ushort4 _251;
+            ushort4 _252;
+              int4 _253 = make_int4(3, 3, 3, 3);
+              int4 _254 = make_int4(0, 0, 0, 0);
+              _252.x = (_253.x&gt;=_254.x);
+              _252.y = (_253.y&gt;=_254.y);
+              _252.z = (_253.z&gt;=_254.z);
+              _252.w = (_253.w&gt;=_254.w);
+            ushort4 _255;
+              int4 _256 = make_int4(0, 0, 0, 0);
+              _255.x = (_246.x&gt;=_256.x);
+              _255.y = (_246.y&gt;=_256.y);
+              _255.z = (_246.z&gt;=_256.z);
+              _255.w = (_246.w&gt;=_256.w);
+            _251.x = (_252.x&amp;&amp;_255.x);
+            _251.y = (_252.y&amp;&amp;_255.y);
+            _251.z = (_252.z&amp;&amp;_255.z);
+            _251.w = (_252.w&amp;&amp;_255.w);
+          ushort4 _257;
+            ushort4 _258;
+              int4 _259 = make_int4(3, 3, 3, 3);
+              int4 _260 = make_int4(0, 0, 0, 0);
+              _258.x = (_259.x&lt;_260.x);
+              _258.y = (_259.y&lt;_260.y);
+              _258.z = (_259.z&lt;_260.z);
+              _258.w = (_259.w&lt;_260.w);
+            ushort4 _261;
+              int4 _262 = make_int4(0, 0, 0, 0);
+              _261.x = (_246.x&lt;=_262.x);
+              _261.y = (_246.y&lt;=_262.y);
+              _261.z = (_246.z&lt;=_262.z);
+              _261.w = (_246.w&lt;=_262.w);
+            _257.x = (_258.x&amp;&amp;_261.x);
+            _257.y = (_258.y&amp;&amp;_261.y);
+            _257.z = (_258.z&amp;&amp;_261.z);
+            _257.w = (_258.w&amp;&amp;_261.w);
+          _250.x = (_251.x||_257.x);
+          _250.y = (_251.y||_257.y);
+          _250.z = (_251.z||_257.z);
+          _250.w = (_251.w||_257.w);
+        int4 _263;
+          int4 _264 = make_int4(3, 3, 3, 3);
+          _263.x = (_246.x+_264.x);
+          _263.y = (_246.y+_264.y);
+          _263.z = (_246.z+_264.z);
+          _263.w = (_246.w+_264.w);
+        _249.x = (bool(_250.x)?_246.x:_263.x);
+        _249.y = (bool(_250.y)?_246.y:_263.y);
+        _249.z = (bool(_250.z)?_246.z:_263.z);
+        _249.w = (bool(_250.w)?_246.w:_263.w);
+        _199.x = (_200.x+_249.x);
+        _199.y = (_200.y+_249.y);
+        _199.z = (_200.z+_249.z);
+        _199.w = (_200.w+_249.w);
+      *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 384)) = make_float4(kernel[_199.x],kernel[_199.y],kernel[_199.z],kernel[_199.w]);
+      int4 _265;
+        int4 _266;
+          int4 _267;
+            int4 _268 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 512) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 512) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 512) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 512) / 48) * 4608)) + (rc_outer_outer * 144)));
+            int4 _269;
+              int4 _270;
+                int4 _271;
+                  int4 _272 = make_int4((((((int)threadIdx.x) * 4) + 512))+(1*0), (((((int)threadIdx.x) * 4) + 512))+(1*1), (((((int)threadIdx.x) * 4) + 512))+(1*2), (((((int)threadIdx.x) * 4) + 512))+(1*3));
+                  int4 _273 = make_int4(3, 3, 3, 3);
+                  _271.x = (_272.x%_273.x);
+                  _271.y = (_272.y%_273.y);
+                  _271.z = (_272.z%_273.z);
+                  _271.w = (_272.w%_273.w);
+                int4 _274;
+                  int4 _275 = make_int4((((((int)threadIdx.x) * 4) + 512))+(1*0), (((((int)threadIdx.x) * 4) + 512))+(1*1), (((((int)threadIdx.x) * 4) + 512))+(1*2), (((((int)threadIdx.x) * 4) + 512))+(1*3));
+                  int4 _276 = make_int4(3, 3, 3, 3);
+                  _274.x = (_275.x/_276.x);
+                  _274.y = (_275.y/_276.y);
+                  _274.z = (_275.z/_276.z);
+                  _274.w = (_275.w/_276.w);
+                int4 _277;
+                ushort4 _278;
+                  ushort4 _279;
+                    ushort4 _280;
+                      int4 _281 = make_int4(3, 3, 3, 3);
+                      int4 _282 = make_int4(0, 0, 0, 0);
+                      _280.x = (_281.x&gt;=_282.x);
+                      _280.y = (_281.y&gt;=_282.y);
+                      _280.z = (_281.z&gt;=_282.z);
+                      _280.w = (_281.w&gt;=_282.w);
+                    ushort4 _283;
+                      int4 _284 = make_int4(0, 0, 0, 0);
+                      _283.x = (_271.x&gt;=_284.x);
+                      _283.y = (_271.y&gt;=_284.y);
+                      _283.z = (_271.z&gt;=_284.z);
+                      _283.w = (_271.w&gt;=_284.w);
+                    _279.x = (_280.x&amp;&amp;_283.x);
+                    _279.y = (_280.y&amp;&amp;_283.y);
+                    _279.z = (_280.z&amp;&amp;_283.z);
+                    _279.w = (_280.w&amp;&amp;_283.w);
+                  ushort4 _285;
+                    ushort4 _286;
+                      int4 _287 = make_int4(3, 3, 3, 3);
+                      int4 _288 = make_int4(0, 0, 0, 0);
+                      _286.x = (_287.x&lt;_288.x);
+                      _286.y = (_287.y&lt;_288.y);
+                      _286.z = (_287.z&lt;_288.z);
+                      _286.w = (_287.w&lt;_288.w);
+                    ushort4 _289;
+                      int4 _290 = make_int4(0, 0, 0, 0);
+                      _289.x = (_271.x&lt;=_290.x);
+                      _289.y = (_271.y&lt;=_290.y);
+                      _289.z = (_271.z&lt;=_290.z);
+                      _289.w = (_271.w&lt;=_290.w);
+                    _285.x = (_286.x&amp;&amp;_289.x);
+                    _285.y = (_286.y&amp;&amp;_289.y);
+                    _285.z = (_286.z&amp;&amp;_289.z);
+                    _285.w = (_286.w&amp;&amp;_289.w);
+                  _278.x = (_279.x||_285.x);
+                  _278.y = (_279.y||_285.y);
+                  _278.z = (_279.z||_285.z);
+                  _278.w = (_279.w||_285.w);
+                int4 _291;
+                  int4 _292 = make_int4(1, 1, 1, 1);
+                  _291.x = (_274.x-_292.x);
+                  _291.y = (_274.y-_292.y);
+                  _291.z = (_274.z-_292.z);
+                  _291.w = (_274.w-_292.w);
+                _277.x = (bool(_278.x)?_274.x:_291.x);
+                _277.y = (bool(_278.y)?_274.y:_291.y);
+                _277.z = (bool(_278.z)?_274.z:_291.z);
+                _277.w = (bool(_278.w)?_274.w:_291.w);
+                int4 _293 = make_int4(16, 16, 16, 16);
+                _270.x = (_277.x%_293.x);
+                _270.y = (_277.y%_293.y);
+                _270.z = (_277.z%_293.z);
+                _270.w = (_277.w%_293.w);
+              int4 _294;
+              ushort4 _295;
+                ushort4 _296;
+                  ushort4 _297;
+                    int4 _298 = make_int4(16, 16, 16, 16);
+                    int4 _299 = make_int4(0, 0, 0, 0);
+                    _297.x = (_298.x&gt;=_299.x);
+                    _297.y = (_298.y&gt;=_299.y);
+                    _297.z = (_298.z&gt;=_299.z);
+                    _297.w = (_298.w&gt;=_299.w);
+                  ushort4 _300;
+                    int4 _301 = make_int4(0, 0, 0, 0);
+                    _300.x = (_270.x&gt;=_301.x);
+                    _300.y = (_270.y&gt;=_301.y);
+                    _300.z = (_270.z&gt;=_301.z);
+                    _300.w = (_270.w&gt;=_301.w);
+                  _296.x = (_297.x&amp;&amp;_300.x);
+                  _296.y = (_297.y&amp;&amp;_300.y);
+                  _296.z = (_297.z&amp;&amp;_300.z);
+                  _296.w = (_297.w&amp;&amp;_300.w);
+                ushort4 _302;
+                  ushort4 _303;
+                    int4 _304 = make_int4(16, 16, 16, 16);
+                    int4 _305 = make_int4(0, 0, 0, 0);
+                    _303.x = (_304.x&lt;_305.x);
+                    _303.y = (_304.y&lt;_305.y);
+                    _303.z = (_304.z&lt;_305.z);
+                    _303.w = (_304.w&lt;_305.w);
+                  ushort4 _306;
+                    int4 _307 = make_int4(0, 0, 0, 0);
+                    _306.x = (_270.x&lt;=_307.x);
+                    _306.y = (_270.y&lt;=_307.y);
+                    _306.z = (_270.z&lt;=_307.z);
+                    _306.w = (_270.w&lt;=_307.w);
+                  _302.x = (_303.x&amp;&amp;_306.x);
+                  _302.y = (_303.y&amp;&amp;_306.y);
+                  _302.z = (_303.z&amp;&amp;_306.z);
+                  _302.w = (_303.w&amp;&amp;_306.w);
+                _295.x = (_296.x||_302.x);
+                _295.y = (_296.y||_302.y);
+                _295.z = (_296.z||_302.z);
+                _295.w = (_296.w||_302.w);
+              int4 _308;
+                int4 _309 = make_int4(16, 16, 16, 16);
+                _308.x = (_270.x+_309.x);
+                _308.y = (_270.y+_309.y);
+                _308.z = (_270.z+_309.z);
+                _308.w = (_270.w+_309.w);
+              _294.x = (bool(_295.x)?_270.x:_308.x);
+              _294.y = (bool(_295.y)?_270.y:_308.y);
+              _294.z = (bool(_295.z)?_270.z:_308.z);
+              _294.w = (bool(_295.w)?_270.w:_308.w);
+              int4 _310 = make_int4(9, 9, 9, 9);
+              _269.x = (_294.x*_310.x);
+              _269.y = (_294.y*_310.y);
+              _269.z = (_294.z*_310.z);
+              _269.w = (_294.w*_310.w);
+            _267.x = (_268.x+_269.x);
+            _267.y = (_268.y+_269.y);
+            _267.z = (_268.z+_269.z);
+            _267.w = (_268.w+_269.w);
+          int4 _311 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+          _266.x = (_267.x+_311.x);
+          _266.y = (_267.y+_311.y);
+          _266.z = (_267.z+_311.z);
+          _266.w = (_267.w+_311.w);
+        int4 _312;
+          int4 _313 = make_int4(((((int)threadIdx.x) + 128))+(1*0), ((((int)threadIdx.x) + 128))+(1*1), ((((int)threadIdx.x) + 128))+(1*2), ((((int)threadIdx.x) + 128))+(1*3));
+          int4 _314 = make_int4(3, 3, 3, 3);
+          _312.x = (_313.x%_314.x);
+          _312.y = (_313.y%_314.y);
+          _312.z = (_313.z%_314.z);
+          _312.w = (_313.w%_314.w);
+        int4 _315;
+        ushort4 _316;
+          ushort4 _317;
+            ushort4 _318;
+              int4 _319 = make_int4(3, 3, 3, 3);
+              int4 _320 = make_int4(0, 0, 0, 0);
+              _318.x = (_319.x&gt;=_320.x);
+              _318.y = (_319.y&gt;=_320.y);
+              _318.z = (_319.z&gt;=_320.z);
+              _318.w = (_319.w&gt;=_320.w);
+            ushort4 _321;
+              int4 _322 = make_int4(0, 0, 0, 0);
+              _321.x = (_312.x&gt;=_322.x);
+              _321.y = (_312.y&gt;=_322.y);
+              _321.z = (_312.z&gt;=_322.z);
+              _321.w = (_312.w&gt;=_322.w);
+            _317.x = (_318.x&amp;&amp;_321.x);
+            _317.y = (_318.y&amp;&amp;_321.y);
+            _317.z = (_318.z&amp;&amp;_321.z);
+            _317.w = (_318.w&amp;&amp;_321.w);
+          ushort4 _323;
+            ushort4 _324;
+              int4 _325 = make_int4(3, 3, 3, 3);
+              int4 _326 = make_int4(0, 0, 0, 0);
+              _324.x = (_325.x&lt;_326.x);
+              _324.y = (_325.y&lt;_326.y);
+              _324.z = (_325.z&lt;_326.z);
+              _324.w = (_325.w&lt;_326.w);
+            ushort4 _327;
+              int4 _328 = make_int4(0, 0, 0, 0);
+              _327.x = (_312.x&lt;=_328.x);
+              _327.y = (_312.y&lt;=_328.y);
+              _327.z = (_312.z&lt;=_328.z);
+              _327.w = (_312.w&lt;=_328.w);
+            _323.x = (_324.x&amp;&amp;_327.x);
+            _323.y = (_324.y&amp;&amp;_327.y);
+            _323.z = (_324.z&amp;&amp;_327.z);
+            _323.w = (_324.w&amp;&amp;_327.w);
+          _316.x = (_317.x||_323.x);
+          _316.y = (_317.y||_323.y);
+          _316.z = (_317.z||_323.z);
+          _316.w = (_317.w||_323.w);
+        int4 _329;
+          int4 _330 = make_int4(3, 3, 3, 3);
+          _329.x = (_312.x+_330.x);
+          _329.y = (_312.y+_330.y);
+          _329.z = (_312.z+_330.z);
+          _329.w = (_312.w+_330.w);
+        _315.x = (bool(_316.x)?_312.x:_329.x);
+        _315.y = (bool(_316.y)?_312.y:_329.y);
+        _315.z = (bool(_316.z)?_312.z:_329.z);
+        _315.w = (bool(_316.w)?_312.w:_329.w);
+        _265.x = (_266.x+_315.x);
+        _265.y = (_266.y+_315.y);
+        _265.z = (_266.z+_315.z);
+        _265.w = (_266.w+_315.w);
+      *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 512)) = make_float4(kernel[_265.x],kernel[_265.y],kernel[_265.z],kernel[_265.w]);
+      int4 _331;
+        int4 _332;
+          int4 _333;
+            int4 _334 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 640) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 640) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 640) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 640) / 48) * 4608)) + (rc_outer_outer * 144)));
+            int4 _335;
+              int4 _336;
+                int4 _337;
+                  int4 _338 = make_int4((((((int)threadIdx.x) * 4) + 640))+(1*0), (((((int)threadIdx.x) * 4) + 640))+(1*1), (((((int)threadIdx.x) * 4) + 640))+(1*2), (((((int)threadIdx.x) * 4) + 640))+(1*3));
+                  int4 _339 = make_int4(3, 3, 3, 3);
+                  _337.x = (_338.x%_339.x);
+                  _337.y = (_338.y%_339.y);
+                  _337.z = (_338.z%_339.z);
+                  _337.w = (_338.w%_339.w);
+                int4 _340;
+                  int4 _341 = make_int4((((((int)threadIdx.x) * 4) + 640))+(1*0), (((((int)threadIdx.x) * 4) + 640))+(1*1), (((((int)threadIdx.x) * 4) + 640))+(1*2), (((((int)threadIdx.x) * 4) + 640))+(1*3));
+                  int4 _342 = make_int4(3, 3, 3, 3);
+                  _340.x = (_341.x/_342.x);
+                  _340.y = (_341.y/_342.y);
+                  _340.z = (_341.z/_342.z);
+                  _340.w = (_341.w/_342.w);
+                int4 _343;
+                ushort4 _344;
+                  ushort4 _345;
+                    ushort4 _346;
+                      int4 _347 = make_int4(3, 3, 3, 3);
+                      int4 _348 = make_int4(0, 0, 0, 0);
+                      _346.x = (_347.x&gt;=_348.x);
+                      _346.y = (_347.y&gt;=_348.y);
+                      _346.z = (_347.z&gt;=_348.z);
+                      _346.w = (_347.w&gt;=_348.w);
+                    ushort4 _349;
+                      int4 _350 = make_int4(0, 0, 0, 0);
+                      _349.x = (_337.x&gt;=_350.x);
+                      _349.y = (_337.y&gt;=_350.y);
+                      _349.z = (_337.z&gt;=_350.z);
+                      _349.w = (_337.w&gt;=_350.w);
+                    _345.x = (_346.x&amp;&amp;_349.x);
+                    _345.y = (_346.y&amp;&amp;_349.y);
+                    _345.z = (_346.z&amp;&amp;_349.z);
+                    _345.w = (_346.w&amp;&amp;_349.w);
+                  ushort4 _351;
+                    ushort4 _352;
+                      int4 _353 = make_int4(3, 3, 3, 3);
+                      int4 _354 = make_int4(0, 0, 0, 0);
+                      _352.x = (_353.x&lt;_354.x);
+                      _352.y = (_353.y&lt;_354.y);
+                      _352.z = (_353.z&lt;_354.z);
+                      _352.w = (_353.w&lt;_354.w);
+                    ushort4 _355;
+                      int4 _356 = make_int4(0, 0, 0, 0);
+                      _355.x = (_337.x&lt;=_356.x);
+                      _355.y = (_337.y&lt;=_356.y);
+                      _355.z = (_337.z&lt;=_356.z);
+                      _355.w = (_337.w&lt;=_356.w);
+                    _351.x = (_352.x&amp;&amp;_355.x);
+                    _351.y = (_352.y&amp;&amp;_355.y);
+                    _351.z = (_352.z&amp;&amp;_355.z);
+                    _351.w = (_352.w&amp;&amp;_355.w);
+                  _344.x = (_345.x||_351.x);
+                  _344.y = (_345.y||_351.y);
+                  _344.z = (_345.z||_351.z);
+                  _344.w = (_345.w||_351.w);
+                int4 _357;
+                  int4 _358 = make_int4(1, 1, 1, 1);
+                  _357.x = (_340.x-_358.x);
+                  _357.y = (_340.y-_358.y);
+                  _357.z = (_340.z-_358.z);
+                  _357.w = (_340.w-_358.w);
+                _343.x = (bool(_344.x)?_340.x:_357.x);
+                _343.y = (bool(_344.y)?_340.y:_357.y);
+                _343.z = (bool(_344.z)?_340.z:_357.z);
+                _343.w = (bool(_344.w)?_340.w:_357.w);
+                int4 _359 = make_int4(16, 16, 16, 16);
+                _336.x = (_343.x%_359.x);
+                _336.y = (_343.y%_359.y);
+                _336.z = (_343.z%_359.z);
+                _336.w = (_343.w%_359.w);
+              int4 _360;
+              ushort4 _361;
+                ushort4 _362;
+                  ushort4 _363;
+                    int4 _364 = make_int4(16, 16, 16, 16);
+                    int4 _365 = make_int4(0, 0, 0, 0);
+                    _363.x = (_364.x&gt;=_365.x);
+                    _363.y = (_364.y&gt;=_365.y);
+                    _363.z = (_364.z&gt;=_365.z);
+                    _363.w = (_364.w&gt;=_365.w);
+                  ushort4 _366;
+                    int4 _367 = make_int4(0, 0, 0, 0);
+                    _366.x = (_336.x&gt;=_367.x);
+                    _366.y = (_336.y&gt;=_367.y);
+                    _366.z = (_336.z&gt;=_367.z);
+                    _366.w = (_336.w&gt;=_367.w);
+                  _362.x = (_363.x&amp;&amp;_366.x);
+                  _362.y = (_363.y&amp;&amp;_366.y);
+                  _362.z = (_363.z&amp;&amp;_366.z);
+                  _362.w = (_363.w&amp;&amp;_366.w);
+                ushort4 _368;
+                  ushort4 _369;
+                    int4 _370 = make_int4(16, 16, 16, 16);
+                    int4 _371 = make_int4(0, 0, 0, 0);
+                    _369.x = (_370.x&lt;_371.x);
+                    _369.y = (_370.y&lt;_371.y);
+                    _369.z = (_370.z&lt;_371.z);
+                    _369.w = (_370.w&lt;_371.w);
+                  ushort4 _372;
+                    int4 _373 = make_int4(0, 0, 0, 0);
+                    _372.x = (_336.x&lt;=_373.x);
+                    _372.y = (_336.y&lt;=_373.y);
+                    _372.z = (_336.z&lt;=_373.z);
+                    _372.w = (_336.w&lt;=_373.w);
+                  _368.x = (_369.x&amp;&amp;_372.x);
+                  _368.y = (_369.y&amp;&amp;_372.y);
+                  _368.z = (_369.z&amp;&amp;_372.z);
+                  _368.w = (_369.w&amp;&amp;_372.w);
+                _361.x = (_362.x||_368.x);
+                _361.y = (_362.y||_368.y);
+                _361.z = (_362.z||_368.z);
+                _361.w = (_362.w||_368.w);
+              int4 _374;
+                int4 _375 = make_int4(16, 16, 16, 16);
+                _374.x = (_336.x+_375.x);
+                _374.y = (_336.y+_375.y);
+                _374.z = (_336.z+_375.z);
+                _374.w = (_336.w+_375.w);
+              _360.x = (bool(_361.x)?_336.x:_374.x);
+              _360.y = (bool(_361.y)?_336.y:_374.y);
+              _360.z = (bool(_361.z)?_336.z:_374.z);
+              _360.w = (bool(_361.w)?_336.w:_374.w);
+              int4 _376 = make_int4(9, 9, 9, 9);
+              _335.x = (_360.x*_376.x);
+              _335.y = (_360.y*_376.y);
+              _335.z = (_360.z*_376.z);
+              _335.w = (_360.w*_376.w);
+            _333.x = (_334.x+_335.x);
+            _333.y = (_334.y+_335.y);
+            _333.z = (_334.z+_335.z);
+            _333.w = (_334.w+_335.w);
+          int4 _377 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+          _332.x = (_333.x+_377.x);
+          _332.y = (_333.y+_377.y);
+          _332.z = (_333.z+_377.z);
+          _332.w = (_333.w+_377.w);
+        int4 _378;
+          int4 _379 = make_int4(((((int)threadIdx.x) + 160))+(1*0), ((((int)threadIdx.x) + 160))+(1*1), ((((int)threadIdx.x) + 160))+(1*2), ((((int)threadIdx.x) + 160))+(1*3));
+          int4 _380 = make_int4(3, 3, 3, 3);
+          _378.x = (_379.x%_380.x);
+          _378.y = (_379.y%_380.y);
+          _378.z = (_379.z%_380.z);
+          _378.w = (_379.w%_380.w);
+        int4 _381;
+        ushort4 _382;
+          ushort4 _383;
+            ushort4 _384;
+              int4 _385 = make_int4(3, 3, 3, 3);
+              int4 _386 = make_int4(0, 0, 0, 0);
+              _384.x = (_385.x&gt;=_386.x);
+              _384.y = (_385.y&gt;=_386.y);
+              _384.z = (_385.z&gt;=_386.z);
+              _384.w = (_385.w&gt;=_386.w);
+            ushort4 _387;
+              int4 _388 = make_int4(0, 0, 0, 0);
+              _387.x = (_378.x&gt;=_388.x);
+              _387.y = (_378.y&gt;=_388.y);
+              _387.z = (_378.z&gt;=_388.z);
+              _387.w = (_378.w&gt;=_388.w);
+            _383.x = (_384.x&amp;&amp;_387.x);
+            _383.y = (_384.y&amp;&amp;_387.y);
+            _383.z = (_384.z&amp;&amp;_387.z);
+            _383.w = (_384.w&amp;&amp;_387.w);
+          ushort4 _389;
+            ushort4 _390;
+              int4 _391 = make_int4(3, 3, 3, 3);
+              int4 _392 = make_int4(0, 0, 0, 0);
+              _390.x = (_391.x&lt;_392.x);
+              _390.y = (_391.y&lt;_392.y);
+              _390.z = (_391.z&lt;_392.z);
+              _390.w = (_391.w&lt;_392.w);
+            ushort4 _393;
+              int4 _394 = make_int4(0, 0, 0, 0);
+              _393.x = (_378.x&lt;=_394.x);
+              _393.y = (_378.y&lt;=_394.y);
+              _393.z = (_378.z&lt;=_394.z);
+              _393.w = (_378.w&lt;=_394.w);
+            _389.x = (_390.x&amp;&amp;_393.x);
+            _389.y = (_390.y&amp;&amp;_393.y);
+            _389.z = (_390.z&amp;&amp;_393.z);
+            _389.w = (_390.w&amp;&amp;_393.w);
+          _382.x = (_383.x||_389.x);
+          _382.y = (_383.y||_389.y);
+          _382.z = (_383.z||_389.z);
+          _382.w = (_383.w||_389.w);
+        int4 _395;
+          int4 _396 = make_int4(3, 3, 3, 3);
+          _395.x = (_378.x+_396.x);
+          _395.y = (_378.y+_396.y);
+          _395.z = (_378.z+_396.z);
+          _395.w = (_378.w+_396.w);
+        _381.x = (bool(_382.x)?_378.x:_395.x);
+        _381.y = (bool(_382.y)?_378.y:_395.y);
+        _381.z = (bool(_382.z)?_378.z:_395.z);
+        _381.w = (bool(_382.w)?_378.w:_395.w);
+        _331.x = (_332.x+_381.x);
+        _331.y = (_332.y+_381.y);
+        _331.z = (_332.z+_381.z);
+        _331.w = (_332.w+_381.w);
+      *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 640)) = make_float4(kernel[_331.x],kernel[_331.y],kernel[_331.z],kernel[_331.w]);
+      int4 _397;
+        int4 _398;
+          int4 _399;
+            int4 _400 = make_int4((((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 73728), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 73728), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 73728), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 73728));
+            int4 _401;
+              int4 _402;
+                int4 _403;
+                  int4 _404 = make_int4((((((int)threadIdx.x) * 4) + 768))+(1*0), (((((int)threadIdx.x) * 4) + 768))+(1*1), (((((int)threadIdx.x) * 4) + 768))+(1*2), (((((int)threadIdx.x) * 4) + 768))+(1*3));
+                  int4 _405 = make_int4(3, 3, 3, 3);
+                  _403.x = (_404.x%_405.x);
+                  _403.y = (_404.y%_405.y);
+                  _403.z = (_404.z%_405.z);
+                  _403.w = (_404.w%_405.w);
+                int4 _406;
+                  int4 _407 = make_int4((((((int)threadIdx.x) * 4) + 768))+(1*0), (((((int)threadIdx.x) * 4) + 768))+(1*1), (((((int)threadIdx.x) * 4) + 768))+(1*2), (((((int)threadIdx.x) * 4) + 768))+(1*3));
+                  int4 _408 = make_int4(3, 3, 3, 3);
+                  _406.x = (_407.x/_408.x);
+                  _406.y = (_407.y/_408.y);
+                  _406.z = (_407.z/_408.z);
+                  _406.w = (_407.w/_408.w);
+                int4 _409;
+                ushort4 _410;
+                  ushort4 _411;
+                    ushort4 _412;
+                      int4 _413 = make_int4(3, 3, 3, 3);
+                      int4 _414 = make_int4(0, 0, 0, 0);
+                      _412.x = (_413.x&gt;=_414.x);
+                      _412.y = (_413.y&gt;=_414.y);
+                      _412.z = (_413.z&gt;=_414.z);
+                      _412.w = (_413.w&gt;=_414.w);
+                    ushort4 _415;
+                      int4 _416 = make_int4(0, 0, 0, 0);
+                      _415.x = (_403.x&gt;=_416.x);
+                      _415.y = (_403.y&gt;=_416.y);
+                      _415.z = (_403.z&gt;=_416.z);
+                      _415.w = (_403.w&gt;=_416.w);
+                    _411.x = (_412.x&amp;&amp;_415.x);
+                    _411.y = (_412.y&amp;&amp;_415.y);
+                    _411.z = (_412.z&amp;&amp;_415.z);
+                    _411.w = (_412.w&amp;&amp;_415.w);
+                  ushort4 _417;
+                    ushort4 _418;
+                      int4 _419 = make_int4(3, 3, 3, 3);
+                      int4 _420 = make_int4(0, 0, 0, 0);
+                      _418.x = (_419.x&lt;_420.x);
+                      _418.y = (_419.y&lt;_420.y);
+                      _418.z = (_419.z&lt;_420.z);
+                      _418.w = (_419.w&lt;_420.w);
+                    ushort4 _421;
+                      int4 _422 = make_int4(0, 0, 0, 0);
+                      _421.x = (_403.x&lt;=_422.x);
+                      _421.y = (_403.y&lt;=_422.y);
+                      _421.z = (_403.z&lt;=_422.z);
+                      _421.w = (_403.w&lt;=_422.w);
+                    _417.x = (_418.x&amp;&amp;_421.x);
+                    _417.y = (_418.y&amp;&amp;_421.y);
+                    _417.z = (_418.z&amp;&amp;_421.z);
+                    _417.w = (_418.w&amp;&amp;_421.w);
+                  _410.x = (_411.x||_417.x);
+                  _410.y = (_411.y||_417.y);
+                  _410.z = (_411.z||_417.z);
+                  _410.w = (_411.w||_417.w);
+                int4 _423;
+                  int4 _424 = make_int4(1, 1, 1, 1);
+                  _423.x = (_406.x-_424.x);
+                  _423.y = (_406.y-_424.y);
+                  _423.z = (_406.z-_424.z);
+                  _423.w = (_406.w-_424.w);
+                _409.x = (bool(_410.x)?_406.x:_423.x);
+                _409.y = (bool(_410.y)?_406.y:_423.y);
+                _409.z = (bool(_410.z)?_406.z:_423.z);
+                _409.w = (bool(_410.w)?_406.w:_423.w);
+                int4 _425 = make_int4(16, 16, 16, 16);
+                _402.x = (_409.x%_425.x);
+                _402.y = (_409.y%_425.y);
+                _402.z = (_409.z%_425.z);
+                _402.w = (_409.w%_425.w);
+              int4 _426;
+              ushort4 _427;
+                ushort4 _428;
+                  ushort4 _429;
+                    int4 _430 = make_int4(16, 16, 16, 16);
+                    int4 _431 = make_int4(0, 0, 0, 0);
+                    _429.x = (_430.x&gt;=_431.x);
+                    _429.y = (_430.y&gt;=_431.y);
+                    _429.z = (_430.z&gt;=_431.z);
+                    _429.w = (_430.w&gt;=_431.w);
+                  ushort4 _432;
+                    int4 _433 = make_int4(0, 0, 0, 0);
+                    _432.x = (_402.x&gt;=_433.x);
+                    _432.y = (_402.y&gt;=_433.y);
+                    _432.z = (_402.z&gt;=_433.z);
+                    _432.w = (_402.w&gt;=_433.w);
+                  _428.x = (_429.x&amp;&amp;_432.x);
+                  _428.y = (_429.y&amp;&amp;_432.y);
+                  _428.z = (_429.z&amp;&amp;_432.z);
+                  _428.w = (_429.w&amp;&amp;_432.w);
+                ushort4 _434;
+                  ushort4 _435;
+                    int4 _436 = make_int4(16, 16, 16, 16);
+                    int4 _437 = make_int4(0, 0, 0, 0);
+                    _435.x = (_436.x&lt;_437.x);
+                    _435.y = (_436.y&lt;_437.y);
+                    _435.z = (_436.z&lt;_437.z);
+                    _435.w = (_436.w&lt;_437.w);
+                  ushort4 _438;
+                    int4 _439 = make_int4(0, 0, 0, 0);
+                    _438.x = (_402.x&lt;=_439.x);
+                    _438.y = (_402.y&lt;=_439.y);
+                    _438.z = (_402.z&lt;=_439.z);
+                    _438.w = (_402.w&lt;=_439.w);
+                  _434.x = (_435.x&amp;&amp;_438.x);
+                  _434.y = (_435.y&amp;&amp;_438.y);
+                  _434.z = (_435.z&amp;&amp;_438.z);
+                  _434.w = (_435.w&amp;&amp;_438.w);
+                _427.x = (_428.x||_434.x);
+                _427.y = (_428.y||_434.y);
+                _427.z = (_428.z||_434.z);
+                _427.w = (_428.w||_434.w);
+              int4 _440;
+                int4 _441 = make_int4(16, 16, 16, 16);
+                _440.x = (_402.x+_441.x);
+                _440.y = (_402.y+_441.y);
+                _440.z = (_402.z+_441.z);
+                _440.w = (_402.w+_441.w);
+              _426.x = (bool(_427.x)?_402.x:_440.x);
+              _426.y = (bool(_427.y)?_402.y:_440.y);
+              _426.z = (bool(_427.z)?_402.z:_440.z);
+              _426.w = (bool(_427.w)?_402.w:_440.w);
+              int4 _442 = make_int4(9, 9, 9, 9);
+              _401.x = (_426.x*_442.x);
+              _401.y = (_426.y*_442.y);
+              _401.z = (_426.z*_442.z);
+              _401.w = (_426.w*_442.w);
+            _399.x = (_400.x+_401.x);
+            _399.y = (_400.y+_401.y);
+            _399.z = (_400.z+_401.z);
+            _399.w = (_400.w+_401.w);
+          int4 _443 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+          _398.x = (_399.x+_443.x);
+          _398.y = (_399.y+_443.y);
+          _398.z = (_399.z+_443.z);
+          _398.w = (_399.w+_443.w);
+        int4 _444;
+          int4 _445 = make_int4(((((int)threadIdx.x) + 192))+(1*0), ((((int)threadIdx.x) + 192))+(1*1), ((((int)threadIdx.x) + 192))+(1*2), ((((int)threadIdx.x) + 192))+(1*3));
+          int4 _446 = make_int4(3, 3, 3, 3);
+          _444.x = (_445.x%_446.x);
+          _444.y = (_445.y%_446.y);
+          _444.z = (_445.z%_446.z);
+          _444.w = (_445.w%_446.w);
+        int4 _447;
+        ushort4 _448;
+          ushort4 _449;
+            ushort4 _450;
+              int4 _451 = make_int4(3, 3, 3, 3);
+              int4 _452 = make_int4(0, 0, 0, 0);
+              _450.x = (_451.x&gt;=_452.x);
+              _450.y = (_451.y&gt;=_452.y);
+              _450.z = (_451.z&gt;=_452.z);
+              _450.w = (_451.w&gt;=_452.w);
+            ushort4 _453;
+              int4 _454 = make_int4(0, 0, 0, 0);
+              _453.x = (_444.x&gt;=_454.x);
+              _453.y = (_444.y&gt;=_454.y);
+              _453.z = (_444.z&gt;=_454.z);
+              _453.w = (_444.w&gt;=_454.w);
+            _449.x = (_450.x&amp;&amp;_453.x);
+            _449.y = (_450.y&amp;&amp;_453.y);
+            _449.z = (_450.z&amp;&amp;_453.z);
+            _449.w = (_450.w&amp;&amp;_453.w);
+          ushort4 _455;
+            ushort4 _456;
+              int4 _457 = make_int4(3, 3, 3, 3);
+              int4 _458 = make_int4(0, 0, 0, 0);
+              _456.x = (_457.x&lt;_458.x);
+              _456.y = (_457.y&lt;_458.y);
+              _456.z = (_457.z&lt;_458.z);
+              _456.w = (_457.w&lt;_458.w);
+            ushort4 _459;
+              int4 _460 = make_int4(0, 0, 0, 0);
+              _459.x = (_444.x&lt;=_460.x);
+              _459.y = (_444.y&lt;=_460.y);
+              _459.z = (_444.z&lt;=_460.z);
+              _459.w = (_444.w&lt;=_460.w);
+            _455.x = (_456.x&amp;&amp;_459.x);
+            _455.y = (_456.y&amp;&amp;_459.y);
+            _455.z = (_456.z&amp;&amp;_459.z);
+            _455.w = (_456.w&amp;&amp;_459.w);
+          _448.x = (_449.x||_455.x);
+          _448.y = (_449.y||_455.y);
+          _448.z = (_449.z||_455.z);
+          _448.w = (_449.w||_455.w);
+        int4 _461;
+          int4 _462 = make_int4(3, 3, 3, 3);
+          _461.x = (_444.x+_462.x);
+          _461.y = (_444.y+_462.y);
+          _461.z = (_444.z+_462.z);
+          _461.w = (_444.w+_462.w);
+        _447.x = (bool(_448.x)?_444.x:_461.x);
+        _447.y = (bool(_448.y)?_444.y:_461.y);
+        _447.z = (bool(_448.z)?_444.z:_461.z);
+        _447.w = (bool(_448.w)?_444.w:_461.w);
+        _397.x = (_398.x+_447.x);
+        _397.y = (_398.y+_447.y);
+        _397.z = (_398.z+_447.z);
+        _397.w = (_398.w+_447.w);
+      *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 768)) = make_float4(kernel[_397.x],kernel[_397.y],kernel[_397.z],kernel[_397.w]);
+      int4 _463;
+        int4 _464;
+          int4 _465;
+            int4 _466 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 896) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 896) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 896) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 896) / 48) * 4608)) + (rc_outer_outer * 144)));
+            int4 _467;
+              int4 _468;
+                int4 _469;
+                  int4 _470 = make_int4((((((int)threadIdx.x) * 4) + 896))+(1*0), (((((int)threadIdx.x) * 4) + 896))+(1*1), (((((int)threadIdx.x) * 4) + 896))+(1*2), (((((int)threadIdx.x) * 4) + 896))+(1*3));
+                  int4 _471 = make_int4(3, 3, 3, 3);
+                  _469.x = (_470.x%_471.x);
+                  _469.y = (_470.y%_471.y);
+                  _469.z = (_470.z%_471.z);
+                  _469.w = (_470.w%_471.w);
+                int4 _472;
+                  int4 _473 = make_int4((((((int)threadIdx.x) * 4) + 896))+(1*0), (((((int)threadIdx.x) * 4) + 896))+(1*1), (((((int)threadIdx.x) * 4) + 896))+(1*2), (((((int)threadIdx.x) * 4) + 896))+(1*3));
+                  int4 _474 = make_int4(3, 3, 3, 3);
+                  _472.x = (_473.x/_474.x);
+                  _472.y = (_473.y/_474.y);
+                  _472.z = (_473.z/_474.z);
+                  _472.w = (_473.w/_474.w);
+                int4 _475;
+                ushort4 _476;
+                  ushort4 _477;
+                    ushort4 _478;
+                      int4 _479 = make_int4(3, 3, 3, 3);
+                      int4 _480 = make_int4(0, 0, 0, 0);
+                      _478.x = (_479.x&gt;=_480.x);
+                      _478.y = (_479.y&gt;=_480.y);
+                      _478.z = (_479.z&gt;=_480.z);
+                      _478.w = (_479.w&gt;=_480.w);
+                    ushort4 _481;
+                      int4 _482 = make_int4(0, 0, 0, 0);
+                      _481.x = (_469.x&gt;=_482.x);
+                      _481.y = (_469.y&gt;=_482.y);
+                      _481.z = (_469.z&gt;=_482.z);
+                      _481.w = (_469.w&gt;=_482.w);
+                    _477.x = (_478.x&amp;&amp;_481.x);
+                    _477.y = (_478.y&amp;&amp;_481.y);
+                    _477.z = (_478.z&amp;&amp;_481.z);
+                    _477.w = (_478.w&amp;&amp;_481.w);
+                  ushort4 _483;
+                    ushort4 _484;
+                      int4 _485 = make_int4(3, 3, 3, 3);
+                      int4 _486 = make_int4(0, 0, 0, 0);
+                      _484.x = (_485.x&lt;_486.x);
+                      _484.y = (_485.y&lt;_486.y);
+                      _484.z = (_485.z&lt;_486.z);
+                      _484.w = (_485.w&lt;_486.w);
+                    ushort4 _487;
+                      int4 _488 = make_int4(0, 0, 0, 0);
+                      _487.x = (_469.x&lt;=_488.x);
+                      _487.y = (_469.y&lt;=_488.y);
+                      _487.z = (_469.z&lt;=_488.z);
+                      _487.w = (_469.w&lt;=_488.w);
+                    _483.x = (_484.x&amp;&amp;_487.x);
+                    _483.y = (_484.y&amp;&amp;_487.y);
+                    _483.z = (_484.z&amp;&amp;_487.z);
+                    _483.w = (_484.w&amp;&amp;_487.w);
+                  _476.x = (_477.x||_483.x);
+                  _476.y = (_477.y||_483.y);
+                  _476.z = (_477.z||_483.z);
+                  _476.w = (_477.w||_483.w);
+                int4 _489;
+                  int4 _490 = make_int4(1, 1, 1, 1);
+                  _489.x = (_472.x-_490.x);
+                  _489.y = (_472.y-_490.y);
+                  _489.z = (_472.z-_490.z);
+                  _489.w = (_472.w-_490.w);
+                _475.x = (bool(_476.x)?_472.x:_489.x);
+                _475.y = (bool(_476.y)?_472.y:_489.y);
+                _475.z = (bool(_476.z)?_472.z:_489.z);
+                _475.w = (bool(_476.w)?_472.w:_489.w);
+                int4 _491 = make_int4(16, 16, 16, 16);
+                _468.x = (_475.x%_491.x);
+                _468.y = (_475.y%_491.y);
+                _468.z = (_475.z%_491.z);
+                _468.w = (_475.w%_491.w);
+              int4 _492;
+              ushort4 _493;
+                ushort4 _494;
+                  ushort4 _495;
+                    int4 _496 = make_int4(16, 16, 16, 16);
+                    int4 _497 = make_int4(0, 0, 0, 0);
+                    _495.x = (_496.x&gt;=_497.x);
+                    _495.y = (_496.y&gt;=_497.y);
+                    _495.z = (_496.z&gt;=_497.z);
+                    _495.w = (_496.w&gt;=_497.w);
+                  ushort4 _498;
+                    int4 _499 = make_int4(0, 0, 0, 0);
+                    _498.x = (_468.x&gt;=_499.x);
+                    _498.y = (_468.y&gt;=_499.y);
+                    _498.z = (_468.z&gt;=_499.z);
+                    _498.w = (_468.w&gt;=_499.w);
+                  _494.x = (_495.x&amp;&amp;_498.x);
+                  _494.y = (_495.y&amp;&amp;_498.y);
+                  _494.z = (_495.z&amp;&amp;_498.z);
+                  _494.w = (_495.w&amp;&amp;_498.w);
+                ushort4 _500;
+                  ushort4 _501;
+                    int4 _502 = make_int4(16, 16, 16, 16);
+                    int4 _503 = make_int4(0, 0, 0, 0);
+                    _501.x = (_502.x&lt;_503.x);
+                    _501.y = (_502.y&lt;_503.y);
+                    _501.z = (_502.z&lt;_503.z);
+                    _501.w = (_502.w&lt;_503.w);
+                  ushort4 _504;
+                    int4 _505 = make_int4(0, 0, 0, 0);
+                    _504.x = (_468.x&lt;=_505.x);
+                    _504.y = (_468.y&lt;=_505.y);
+                    _504.z = (_468.z&lt;=_505.z);
+                    _504.w = (_468.w&lt;=_505.w);
+                  _500.x = (_501.x&amp;&amp;_504.x);
+                  _500.y = (_501.y&amp;&amp;_504.y);
+                  _500.z = (_501.z&amp;&amp;_504.z);
+                  _500.w = (_501.w&amp;&amp;_504.w);
+                _493.x = (_494.x||_500.x);
+                _493.y = (_494.y||_500.y);
+                _493.z = (_494.z||_500.z);
+                _493.w = (_494.w||_500.w);
+              int4 _506;
+                int4 _507 = make_int4(16, 16, 16, 16);
+                _506.x = (_468.x+_507.x);
+                _506.y = (_468.y+_507.y);
+                _506.z = (_468.z+_507.z);
+                _506.w = (_468.w+_507.w);
+              _492.x = (bool(_493.x)?_468.x:_506.x);
+              _492.y = (bool(_493.y)?_468.y:_506.y);
+              _492.z = (bool(_493.z)?_468.z:_506.z);
+              _492.w = (bool(_493.w)?_468.w:_506.w);
+              int4 _508 = make_int4(9, 9, 9, 9);
+              _467.x = (_492.x*_508.x);
+              _467.y = (_492.y*_508.y);
+              _467.z = (_492.z*_508.z);
+              _467.w = (_492.w*_508.w);
+            _465.x = (_466.x+_467.x);
+            _465.y = (_466.y+_467.y);
+            _465.z = (_466.z+_467.z);
+            _465.w = (_466.w+_467.w);
+          int4 _509 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+          _464.x = (_465.x+_509.x);
+          _464.y = (_465.y+_509.y);
+          _464.z = (_465.z+_509.z);
+          _464.w = (_465.w+_509.w);
+        int4 _510;
+          int4 _511 = make_int4(((((int)threadIdx.x) + 224))+(1*0), ((((int)threadIdx.x) + 224))+(1*1), ((((int)threadIdx.x) + 224))+(1*2), ((((int)threadIdx.x) + 224))+(1*3));
+          int4 _512 = make_int4(3, 3, 3, 3);
+          _510.x = (_511.x%_512.x);
+          _510.y = (_511.y%_512.y);
+          _510.z = (_511.z%_512.z);
+          _510.w = (_511.w%_512.w);
+        int4 _513;
+        ushort4 _514;
+          ushort4 _515;
+            ushort4 _516;
+              int4 _517 = make_int4(3, 3, 3, 3);
+              int4 _518 = make_int4(0, 0, 0, 0);
+              _516.x = (_517.x&gt;=_518.x);
+              _516.y = (_517.y&gt;=_518.y);
+              _516.z = (_517.z&gt;=_518.z);
+              _516.w = (_517.w&gt;=_518.w);
+            ushort4 _519;
+              int4 _520 = make_int4(0, 0, 0, 0);
+              _519.x = (_510.x&gt;=_520.x);
+              _519.y = (_510.y&gt;=_520.y);
+              _519.z = (_510.z&gt;=_520.z);
+              _519.w = (_510.w&gt;=_520.w);
+            _515.x = (_516.x&amp;&amp;_519.x);
+            _515.y = (_516.y&amp;&amp;_519.y);
+            _515.z = (_516.z&amp;&amp;_519.z);
+            _515.w = (_516.w&amp;&amp;_519.w);
+          ushort4 _521;
+            ushort4 _522;
+              int4 _523 = make_int4(3, 3, 3, 3);
+              int4 _524 = make_int4(0, 0, 0, 0);
+              _522.x = (_523.x&lt;_524.x);
+              _522.y = (_523.y&lt;_524.y);
+              _522.z = (_523.z&lt;_524.z);
+              _522.w = (_523.w&lt;_524.w);
+            ushort4 _525;
+              int4 _526 = make_int4(0, 0, 0, 0);
+              _525.x = (_510.x&lt;=_526.x);
+              _525.y = (_510.y&lt;=_526.y);
+              _525.z = (_510.z&lt;=_526.z);
+              _525.w = (_510.w&lt;=_526.w);
+            _521.x = (_522.x&amp;&amp;_525.x);
+            _521.y = (_522.y&amp;&amp;_525.y);
+            _521.z = (_522.z&amp;&amp;_525.z);
+            _521.w = (_522.w&amp;&amp;_525.w);
+          _514.x = (_515.x||_521.x);
+          _514.y = (_515.y||_521.y);
+          _514.z = (_515.z||_521.z);
+          _514.w = (_515.w||_521.w);
+        int4 _527;
+          int4 _528 = make_int4(3, 3, 3, 3);
+          _527.x = (_510.x+_528.x);
+          _527.y = (_510.y+_528.y);
+          _527.z = (_510.z+_528.z);
+          _527.w = (_510.w+_528.w);
+        _513.x = (bool(_514.x)?_510.x:_527.x);
+        _513.y = (bool(_514.y)?_510.y:_527.y);
+        _513.z = (bool(_514.z)?_510.z:_527.z);
+        _513.w = (bool(_514.w)?_510.w:_527.w);
+        _463.x = (_464.x+_513.x);
+        _463.y = (_464.y+_513.y);
+        _463.z = (_464.z+_513.z);
+        _463.w = (_464.w+_513.w);
+      *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 896)) = make_float4(kernel[_463.x],kernel[_463.y],kernel[_463.z],kernel[_463.w]);
+      int4 _529;
+        int4 _530;
+          int4 _531;
+            int4 _532 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1024) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1024) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1024) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1024) / 48) * 4608)) + (rc_outer_outer [...]
+            int4 _533;
+              int4 _534;
+                int4 _535;
+                  int4 _536 = make_int4((((((int)threadIdx.x) * 4) + 1024))+(1*0), (((((int)threadIdx.x) * 4) + 1024))+(1*1), (((((int)threadIdx.x) * 4) + 1024))+(1*2), (((((int)threadIdx.x) * 4) + 1024))+(1*3));
+                  int4 _537 = make_int4(3, 3, 3, 3);
+                  _535.x = (_536.x%_537.x);
+                  _535.y = (_536.y%_537.y);
+                  _535.z = (_536.z%_537.z);
+                  _535.w = (_536.w%_537.w);
+                int4 _538;
+                  int4 _539 = make_int4((((((int)threadIdx.x) * 4) + 1024))+(1*0), (((((int)threadIdx.x) * 4) + 1024))+(1*1), (((((int)threadIdx.x) * 4) + 1024))+(1*2), (((((int)threadIdx.x) * 4) + 1024))+(1*3));
+                  int4 _540 = make_int4(3, 3, 3, 3);
+                  _538.x = (_539.x/_540.x);
+                  _538.y = (_539.y/_540.y);
+                  _538.z = (_539.z/_540.z);
+                  _538.w = (_539.w/_540.w);
+                int4 _541;
+                ushort4 _542;
+                  ushort4 _543;
+                    ushort4 _544;
+                      int4 _545 = make_int4(3, 3, 3, 3);
+                      int4 _546 = make_int4(0, 0, 0, 0);
+                      _544.x = (_545.x&gt;=_546.x);
+                      _544.y = (_545.y&gt;=_546.y);
+                      _544.z = (_545.z&gt;=_546.z);
+                      _544.w = (_545.w&gt;=_546.w);
+                    ushort4 _547;
+                      int4 _548 = make_int4(0, 0, 0, 0);
+                      _547.x = (_535.x&gt;=_548.x);
+                      _547.y = (_535.y&gt;=_548.y);
+                      _547.z = (_535.z&gt;=_548.z);
+                      _547.w = (_535.w&gt;=_548.w);
+                    _543.x = (_544.x&amp;&amp;_547.x);
+                    _543.y = (_544.y&amp;&amp;_547.y);
+                    _543.z = (_544.z&amp;&amp;_547.z);
+                    _543.w = (_544.w&amp;&amp;_547.w);
+                  ushort4 _549;
+                    ushort4 _550;
+                      int4 _551 = make_int4(3, 3, 3, 3);
+                      int4 _552 = make_int4(0, 0, 0, 0);
+                      _550.x = (_551.x&lt;_552.x);
+                      _550.y = (_551.y&lt;_552.y);
+                      _550.z = (_551.z&lt;_552.z);
+                      _550.w = (_551.w&lt;_552.w);
+                    ushort4 _553;
+                      int4 _554 = make_int4(0, 0, 0, 0);
+                      _553.x = (_535.x&lt;=_554.x);
+                      _553.y = (_535.y&lt;=_554.y);
+                      _553.z = (_535.z&lt;=_554.z);
+                      _553.w = (_535.w&lt;=_554.w);
+                    _549.x = (_550.x&amp;&amp;_553.x);
+                    _549.y = (_550.y&amp;&amp;_553.y);
+                    _549.z = (_550.z&amp;&amp;_553.z);
+                    _549.w = (_550.w&amp;&amp;_553.w);
+                  _542.x = (_543.x||_549.x);
+                  _542.y = (_543.y||_549.y);
+                  _542.z = (_543.z||_549.z);
+                  _542.w = (_543.w||_549.w);
+                int4 _555;
+                  int4 _556 = make_int4(1, 1, 1, 1);
+                  _555.x = (_538.x-_556.x);
+                  _555.y = (_538.y-_556.y);
+                  _555.z = (_538.z-_556.z);
+                  _555.w = (_538.w-_556.w);
+                _541.x = (bool(_542.x)?_538.x:_555.x);
+                _541.y = (bool(_542.y)?_538.y:_555.y);
+                _541.z = (bool(_542.z)?_538.z:_555.z);
+                _541.w = (bool(_542.w)?_538.w:_555.w);
+                int4 _557 = make_int4(16, 16, 16, 16);
+                _534.x = (_541.x%_557.x);
+                _534.y = (_541.y%_557.y);
+                _534.z = (_541.z%_557.z);
+                _534.w = (_541.w%_557.w);
+              int4 _558;
+              ushort4 _559;
+                ushort4 _560;
+                  ushort4 _561;
+                    int4 _562 = make_int4(16, 16, 16, 16);
+                    int4 _563 = make_int4(0, 0, 0, 0);
+                    _561.x = (_562.x&gt;=_563.x);
+                    _561.y = (_562.y&gt;=_563.y);
+                    _561.z = (_562.z&gt;=_563.z);
+                    _561.w = (_562.w&gt;=_563.w);
+                  ushort4 _564;
+                    int4 _565 = make_int4(0, 0, 0, 0);
+                    _564.x = (_534.x&gt;=_565.x);
+                    _564.y = (_534.y&gt;=_565.y);
+                    _564.z = (_534.z&gt;=_565.z);
+                    _564.w = (_534.w&gt;=_565.w);
+                  _560.x = (_561.x&amp;&amp;_564.x);
+                  _560.y = (_561.y&amp;&amp;_564.y);
+                  _560.z = (_561.z&amp;&amp;_564.z);
+                  _560.w = (_561.w&amp;&amp;_564.w);
+                ushort4 _566;
+                  ushort4 _567;
+                    int4 _568 = make_int4(16, 16, 16, 16);
+                    int4 _569 = make_int4(0, 0, 0, 0);
+                    _567.x = (_568.x&lt;_569.x);
+                    _567.y = (_568.y&lt;_569.y);
+                    _567.z = (_568.z&lt;_569.z);
+                    _567.w = (_568.w&lt;_569.w);
+                  ushort4 _570;
+                    int4 _571 = make_int4(0, 0, 0, 0);
+                    _570.x = (_534.x&lt;=_571.x);
+                    _570.y = (_534.y&lt;=_571.y);
+                    _570.z = (_534.z&lt;=_571.z);
+                    _570.w = (_534.w&lt;=_571.w);
+                  _566.x = (_567.x&amp;&amp;_570.x);
+                  _566.y = (_567.y&amp;&amp;_570.y);
+                  _566.z = (_567.z&amp;&amp;_570.z);
+                  _566.w = (_567.w&amp;&amp;_570.w);
+                _559.x = (_560.x||_566.x);
+                _559.y = (_560.y||_566.y);
+                _559.z = (_560.z||_566.z);
+                _559.w = (_560.w||_566.w);
+              int4 _572;
+                int4 _573 = make_int4(16, 16, 16, 16);
+                _572.x = (_534.x+_573.x);
+                _572.y = (_534.y+_573.y);
+                _572.z = (_534.z+_573.z);
+                _572.w = (_534.w+_573.w);
+              _558.x = (bool(_559.x)?_534.x:_572.x);
+              _558.y = (bool(_559.y)?_534.y:_572.y);
+              _558.z = (bool(_559.z)?_534.z:_572.z);
+              _558.w = (bool(_559.w)?_534.w:_572.w);
+              int4 _574 = make_int4(9, 9, 9, 9);
+              _533.x = (_558.x*_574.x);
+              _533.y = (_558.y*_574.y);
+              _533.z = (_558.z*_574.z);
+              _533.w = (_558.w*_574.w);
+            _531.x = (_532.x+_533.x);
+            _531.y = (_532.y+_533.y);
+            _531.z = (_532.z+_533.z);
+            _531.w = (_532.w+_533.w);
+          int4 _575 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+          _530.x = (_531.x+_575.x);
+          _530.y = (_531.y+_575.y);
+          _530.z = (_531.z+_575.z);
+          _530.w = (_531.w+_575.w);
+        int4 _576;
+          int4 _577 = make_int4(((((int)threadIdx.x) + 256))+(1*0), ((((int)threadIdx.x) + 256))+(1*1), ((((int)threadIdx.x) + 256))+(1*2), ((((int)threadIdx.x) + 256))+(1*3));
+          int4 _578 = make_int4(3, 3, 3, 3);
+          _576.x = (_577.x%_578.x);
+          _576.y = (_577.y%_578.y);
+          _576.z = (_577.z%_578.z);
+          _576.w = (_577.w%_578.w);
+        int4 _579;
+        ushort4 _580;
+          ushort4 _581;
+            ushort4 _582;
+              int4 _583 = make_int4(3, 3, 3, 3);
+              int4 _584 = make_int4(0, 0, 0, 0);
+              _582.x = (_583.x&gt;=_584.x);
+              _582.y = (_583.y&gt;=_584.y);
+              _582.z = (_583.z&gt;=_584.z);
+              _582.w = (_583.w&gt;=_584.w);
+            ushort4 _585;
+              int4 _586 = make_int4(0, 0, 0, 0);
+              _585.x = (_576.x&gt;=_586.x);
+              _585.y = (_576.y&gt;=_586.y);
+              _585.z = (_576.z&gt;=_586.z);
+              _585.w = (_576.w&gt;=_586.w);
+            _581.x = (_582.x&amp;&amp;_585.x);
+            _581.y = (_582.y&amp;&amp;_585.y);
+            _581.z = (_582.z&amp;&amp;_585.z);
+            _581.w = (_582.w&amp;&amp;_585.w);
+          ushort4 _587;
+            ushort4 _588;
+              int4 _589 = make_int4(3, 3, 3, 3);
+              int4 _590 = make_int4(0, 0, 0, 0);
+              _588.x = (_589.x&lt;_590.x);
+              _588.y = (_589.y&lt;_590.y);
+              _588.z = (_589.z&lt;_590.z);
+              _588.w = (_589.w&lt;_590.w);
+            ushort4 _591;
+              int4 _592 = make_int4(0, 0, 0, 0);
+              _591.x = (_576.x&lt;=_592.x);
+              _591.y = (_576.y&lt;=_592.y);
+              _591.z = (_576.z&lt;=_592.z);
+              _591.w = (_576.w&lt;=_592.w);
+            _587.x = (_588.x&amp;&amp;_591.x);
+            _587.y = (_588.y&amp;&amp;_591.y);
+            _587.z = (_588.z&amp;&amp;_591.z);
+            _587.w = (_588.w&amp;&amp;_591.w);
+          _580.x = (_581.x||_587.x);
+          _580.y = (_581.y||_587.y);
+          _580.z = (_581.z||_587.z);
+          _580.w = (_581.w||_587.w);
+        int4 _593;
+          int4 _594 = make_int4(3, 3, 3, 3);
+          _593.x = (_576.x+_594.x);
+          _593.y = (_576.y+_594.y);
+          _593.z = (_576.z+_594.z);
+          _593.w = (_576.w+_594.w);
+        _579.x = (bool(_580.x)?_576.x:_593.x);
+        _579.y = (bool(_580.y)?_576.y:_593.y);
+        _579.z = (bool(_580.z)?_576.z:_593.z);
+        _579.w = (bool(_580.w)?_576.w:_593.w);
+        _529.x = (_530.x+_579.x);
+        _529.y = (_530.y+_579.y);
+        _529.z = (_530.z+_579.z);
+        _529.w = (_530.w+_579.w);
+      *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1024)) = make_float4(kernel[_529.x],kernel[_529.y],kernel[_529.z],kernel[_529.w]);
+      int4 _595;
+        int4 _596;
+          int4 _597;
+            int4 _598 = make_int4((((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 110592), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 110592), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 110592), (((((((int)blockIdx.x) / 7) * 147456) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 144)) + 110592));
+            int4 _599;
+              int4 _600;
+                int4 _601;
+                  int4 _602 = make_int4((((((int)threadIdx.x) * 4) + 1152))+(1*0), (((((int)threadIdx.x) * 4) + 1152))+(1*1), (((((int)threadIdx.x) * 4) + 1152))+(1*2), (((((int)threadIdx.x) * 4) + 1152))+(1*3));
+                  int4 _603 = make_int4(3, 3, 3, 3);
+                  _601.x = (_602.x%_603.x);
+                  _601.y = (_602.y%_603.y);
+                  _601.z = (_602.z%_603.z);
+                  _601.w = (_602.w%_603.w);
+                int4 _604;
+                  int4 _605 = make_int4((((((int)threadIdx.x) * 4) + 1152))+(1*0), (((((int)threadIdx.x) * 4) + 1152))+(1*1), (((((int)threadIdx.x) * 4) + 1152))+(1*2), (((((int)threadIdx.x) * 4) + 1152))+(1*3));
+                  int4 _606 = make_int4(3, 3, 3, 3);
+                  _604.x = (_605.x/_606.x);
+                  _604.y = (_605.y/_606.y);
+                  _604.z = (_605.z/_606.z);
+                  _604.w = (_605.w/_606.w);
+                int4 _607;
+                ushort4 _608;
+                  ushort4 _609;
+                    ushort4 _610;
+                      int4 _611 = make_int4(3, 3, 3, 3);
+                      int4 _612 = make_int4(0, 0, 0, 0);
+                      _610.x = (_611.x&gt;=_612.x);
+                      _610.y = (_611.y&gt;=_612.y);
+                      _610.z = (_611.z&gt;=_612.z);
+                      _610.w = (_611.w&gt;=_612.w);
+                    ushort4 _613;
+                      int4 _614 = make_int4(0, 0, 0, 0);
+                      _613.x = (_601.x&gt;=_614.x);
+                      _613.y = (_601.y&gt;=_614.y);
+                      _613.z = (_601.z&gt;=_614.z);
+                      _613.w = (_601.w&gt;=_614.w);
+                    _609.x = (_610.x&amp;&amp;_613.x);
+                    _609.y = (_610.y&amp;&amp;_613.y);
+                    _609.z = (_610.z&amp;&amp;_613.z);
+                    _609.w = (_610.w&amp;&amp;_613.w);
+                  ushort4 _615;
+                    ushort4 _616;
+                      int4 _617 = make_int4(3, 3, 3, 3);
+                      int4 _618 = make_int4(0, 0, 0, 0);
+                      _616.x = (_617.x&lt;_618.x);
+                      _616.y = (_617.y&lt;_618.y);
+                      _616.z = (_617.z&lt;_618.z);
+                      _616.w = (_617.w&lt;_618.w);
+                    ushort4 _619;
+                      int4 _620 = make_int4(0, 0, 0, 0);
+                      _619.x = (_601.x&lt;=_620.x);
+                      _619.y = (_601.y&lt;=_620.y);
+                      _619.z = (_601.z&lt;=_620.z);
+                      _619.w = (_601.w&lt;=_620.w);
+                    _615.x = (_616.x&amp;&amp;_619.x);
+                    _615.y = (_616.y&amp;&amp;_619.y);
+                    _615.z = (_616.z&amp;&amp;_619.z);
+                    _615.w = (_616.w&amp;&amp;_619.w);
+                  _608.x = (_609.x||_615.x);
+                  _608.y = (_609.y||_615.y);
+                  _608.z = (_609.z||_615.z);
+                  _608.w = (_609.w||_615.w);
+                int4 _621;
+                  int4 _622 = make_int4(1, 1, 1, 1);
+                  _621.x = (_604.x-_622.x);
+                  _621.y = (_604.y-_622.y);
+                  _621.z = (_604.z-_622.z);
+                  _621.w = (_604.w-_622.w);
+                _607.x = (bool(_608.x)?_604.x:_621.x);
+                _607.y = (bool(_608.y)?_604.y:_621.y);
+                _607.z = (bool(_608.z)?_604.z:_621.z);
+                _607.w = (bool(_608.w)?_604.w:_621.w);
+                int4 _623 = make_int4(16, 16, 16, 16);
+                _600.x = (_607.x%_623.x);
+                _600.y = (_607.y%_623.y);
+                _600.z = (_607.z%_623.z);
+                _600.w = (_607.w%_623.w);
+              int4 _624;
+              ushort4 _625;
+                ushort4 _626;
+                  ushort4 _627;
+                    int4 _628 = make_int4(16, 16, 16, 16);
+                    int4 _629 = make_int4(0, 0, 0, 0);
+                    _627.x = (_628.x&gt;=_629.x);
+                    _627.y = (_628.y&gt;=_629.y);
+                    _627.z = (_628.z&gt;=_629.z);
+                    _627.w = (_628.w&gt;=_629.w);
+                  ushort4 _630;
+                    int4 _631 = make_int4(0, 0, 0, 0);
+                    _630.x = (_600.x&gt;=_631.x);
+                    _630.y = (_600.y&gt;=_631.y);
+                    _630.z = (_600.z&gt;=_631.z);
+                    _630.w = (_600.w&gt;=_631.w);
+                  _626.x = (_627.x&amp;&amp;_630.x);
+                  _626.y = (_627.y&amp;&amp;_630.y);
+                  _626.z = (_627.z&amp;&amp;_630.z);
+                  _626.w = (_627.w&amp;&amp;_630.w);
+                ushort4 _632;
+                  ushort4 _633;
+                    int4 _634 = make_int4(16, 16, 16, 16);
+                    int4 _635 = make_int4(0, 0, 0, 0);
+                    _633.x = (_634.x&lt;_635.x);
+                    _633.y = (_634.y&lt;_635.y);
+                    _633.z = (_634.z&lt;_635.z);
+                    _633.w = (_634.w&lt;_635.w);
+                  ushort4 _636;
+                    int4 _637 = make_int4(0, 0, 0, 0);
+                    _636.x = (_600.x&lt;=_637.x);
+                    _636.y = (_600.y&lt;=_637.y);
+                    _636.z = (_600.z&lt;=_637.z);
+                    _636.w = (_600.w&lt;=_637.w);
+                  _632.x = (_633.x&amp;&amp;_636.x);
+                  _632.y = (_633.y&amp;&amp;_636.y);
+                  _632.z = (_633.z&amp;&amp;_636.z);
+                  _632.w = (_633.w&amp;&amp;_636.w);
+                _625.x = (_626.x||_632.x);
+                _625.y = (_626.y||_632.y);
+                _625.z = (_626.z||_632.z);
+                _625.w = (_626.w||_632.w);
+              int4 _638;
+                int4 _639 = make_int4(16, 16, 16, 16);
+                _638.x = (_600.x+_639.x);
+                _638.y = (_600.y+_639.y);
+                _638.z = (_600.z+_639.z);
+                _638.w = (_600.w+_639.w);
+              _624.x = (bool(_625.x)?_600.x:_638.x);
+              _624.y = (bool(_625.y)?_600.y:_638.y);
+              _624.z = (bool(_625.z)?_600.z:_638.z);
+              _624.w = (bool(_625.w)?_600.w:_638.w);
+              int4 _640 = make_int4(9, 9, 9, 9);
+              _599.x = (_624.x*_640.x);
+              _599.y = (_624.y*_640.y);
+              _599.z = (_624.z*_640.z);
+              _599.w = (_624.w*_640.w);
+            _597.x = (_598.x+_599.x);
+            _597.y = (_598.y+_599.y);
+            _597.z = (_598.z+_599.z);
+            _597.w = (_598.w+_599.w);
+          int4 _641 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+          _596.x = (_597.x+_641.x);
+          _596.y = (_597.y+_641.y);
+          _596.z = (_597.z+_641.z);
+          _596.w = (_597.w+_641.w);
+        int4 _642;
+          int4 _643 = make_int4(((((int)threadIdx.x) + 288))+(1*0), ((((int)threadIdx.x) + 288))+(1*1), ((((int)threadIdx.x) + 288))+(1*2), ((((int)threadIdx.x) + 288))+(1*3));
+          int4 _644 = make_int4(3, 3, 3, 3);
+          _642.x = (_643.x%_644.x);
+          _642.y = (_643.y%_644.y);
+          _642.z = (_643.z%_644.z);
+          _642.w = (_643.w%_644.w);
+        int4 _645;
+        ushort4 _646;
+          ushort4 _647;
+            ushort4 _648;
+              int4 _649 = make_int4(3, 3, 3, 3);
+              int4 _650 = make_int4(0, 0, 0, 0);
+              _648.x = (_649.x&gt;=_650.x);
+              _648.y = (_649.y&gt;=_650.y);
+              _648.z = (_649.z&gt;=_650.z);
+              _648.w = (_649.w&gt;=_650.w);
+            ushort4 _651;
+              int4 _652 = make_int4(0, 0, 0, 0);
+              _651.x = (_642.x&gt;=_652.x);
+              _651.y = (_642.y&gt;=_652.y);
+              _651.z = (_642.z&gt;=_652.z);
+              _651.w = (_642.w&gt;=_652.w);
+            _647.x = (_648.x&amp;&amp;_651.x);
+            _647.y = (_648.y&amp;&amp;_651.y);
+            _647.z = (_648.z&amp;&amp;_651.z);
+            _647.w = (_648.w&amp;&amp;_651.w);
+          ushort4 _653;
+            ushort4 _654;
+              int4 _655 = make_int4(3, 3, 3, 3);
+              int4 _656 = make_int4(0, 0, 0, 0);
+              _654.x = (_655.x&lt;_656.x);
+              _654.y = (_655.y&lt;_656.y);
+              _654.z = (_655.z&lt;_656.z);
+              _654.w = (_655.w&lt;_656.w);
+            ushort4 _657;
+              int4 _658 = make_int4(0, 0, 0, 0);
+              _657.x = (_642.x&lt;=_658.x);
+              _657.y = (_642.y&lt;=_658.y);
+              _657.z = (_642.z&lt;=_658.z);
+              _657.w = (_642.w&lt;=_658.w);
+            _653.x = (_654.x&amp;&amp;_657.x);
+            _653.y = (_654.y&amp;&amp;_657.y);
+            _653.z = (_654.z&amp;&amp;_657.z);
+            _653.w = (_654.w&amp;&amp;_657.w);
+          _646.x = (_647.x||_653.x);
+          _646.y = (_647.y||_653.y);
+          _646.z = (_647.z||_653.z);
+          _646.w = (_647.w||_653.w);
+        int4 _659;
+          int4 _660 = make_int4(3, 3, 3, 3);
+          _659.x = (_642.x+_660.x);
+          _659.y = (_642.y+_660.y);
+          _659.z = (_642.z+_660.z);
+          _659.w = (_642.w+_660.w);
+        _645.x = (bool(_646.x)?_642.x:_659.x);
+        _645.y = (bool(_646.y)?_642.y:_659.y);
+        _645.z = (bool(_646.z)?_642.z:_659.z);
+        _645.w = (bool(_646.w)?_642.w:_659.w);
+        _595.x = (_596.x+_645.x);
+        _595.y = (_596.y+_645.y);
+        _595.z = (_596.z+_645.z);
+        _595.w = (_596.w+_645.w);
+      *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1152)) = make_float4(kernel[_595.x],kernel[_595.y],kernel[_595.z],kernel[_595.w]);
+      int4 _661;
+        int4 _662;
+          int4 _663;
+            int4 _664 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1280) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1280) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1280) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1280) / 48) * 4608)) + (rc_outer_outer [...]
+            int4 _665;
+              int4 _666;
+                int4 _667;
+                  int4 _668 = make_int4((((((int)threadIdx.x) * 4) + 1280))+(1*0), (((((int)threadIdx.x) * 4) + 1280))+(1*1), (((((int)threadIdx.x) * 4) + 1280))+(1*2), (((((int)threadIdx.x) * 4) + 1280))+(1*3));
+                  int4 _669 = make_int4(3, 3, 3, 3);
+                  _667.x = (_668.x%_669.x);
+                  _667.y = (_668.y%_669.y);
+                  _667.z = (_668.z%_669.z);
+                  _667.w = (_668.w%_669.w);
+                int4 _670;
+                  int4 _671 = make_int4((((((int)threadIdx.x) * 4) + 1280))+(1*0), (((((int)threadIdx.x) * 4) + 1280))+(1*1), (((((int)threadIdx.x) * 4) + 1280))+(1*2), (((((int)threadIdx.x) * 4) + 1280))+(1*3));
+                  int4 _672 = make_int4(3, 3, 3, 3);
+                  _670.x = (_671.x/_672.x);
+                  _670.y = (_671.y/_672.y);
+                  _670.z = (_671.z/_672.z);
+                  _670.w = (_671.w/_672.w);
+                int4 _673;
+                ushort4 _674;
+                  ushort4 _675;
+                    ushort4 _676;
+                      int4 _677 = make_int4(3, 3, 3, 3);
+                      int4 _678 = make_int4(0, 0, 0, 0);
+                      _676.x = (_677.x&gt;=_678.x);
+                      _676.y = (_677.y&gt;=_678.y);
+                      _676.z = (_677.z&gt;=_678.z);
+                      _676.w = (_677.w&gt;=_678.w);
+                    ushort4 _679;
+                      int4 _680 = make_int4(0, 0, 0, 0);
+                      _679.x = (_667.x&gt;=_680.x);
+                      _679.y = (_667.y&gt;=_680.y);
+                      _679.z = (_667.z&gt;=_680.z);
+                      _679.w = (_667.w&gt;=_680.w);
+                    _675.x = (_676.x&amp;&amp;_679.x);
+                    _675.y = (_676.y&amp;&amp;_679.y);
+                    _675.z = (_676.z&amp;&amp;_679.z);
+                    _675.w = (_676.w&amp;&amp;_679.w);
+                  ushort4 _681;
+                    ushort4 _682;
+                      int4 _683 = make_int4(3, 3, 3, 3);
+                      int4 _684 = make_int4(0, 0, 0, 0);
+                      _682.x = (_683.x&lt;_684.x);
+                      _682.y = (_683.y&lt;_684.y);
+                      _682.z = (_683.z&lt;_684.z);
+                      _682.w = (_683.w&lt;_684.w);
+                    ushort4 _685;
+                      int4 _686 = make_int4(0, 0, 0, 0);
+                      _685.x = (_667.x&lt;=_686.x);
+                      _685.y = (_667.y&lt;=_686.y);
+                      _685.z = (_667.z&lt;=_686.z);
+                      _685.w = (_667.w&lt;=_686.w);
+                    _681.x = (_682.x&amp;&amp;_685.x);
+                    _681.y = (_682.y&amp;&amp;_685.y);
+                    _681.z = (_682.z&amp;&amp;_685.z);
+                    _681.w = (_682.w&amp;&amp;_685.w);
+                  _674.x = (_675.x||_681.x);
+                  _674.y = (_675.y||_681.y);
+                  _674.z = (_675.z||_681.z);
+                  _674.w = (_675.w||_681.w);
+                int4 _687;
+                  int4 _688 = make_int4(1, 1, 1, 1);
+                  _687.x = (_670.x-_688.x);
+                  _687.y = (_670.y-_688.y);
+                  _687.z = (_670.z-_688.z);
+                  _687.w = (_670.w-_688.w);
+                _673.x = (bool(_674.x)?_670.x:_687.x);
+                _673.y = (bool(_674.y)?_670.y:_687.y);
+                _673.z = (bool(_674.z)?_670.z:_687.z);
+                _673.w = (bool(_674.w)?_670.w:_687.w);
+                int4 _689 = make_int4(16, 16, 16, 16);
+                _666.x = (_673.x%_689.x);
+                _666.y = (_673.y%_689.y);
+                _666.z = (_673.z%_689.z);
+                _666.w = (_673.w%_689.w);
+              int4 _690;
+              ushort4 _691;
+                ushort4 _692;
+                  ushort4 _693;
+                    int4 _694 = make_int4(16, 16, 16, 16);
+                    int4 _695 = make_int4(0, 0, 0, 0);
+                    _693.x = (_694.x&gt;=_695.x);
+                    _693.y = (_694.y&gt;=_695.y);
+                    _693.z = (_694.z&gt;=_695.z);
+                    _693.w = (_694.w&gt;=_695.w);
+                  ushort4 _696;
+                    int4 _697 = make_int4(0, 0, 0, 0);
+                    _696.x = (_666.x&gt;=_697.x);
+                    _696.y = (_666.y&gt;=_697.y);
+                    _696.z = (_666.z&gt;=_697.z);
+                    _696.w = (_666.w&gt;=_697.w);
+                  _692.x = (_693.x&amp;&amp;_696.x);
+                  _692.y = (_693.y&amp;&amp;_696.y);
+                  _692.z = (_693.z&amp;&amp;_696.z);
+                  _692.w = (_693.w&amp;&amp;_696.w);
+                ushort4 _698;
+                  ushort4 _699;
+                    int4 _700 = make_int4(16, 16, 16, 16);
+                    int4 _701 = make_int4(0, 0, 0, 0);
+                    _699.x = (_700.x&lt;_701.x);
+                    _699.y = (_700.y&lt;_701.y);
+                    _699.z = (_700.z&lt;_701.z);
+                    _699.w = (_700.w&lt;_701.w);
+                  ushort4 _702;
+                    int4 _703 = make_int4(0, 0, 0, 0);
+                    _702.x = (_666.x&lt;=_703.x);
+                    _702.y = (_666.y&lt;=_703.y);
+                    _702.z = (_666.z&lt;=_703.z);
+                    _702.w = (_666.w&lt;=_703.w);
+                  _698.x = (_699.x&amp;&amp;_702.x);
+                  _698.y = (_699.y&amp;&amp;_702.y);
+                  _698.z = (_699.z&amp;&amp;_702.z);
+                  _698.w = (_699.w&amp;&amp;_702.w);
+                _691.x = (_692.x||_698.x);
+                _691.y = (_692.y||_698.y);
+                _691.z = (_692.z||_698.z);
+                _691.w = (_692.w||_698.w);
+              int4 _704;
+                int4 _705 = make_int4(16, 16, 16, 16);
+                _704.x = (_666.x+_705.x);
+                _704.y = (_666.y+_705.y);
+                _704.z = (_666.z+_705.z);
+                _704.w = (_666.w+_705.w);
+              _690.x = (bool(_691.x)?_666.x:_704.x);
+              _690.y = (bool(_691.y)?_666.y:_704.y);
+              _690.z = (bool(_691.z)?_666.z:_704.z);
+              _690.w = (bool(_691.w)?_666.w:_704.w);
+              int4 _706 = make_int4(9, 9, 9, 9);
+              _665.x = (_690.x*_706.x);
+              _665.y = (_690.y*_706.y);
+              _665.z = (_690.z*_706.z);
+              _665.w = (_690.w*_706.w);
+            _663.x = (_664.x+_665.x);
+            _663.y = (_664.y+_665.y);
+            _663.z = (_664.z+_665.z);
+            _663.w = (_664.w+_665.w);
+          int4 _707 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+          _662.x = (_663.x+_707.x);
+          _662.y = (_663.y+_707.y);
+          _662.z = (_663.z+_707.z);
+          _662.w = (_663.w+_707.w);
+        int4 _708;
+          int4 _709 = make_int4(((((int)threadIdx.x) + 320))+(1*0), ((((int)threadIdx.x) + 320))+(1*1), ((((int)threadIdx.x) + 320))+(1*2), ((((int)threadIdx.x) + 320))+(1*3));
+          int4 _710 = make_int4(3, 3, 3, 3);
+          _708.x = (_709.x%_710.x);
+          _708.y = (_709.y%_710.y);
+          _708.z = (_709.z%_710.z);
+          _708.w = (_709.w%_710.w);
+        int4 _711;
+        ushort4 _712;
+          ushort4 _713;
+            ushort4 _714;
+              int4 _715 = make_int4(3, 3, 3, 3);
+              int4 _716 = make_int4(0, 0, 0, 0);
+              _714.x = (_715.x&gt;=_716.x);
+              _714.y = (_715.y&gt;=_716.y);
+              _714.z = (_715.z&gt;=_716.z);
+              _714.w = (_715.w&gt;=_716.w);
+            ushort4 _717;
+              int4 _718 = make_int4(0, 0, 0, 0);
+              _717.x = (_708.x&gt;=_718.x);
+              _717.y = (_708.y&gt;=_718.y);
+              _717.z = (_708.z&gt;=_718.z);
+              _717.w = (_708.w&gt;=_718.w);
+            _713.x = (_714.x&amp;&amp;_717.x);
+            _713.y = (_714.y&amp;&amp;_717.y);
+            _713.z = (_714.z&amp;&amp;_717.z);
+            _713.w = (_714.w&amp;&amp;_717.w);
+          ushort4 _719;
+            ushort4 _720;
+              int4 _721 = make_int4(3, 3, 3, 3);
+              int4 _722 = make_int4(0, 0, 0, 0);
+              _720.x = (_721.x&lt;_722.x);
+              _720.y = (_721.y&lt;_722.y);
+              _720.z = (_721.z&lt;_722.z);
+              _720.w = (_721.w&lt;_722.w);
+            ushort4 _723;
+              int4 _724 = make_int4(0, 0, 0, 0);
+              _723.x = (_708.x&lt;=_724.x);
+              _723.y = (_708.y&lt;=_724.y);
+              _723.z = (_708.z&lt;=_724.z);
+              _723.w = (_708.w&lt;=_724.w);
+            _719.x = (_720.x&amp;&amp;_723.x);
+            _719.y = (_720.y&amp;&amp;_723.y);
+            _719.z = (_720.z&amp;&amp;_723.z);
+            _719.w = (_720.w&amp;&amp;_723.w);
+          _712.x = (_713.x||_719.x);
+          _712.y = (_713.y||_719.y);
+          _712.z = (_713.z||_719.z);
+          _712.w = (_713.w||_719.w);
+        int4 _725;
+          int4 _726 = make_int4(3, 3, 3, 3);
+          _725.x = (_708.x+_726.x);
+          _725.y = (_708.y+_726.y);
+          _725.z = (_708.z+_726.z);
+          _725.w = (_708.w+_726.w);
+        _711.x = (bool(_712.x)?_708.x:_725.x);
+        _711.y = (bool(_712.y)?_708.y:_725.y);
+        _711.z = (bool(_712.z)?_708.z:_725.z);
+        _711.w = (bool(_712.w)?_708.w:_725.w);
+        _661.x = (_662.x+_711.x);
+        _661.y = (_662.y+_711.y);
+        _661.z = (_662.z+_711.z);
+        _661.w = (_662.w+_711.w);
+      *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1280)) = make_float4(kernel[_661.x],kernel[_661.y],kernel[_661.z],kernel[_661.w]);
+      int4 _727;
+        int4 _728;
+          int4 _729;
+            int4 _730 = make_int4(((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1408) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1408) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1408) / 48) * 4608)) + (rc_outer_outer * 144)), ((((((int)blockIdx.x) / 7) * 147456) + ((((((int)threadIdx.x) * 4) + 1408) / 48) * 4608)) + (rc_outer_outer [...]
+            int4 _731;
+              int4 _732;
+                int4 _733;
+                  int4 _734 = make_int4((((((int)threadIdx.x) * 4) + 1408))+(1*0), (((((int)threadIdx.x) * 4) + 1408))+(1*1), (((((int)threadIdx.x) * 4) + 1408))+(1*2), (((((int)threadIdx.x) * 4) + 1408))+(1*3));
+                  int4 _735 = make_int4(3, 3, 3, 3);
+                  _733.x = (_734.x%_735.x);
+                  _733.y = (_734.y%_735.y);
+                  _733.z = (_734.z%_735.z);
+                  _733.w = (_734.w%_735.w);
+                int4 _736;
+                  int4 _737 = make_int4((((((int)threadIdx.x) * 4) + 1408))+(1*0), (((((int)threadIdx.x) * 4) + 1408))+(1*1), (((((int)threadIdx.x) * 4) + 1408))+(1*2), (((((int)threadIdx.x) * 4) + 1408))+(1*3));
+                  int4 _738 = make_int4(3, 3, 3, 3);
+                  _736.x = (_737.x/_738.x);
+                  _736.y = (_737.y/_738.y);
+                  _736.z = (_737.z/_738.z);
+                  _736.w = (_737.w/_738.w);
+                int4 _739;
+                ushort4 _740;
+                  ushort4 _741;
+                    ushort4 _742;
+                      int4 _743 = make_int4(3, 3, 3, 3);
+                      int4 _744 = make_int4(0, 0, 0, 0);
+                      _742.x = (_743.x&gt;=_744.x);
+                      _742.y = (_743.y&gt;=_744.y);
+                      _742.z = (_743.z&gt;=_744.z);
+                      _742.w = (_743.w&gt;=_744.w);
+                    ushort4 _745;
+                      int4 _746 = make_int4(0, 0, 0, 0);
+                      _745.x = (_733.x&gt;=_746.x);
+                      _745.y = (_733.y&gt;=_746.y);
+                      _745.z = (_733.z&gt;=_746.z);
+                      _745.w = (_733.w&gt;=_746.w);
+                    _741.x = (_742.x&amp;&amp;_745.x);
+                    _741.y = (_742.y&amp;&amp;_745.y);
+                    _741.z = (_742.z&amp;&amp;_745.z);
+                    _741.w = (_742.w&amp;&amp;_745.w);
+                  ushort4 _747;
+                    ushort4 _748;
+                      int4 _749 = make_int4(3, 3, 3, 3);
+                      int4 _750 = make_int4(0, 0, 0, 0);
+                      _748.x = (_749.x&lt;_750.x);
+                      _748.y = (_749.y&lt;_750.y);
+                      _748.z = (_749.z&lt;_750.z);
+                      _748.w = (_749.w&lt;_750.w);
+                    ushort4 _751;
+                      int4 _752 = make_int4(0, 0, 0, 0);
+                      _751.x = (_733.x&lt;=_752.x);
+                      _751.y = (_733.y&lt;=_752.y);
+                      _751.z = (_733.z&lt;=_752.z);
+                      _751.w = (_733.w&lt;=_752.w);
+                    _747.x = (_748.x&amp;&amp;_751.x);
+                    _747.y = (_748.y&amp;&amp;_751.y);
+                    _747.z = (_748.z&amp;&amp;_751.z);
+                    _747.w = (_748.w&amp;&amp;_751.w);
+                  _740.x = (_741.x||_747.x);
+                  _740.y = (_741.y||_747.y);
+                  _740.z = (_741.z||_747.z);
+                  _740.w = (_741.w||_747.w);
+                int4 _753;
+                  int4 _754 = make_int4(1, 1, 1, 1);
+                  _753.x = (_736.x-_754.x);
+                  _753.y = (_736.y-_754.y);
+                  _753.z = (_736.z-_754.z);
+                  _753.w = (_736.w-_754.w);
+                _739.x = (bool(_740.x)?_736.x:_753.x);
+                _739.y = (bool(_740.y)?_736.y:_753.y);
+                _739.z = (bool(_740.z)?_736.z:_753.z);
+                _739.w = (bool(_740.w)?_736.w:_753.w);
+                int4 _755 = make_int4(16, 16, 16, 16);
+                _732.x = (_739.x%_755.x);
+                _732.y = (_739.y%_755.y);
+                _732.z = (_739.z%_755.z);
+                _732.w = (_739.w%_755.w);
+              int4 _756;
+              ushort4 _757;
+                ushort4 _758;
+                  ushort4 _759;
+                    int4 _760 = make_int4(16, 16, 16, 16);
+                    int4 _761 = make_int4(0, 0, 0, 0);
+                    _759.x = (_760.x&gt;=_761.x);
+                    _759.y = (_760.y&gt;=_761.y);
+                    _759.z = (_760.z&gt;=_761.z);
+                    _759.w = (_760.w&gt;=_761.w);
+                  ushort4 _762;
+                    int4 _763 = make_int4(0, 0, 0, 0);
+                    _762.x = (_732.x&gt;=_763.x);
+                    _762.y = (_732.y&gt;=_763.y);
+                    _762.z = (_732.z&gt;=_763.z);
+                    _762.w = (_732.w&gt;=_763.w);
+                  _758.x = (_759.x&amp;&amp;_762.x);
+                  _758.y = (_759.y&amp;&amp;_762.y);
+                  _758.z = (_759.z&amp;&amp;_762.z);
+                  _758.w = (_759.w&amp;&amp;_762.w);
+                ushort4 _764;
+                  ushort4 _765;
+                    int4 _766 = make_int4(16, 16, 16, 16);
+                    int4 _767 = make_int4(0, 0, 0, 0);
+                    _765.x = (_766.x&lt;_767.x);
+                    _765.y = (_766.y&lt;_767.y);
+                    _765.z = (_766.z&lt;_767.z);
+                    _765.w = (_766.w&lt;_767.w);
+                  ushort4 _768;
+                    int4 _769 = make_int4(0, 0, 0, 0);
+                    _768.x = (_732.x&lt;=_769.x);
+                    _768.y = (_732.y&lt;=_769.y);
+                    _768.z = (_732.z&lt;=_769.z);
+                    _768.w = (_732.w&lt;=_769.w);
+                  _764.x = (_765.x&amp;&amp;_768.x);
+                  _764.y = (_765.y&amp;&amp;_768.y);
+                  _764.z = (_765.z&amp;&amp;_768.z);
+                  _764.w = (_765.w&amp;&amp;_768.w);
+                _757.x = (_758.x||_764.x);
+                _757.y = (_758.y||_764.y);
+                _757.z = (_758.z||_764.z);
+                _757.w = (_758.w||_764.w);
+              int4 _770;
+                int4 _771 = make_int4(16, 16, 16, 16);
+                _770.x = (_732.x+_771.x);
+                _770.y = (_732.y+_771.y);
+                _770.z = (_732.z+_771.z);
+                _770.w = (_732.w+_771.w);
+              _756.x = (bool(_757.x)?_732.x:_770.x);
+              _756.y = (bool(_757.y)?_732.y:_770.y);
+              _756.z = (bool(_757.z)?_732.z:_770.z);
+              _756.w = (bool(_757.w)?_732.w:_770.w);
+              int4 _772 = make_int4(9, 9, 9, 9);
+              _731.x = (_756.x*_772.x);
+              _731.y = (_756.y*_772.y);
+              _731.z = (_756.z*_772.z);
+              _731.w = (_756.w*_772.w);
+            _729.x = (_730.x+_731.x);
+            _729.y = (_730.y+_731.y);
+            _729.z = (_730.z+_731.z);
+            _729.w = (_730.w+_731.w);
+          int4 _773 = make_int4((ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3), (ry_outer_outer * 3));
+          _728.x = (_729.x+_773.x);
+          _728.y = (_729.y+_773.y);
+          _728.z = (_729.z+_773.z);
+          _728.w = (_729.w+_773.w);
+        int4 _774;
+          int4 _775 = make_int4(((((int)threadIdx.x) + 352))+(1*0), ((((int)threadIdx.x) + 352))+(1*1), ((((int)threadIdx.x) + 352))+(1*2), ((((int)threadIdx.x) + 352))+(1*3));
+          int4 _776 = make_int4(3, 3, 3, 3);
+          _774.x = (_775.x%_776.x);
+          _774.y = (_775.y%_776.y);
+          _774.z = (_775.z%_776.z);
+          _774.w = (_775.w%_776.w);
+        int4 _777;
+        ushort4 _778;
+          ushort4 _779;
+            ushort4 _780;
+              int4 _781 = make_int4(3, 3, 3, 3);
+              int4 _782 = make_int4(0, 0, 0, 0);
+              _780.x = (_781.x&gt;=_782.x);
+              _780.y = (_781.y&gt;=_782.y);
+              _780.z = (_781.z&gt;=_782.z);
+              _780.w = (_781.w&gt;=_782.w);
+            ushort4 _783;
+              int4 _784 = make_int4(0, 0, 0, 0);
+              _783.x = (_774.x&gt;=_784.x);
+              _783.y = (_774.y&gt;=_784.y);
+              _783.z = (_774.z&gt;=_784.z);
+              _783.w = (_774.w&gt;=_784.w);
+            _779.x = (_780.x&amp;&amp;_783.x);
+            _779.y = (_780.y&amp;&amp;_783.y);
+            _779.z = (_780.z&amp;&amp;_783.z);
+            _779.w = (_780.w&amp;&amp;_783.w);
+          ushort4 _785;
+            ushort4 _786;
+              int4 _787 = make_int4(3, 3, 3, 3);
+              int4 _788 = make_int4(0, 0, 0, 0);
+              _786.x = (_787.x&lt;_788.x);
+              _786.y = (_787.y&lt;_788.y);
+              _786.z = (_787.z&lt;_788.z);
+              _786.w = (_787.w&lt;_788.w);
+            ushort4 _789;
+              int4 _790 = make_int4(0, 0, 0, 0);
+              _789.x = (_774.x&lt;=_790.x);
+              _789.y = (_774.y&lt;=_790.y);
+              _789.z = (_774.z&lt;=_790.z);
+              _789.w = (_774.w&lt;=_790.w);
+            _785.x = (_786.x&amp;&amp;_789.x);
+            _785.y = (_786.y&amp;&amp;_789.y);
+            _785.z = (_786.z&amp;&amp;_789.z);
+            _785.w = (_786.w&amp;&amp;_789.w);
+          _778.x = (_779.x||_785.x);
+          _778.y = (_779.y||_785.y);
+          _778.z = (_779.z||_785.z);
+          _778.w = (_779.w||_785.w);
+        int4 _791;
+          int4 _792 = make_int4(3, 3, 3, 3);
+          _791.x = (_774.x+_792.x);
+          _791.y = (_774.y+_792.y);
+          _791.z = (_774.z+_792.z);
+          _791.w = (_774.w+_792.w);
+        _777.x = (bool(_778.x)?_774.x:_791.x);
+        _777.y = (bool(_778.y)?_774.y:_791.y);
+        _777.z = (bool(_778.z)?_774.z:_791.z);
+        _777.w = (bool(_778.w)?_774.w:_791.w);
+        _727.x = (_728.x+_777.x);
+        _727.y = (_728.y+_777.y);
+        _727.z = (_728.z+_777.z);
+        _727.w = (_728.w+_777.w);
+      *(float4*)(kernel_shared + ((((int)threadIdx.x) * 4) + 1408)) = make_float4(kernel[_727.x],kernel[_727.y],kernel[_727.z],kernel[_727.w]);
       __syncthreads();
-      for (int rc_outer_inner = 0; rc_outer_inner &lt; 16; ++rc_outer_inner) {
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7))] * kernel_shared[(((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6))]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7))] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 384)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 1)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 385)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 2)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 386)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 3)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 387)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 4)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 388)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 5)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 126) + (((((int)threadIdx.x) % 49) / 7) * 9)) + (((int)threadIdx.x) % 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 6)) + 389)]));
+      for (int rc_outer_inner = 0; rc_outer_inner &lt; 8; ++rc_outer_inner) {
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(rc_outer_inner * 18)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 1)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 2)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 3)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 4)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 5)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 6)] * kernel_shared[((((int)threadIdx.x) * 48) + (rc_outer_inner * 6))]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 1)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 2)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 3)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 4)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 5)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 6)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 7)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 1)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 2)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 3)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 4)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 5)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 6)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 7)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 8)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 2)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 9)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 10)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 11)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 12)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 13)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 14)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 15)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 3)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 10)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 11)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 12)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 13)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 14)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 15)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 16)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 4)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 18) + 11)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 18) + 12)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((rc_outer_inner * 18) + 13)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((rc_outer_inner * 18) + 14)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((rc_outer_inner * 18) + 15)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((rc_outer_inner * 18) + 16)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((rc_outer_inner * 18) + 17)] * kernel_shared[(((((int)threadIdx.x) * 48) + (rc_outer_inner * 6)) + 5)]));
       }
     }
   }
-  compute[((((int)blockIdx.x) * 392) + ((int)threadIdx.x))] = max((conv2d_nchw[0] + bias[((((int)blockIdx.x) * 8) + (((int)threadIdx.x) / 49))]), 0.000000e+00f);
-  compute[(((((int)blockIdx.x) * 392) + ((int)threadIdx.x)) + 196)] = max((conv2d_nchw[1] + bias[(((((int)blockIdx.x) * 8) + (((int)threadIdx.x) / 49)) + 4)]), 0.000000e+00f);
+  for (int i3_inner = 0; i3_inner &lt; 7; ++i3_inner) {
+    compute[(((((((int)blockIdx.x) / 7) * 1568) + (((int)threadIdx.x) * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[i3_inner] + bias[(((((int)blockIdx.x) / 7) * 32) + ((int)threadIdx.x))]), 0.000000e+00f);
+  }
 }
 </pre></div>
 </div>
@@ -774,7 +3336,7 @@ In the example below we resume the status and do more 5 trials.</p>
 Get devices for measurement successfully!
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  35.530 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  34.321 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/e3e540f3b477c0c52d8eb73e674e8ffd/tune_conv2d_layer_cuda.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_conv2d_layer_cuda.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
index 3232d60b6..98472f75a 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
@@ -901,7 +901,7 @@ so we can read the log file and load the best schedules.</p>
 Evaluate inference time cost...
 Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-   9.6964       9.7275       9.7331       9.6288       0.0479
+   9.8793       9.8894       9.9270       9.8216       0.0436
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
index db7a6dc1d..fb304af72 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
@@ -920,7 +920,7 @@ so we can read the log file and load the best schedules.</p>
 Evaluate inference time cost...
 Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  752.6694     752.9576     753.1499     751.9008      0.5491
+  757.9333     757.6261     759.1621     757.0116      0.9044
 </pre></div>
 </div>
 </div>
@@ -942,7 +942,7 @@ to learn how to use the RPC Tracker and RPC Server.
 To use the RPC Tracker in auto-scheduler, replace the runner in <code class="code docutils literal notranslate"><span class="pre">TuningOptions</span></code>
 with <a class="reference internal" href="../../reference/api/python/auto_scheduler.html#tvm.auto_scheduler.RPCRunner" title="tvm.auto_scheduler.RPCRunner"><code class="xref any py py-class docutils literal notranslate"><span class="pre">auto_scheduler.RPCRunner</span></code></a>.</p></li>
 </ol>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  19.801 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  20.685 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-network-x86-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/e416b94ca1090b0897c0f6e0df95b911/tune_network_x86.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_network_x86.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
index cb6ecd47f..3a5b75b97 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
@@ -620,12 +620,12 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
              placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [65536], []),
              compute: Buffer(compute_2: Pointer(float32), float32, [65536], [])}
   buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute}
-  preflattened_buffer_map = {placeholder_5: placeholder_15: Buffer(placeholder_10, float32, [128, 256], []), placeholder_7: placeholder_16: Buffer(placeholder_12, int32, [4916], []), placeholder_8: placeholder_17: Buffer(placeholder_13, int32, [33], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_9: placeholder_18: Buffer(placeholder_14, float32, [128, 512], []), placeholder_6: placeholder_19: Buffer(placeholder_11, float32, [4916, 16, 1], [])} {
+  preflattened_buffer_map = {placeholder_7: placeholder_15: Buffer(placeholder_12, int32, [4916], []), placeholder_6: placeholder_16: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_8: placeholder_17: Buffer(placeholder_13, int32, [33], []), placeholder_5: placeholder_18: Buffer(placeholder_10, float32, [128, 256], []), placeholder_9: placeholder_19: Buffer(placeholder_14, float32, [128, 512], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], [])} {
   for (i0.outer.i1.outer.fused: int32, 0, 32) &quot;parallel&quot; {
     allocate(compute_4: Pointer(global float32), float32, [2048]), storage_scope = global {
-      for (nb_j.inner: int32, 0, 2) {
-        for (i.inner.init: int32, 0, 64) {
-          let cse_var_1: int32 = ((i.inner.init*32) + (nb_j.inner*16))
+      for (i.outer.inner: int32, 0, 4) {
+        for (i.inner.init: int32, 0, 32) {
+          let cse_var_1: int32 = ((i.outer.inner*512) + (i.inner.init*16))
            {
             compute_5: Buffer(compute_4, float32, [2048], [])[cse_var_1] = 0f32
             compute_5[(cse_var_1 + 1)] = 0f32
@@ -645,51 +645,78 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
             compute_5[(cse_var_1 + 15)] = 0f32
           }
         }
-        for (elem_idx: int32, 0, let cse_var_2: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner) in (placeholder_3[(cse_var_2 + 1)] - placeholder_3[cse_var_2])) {
-          for (i.inner: int32, 0, 64) {
-            let cse_var_21: int32 = (elem_idx*16)
-            let cse_var_20: int32 = ((i.inner*32) + (nb_j.inner*16))
-            let cse_var_19: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)
-            let cse_var_18: int32 = ((floordiv(i0.outer.i1.outer.fused, 16)*16384) + (i.inner*256))
-            let cse_var_17: int32 = (cse_var_20 + 9)
-            let cse_var_16: int32 = (cse_var_20 + 8)
-            let cse_var_15: int32 = (cse_var_20 + 7)
-            let cse_var_14: int32 = (cse_var_20 + 6)
-            let cse_var_13: int32 = (cse_var_20 + 5)
-            let cse_var_12: int32 = (cse_var_20 + 4)
-            let cse_var_11: int32 = (cse_var_20 + 3)
-            let cse_var_10: int32 = (cse_var_20 + 2)
-            let cse_var_9: int32 = (cse_var_20 + 15)
-            let cse_var_8: int32 = (cse_var_20 + 14)
-            let cse_var_7: int32 = (cse_var_20 + 13)
-            let cse_var_6: int32 = (cse_var_20 + 12)
-            let cse_var_5: int32 = (cse_var_20 + 11)
-            let cse_var_4: int32 = (cse_var_20 + 10)
-            let cse_var_3: int32 = (cse_var_20 + 1)
-             {
-              compute_5[cse_var_20] = (compute_5[cse_var_20] + (placeholder_1[((placeholder_3[cse_var_19]*16) + cse_var_21)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 1)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 2)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 3)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 4)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 5)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 6)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 7)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 8)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 9)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 10)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 11)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 12)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 13)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 14)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 15)]*max(placeholder[(cse_var_18 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
+        for (elem_idx: int32, 0, (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])) {
+          for (i.inner: int32, 0, 32) {
+            if @tir.likely((elem_idx &lt; (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+              let cse_var_2: int32 = ((i.outer.inner*512) + (i.inner*16))
+              compute_5[cse_var_2] = (compute_5[cse_var_2] + (placeholder_1[((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16))]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+            }
+            if @tir.likely((elem_idx &lt; (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+              let cse_var_3: int32 = (((i.outer.inner*512) + (i.inner*16)) + 1)
+              compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 1)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+            }
+            if @tir.likely((elem_idx &lt; (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+              let cse_var_4: int32 = (((i.outer.inner*512) + (i.inner*16)) + 2)
+              compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 2)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+            }
+            if @tir.likely((elem_idx &lt; (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+              let cse_var_5: int32 = (((i.outer.inner*512) + (i.inner*16)) + 3)
+              compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 3)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+            }
+            if @tir.likely((elem_idx &lt; (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+              let cse_var_6: int32 = (((i.outer.inner*512) + (i.inner*16)) + 4)
+              compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 4)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+            }
+            if @tir.likely((elem_idx &lt; (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+              let cse_var_7: int32 = (((i.outer.inner*512) + (i.inner*16)) + 5)
+              compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 5)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+            }
+            if @tir.likely((elem_idx &lt; (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+              let cse_var_8: int32 = (((i.outer.inner*512) + (i.inner*16)) + 6)
+              compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 6)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+            }
+            if @tir.likely((elem_idx &lt; (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+              let cse_var_9: int32 = (((i.outer.inner*512) + (i.inner*16)) + 7)
+              compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 7)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+            }
+            if @tir.likely((elem_idx &lt; (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+              let cse_var_10: int32 = (((i.outer.inner*512) + (i.inner*16)) + 8)
+              compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 8)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+            }
+            if @tir.likely((elem_idx &lt; (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+              let cse_var_11: int32 = (((i.outer.inner*512) + (i.inner*16)) + 9)
+              compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 9)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+            }
+            if @tir.likely((elem_idx &lt; (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+              let cse_var_12: int32 = (((i.outer.inner*512) + (i.inner*16)) + 10)
+              compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 10)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+            }
+            if @tir.likely((elem_idx &lt; (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+              let cse_var_13: int32 = (((i.outer.inner*512) + (i.inner*16)) + 11)
+              compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 11)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+            }
+            if @tir.likely((elem_idx &lt; (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+              let cse_var_14: int32 = (((i.outer.inner*512) + (i.inner*16)) + 12)
+              compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 12)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+            }
+            if @tir.likely((elem_idx &lt; (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+              let cse_var_15: int32 = (((i.outer.inner*512) + (i.inner*16)) + 13)
+              compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 13)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+            }
+            if @tir.likely((elem_idx &lt; (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+              let cse_var_16: int32 = (((i.outer.inner*512) + (i.inner*16)) + 14)
+              compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 14)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+            }
+            if @tir.likely((elem_idx &lt; (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+              let cse_var_17: int32 = (((i.outer.inner*512) + (i.inner*16)) + 15)
+              compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 15)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
             }
           }
         }
       }
-      for (i0.inner: int32, 0, 64) {
-        let cse_var_22: int32 = (((floordiv(i0.outer.i1.outer.fused, 16)*32768) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32))
-        compute[ramp(cse_var_22, 1, 32)] = max((compute_5[ramp((i0.inner*32), 1, 32)] + placeholder_4[ramp(cse_var_22, 1, 32)]), broadcast(0f32, 32))
+      for (i0.inner: int32, 0, 128) {
+        let cse_var_18: int32 = ((i0.inner*512) + (i0.outer.i1.outer.fused*16))
+        compute[ramp(cse_var_18, 1, 16)] = max((compute_5[ramp((i0.inner*16), 1, 16)] + placeholder_4[ramp(cse_var_18, 1, 16)]), broadcast(0f32, 16))
       }
     }
   }
@@ -727,7 +754,7 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
 <span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.855 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.730 ms
 </pre></div>
 </div>
 <div class="admonition note">
diff --git a/docs/how_to/tune_with_autotvm/sg_execution_times.html b/docs/how_to/tune_with_autotvm/sg_execution_times.html
index 7f5f3fcf3..c40dad72e 100644
--- a/docs/how_to/tune_with_autotvm/sg_execution_times.html
+++ b/docs/how_to/tune_with_autotvm/sg_execution_times.html
@@ -322,7 +322,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-tune-with-autotvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:43.277</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
+<p><strong>00:43.532</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 84%" />
@@ -331,11 +331,11 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-conv2d-cuda-py"><span class="std std-ref">Tuning High Performance Convolution on NVIDIA GPUs</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_cuda.py</span></code>)</p></td>
-<td><p>00:43.248</p></td>
+<td><p>00:43.499</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tune_relay_x86.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-x86-py"><span class="std std-ref">Auto-tuning a Convolutional Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_x86.py</span></code>)</p></td>
-<td><p>00:00.015</p></td>
+<td><p>00:00.019</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_relay_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-cuda-py"><span class="std std-ref">Auto-tuning a Convolutional Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_cuda.py</span></code>)</p></td>
diff --git a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
index 9b50d4e52..bb763fa2c 100644
--- a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
+++ b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
@@ -1164,8 +1164,8 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 4, 32]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 1, 128]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2885496
-No: 6   GFLOPS: 110.83/110.83   result: MeasureResult(costs=(0.002088788229166667,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.8140833377838135, timestamp=1655930458.0250723)       [(&#39;tile_f&#39;, [-1, 1, 1, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 4, 4]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,3754080
-No: 7   GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+No: 6   GFLOPS: 110.46/110.46   result: MeasureResult(costs=(0.00209571425,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.82222318649292, timestamp=1655930909.2240996)        [(&#39;tile_f&#39;, [-1, 1, 1, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 4, 4]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,3754080
+No: 7   GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1288,7 +1288,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 16, 32]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 256, 1]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,6225319
-No: 8   GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+No: 8   GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1411,7 +1411,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 2, 1, 32]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 8, 64]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,943546
-No: 9   GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+No: 9   GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1534,7 +1534,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 16, 4]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 16, 32]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2868708
-No: 10  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+No: 10  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 142, in build
     res = future.result()
   File &quot;/usr/lib/python3.7/concurrent/futures/_base.py&quot;, line 435, in result
@@ -1552,7 +1552,7 @@ No: 10  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
 TimeoutError
 
         [(&#39;tile_f&#39;, [-1, 32, 2, 4]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 4, 2]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,4691833
-No: 11  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+No: 11  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1675,7 +1675,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 2, 64]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 4, 4]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,1042124
-No: 12  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+No: 12  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1798,7 +1798,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 32, 1, 4]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 32, 16]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,10013405
-No: 13  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+No: 13  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1921,7 +1921,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 8, 8, 2]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 4, 32]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,6732082
-No: 14  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+No: 14  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -2044,7 +2044,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 2, 4, 32]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 4, 128]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 1)],None,7536735
-No: 15  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+No: 15  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -2167,7 +2167,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 2, 1, 4]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 128, 4]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,482121
-No: 16  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+No: 16  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -2290,7 +2290,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 2, 1, 16]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 32, 8]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2824525
-No: 17  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+No: 17  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -2413,7 +2413,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 64, 1, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 8, 8]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,4559286
-No: 18  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+No: 18  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -2536,7 +2536,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 32, 16]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 1, 512]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9677544
-No: 19  GFLOPS: 0.00/110.83     result: Traceback (most recent call last):
+No: 19  GFLOPS: 0.00/110.46     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 738, in __call__
     yield remote, remote.load_module(os.path.split(build_result.filename)[1])
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 702, in run_through_rpc
@@ -2624,7 +2624,7 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
   15: _PyEval_EvalFrameDefault
   14: 0x0000000000537c30
   13: _PyObject_FastCallKeywords
-  12: 0x00007ffab91b6fa2
+  12: 0x00007f97971cffa2
   11: _ctypes_callproc
   10: ffi_call
   9: ffi_call_unix64
@@ -2689,7 +2689,7 @@ Traceback (most recent call last):
   21: _PyFunction_FastCallKeywords
   20: _PyEval_EvalFrameDefault
   19: _PyFunction_FastCall      [(&#39;tile_f&#39;, [-1, 8, 2, 16]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 1, 1]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,6390073
-No: 20  GFLOPS: 144.17/144.17   result: MeasureResult(costs=(0.00160570644,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4077048301696777, timestamp=1655930484.4690123)      [(&#39;tile_f&#39;, [-1, 1, 4, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 4, 1]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9881539
+No: 20  GFLOPS: 144.77/144.77   result: MeasureResult(costs=(0.00159906648,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4326999187469482, timestamp=1655930935.759078)       [(&#39;tile_f&#39;, [-1, 1, 4, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 4, 1]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9881539
 </pre></div>
 </div>
 <p>Finally we can inspect the best config from log file, check correctness,
@@ -2730,7 +2730,7 @@ and measure running time.</p>
 Best config:
 [(&#39;tile_f&#39;, [-1, 1, 4, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 4, 1]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9881539
 Finish loading 20 records
-Time cost of this operator: 0.001957
+Time cost of this operator: 0.002022
 </pre></div>
 </div>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autotvm-tune-conv2d-cuda-py">
diff --git a/docs/how_to/work_with_microtvm/micro_autotune.html b/docs/how_to/work_with_microtvm/micro_autotune.html
index b6173a354..a1192dcba 100644
--- a/docs/how_to/work_with_microtvm/micro_autotune.html
+++ b/docs/how_to/work_with_microtvm/micro_autotune.html
@@ -578,10 +578,10 @@ the tuned operator.</p>
 ########## Build without Autotuning ##########
 Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs
 ---------                                     ---                                           --------  -------  -----              ------  -------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  318.6     98.76    (1, 2, 10, 10, 3)  2       1
-tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.076     0.953    (1, 6, 10, 10)     1       1
-tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.924     0.286    (1, 1, 10, 10, 3)  1       1
-Total_time                                    -                                             322.6     -        -                  -       -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  315.3     98.749   (1, 2, 10, 10, 3)  2       1
+tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.079     0.964    (1, 6, 10, 10)     1       1
+tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.916     0.287    (1, 1, 10, 10, 3)  1       1
+Total_time                                    -                                             319.295   -        -                  -       -
 </pre></div>
 </div>
 </div>
@@ -634,10 +634,10 @@ Total_time                                    -
 ########## Build with Autotuning ##########
 Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs
 ---------                                     ---                                           --------  -------  -----              ------  -------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  190.6     98.591   (1, 1, 10, 10, 6)  2       1
-tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.902     0.984    (1, 6, 10, 10)     1       1
-tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.823     0.426    (1, 3, 10, 10, 1)  1       1
-Total_time                                    -                                             193.325   -        -                  -       -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  135.1     98.066   (1, 6, 10, 10, 1)  2       1
+tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.748     1.269    (1, 6, 10, 10)     1       1
+tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.916     0.665    (1, 1, 10, 10, 3)  1       1
+Total_time                                    -                                             137.765   -        -                  -       -
 </pre></div>
 </div>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-autotune-py">
diff --git a/docs/how_to/work_with_microtvm/micro_train.html b/docs/how_to/work_with_microtvm/micro_train.html
index 71e46e5e4..4fe789120 100644
--- a/docs/how_to/work_with_microtvm/micro_train.html
+++ b/docs/how_to/work_with_microtvm/micro_train.html
@@ -510,7 +510,7 @@ take about <strong>2 minutes</strong> to download the Stanford Cars, while COCO
 <a href="https://docs.python.org/3/library/shutil.html#shutil.move" title="shutil.move" class="sphx-glr-backref-module-shutil sphx-glr-backref-type-py-function"><span class="n">shutil</span><span class="o">.</span><span class="n">move</span></a><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><a href="https://docs.python.org/3/library/stdtypes.html#str" title="builtins.str" class="sphx-glr-backref-module-builtins sphx-glr-backref-typ [...]
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>&#39;/tmp/tmp5adly3xq/images/random&#39;
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>&#39;/tmp/tmptfwkoswh/images/random&#39;
 </pre></div>
 </div>
 </div>
@@ -570,8 +570,8 @@ objects to other stuff? We can display some examples from our datasets using <co
     <span class="n">plt</span><span class="o">.</span><span class="n">axis</span><span class="p">(</span><span class="s2">&quot;off&quot;</span><span class="p">)</span>
 </pre></div>
 </div>
-<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmp5adly3xq/images/target contains 8144 images
-/tmp/tmp5adly3xq/images/random contains 5000 images
+<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmptfwkoswh/images/target contains 8144 images
+/tmp/tmptfwkoswh/images/random contains 5000 images
 </pre></div>
 </div>
 </div>
@@ -683,13 +683,13 @@ the time on our validation set).</p>
 </pre></div>
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Epoch 1/3
-328/328 - 55s - loss: 0.2082 - accuracy: 0.9277 - val_loss: 0.1597 - val_accuracy: 0.9535
+328/328 - 55s - loss: 0.2163 - accuracy: 0.9270 - val_loss: 0.1386 - val_accuracy: 0.9528
 Epoch 2/3
-328/328 - 52s - loss: 0.0971 - accuracy: 0.9631 - val_loss: 0.1233 - val_accuracy: 0.9630
+328/328 - 52s - loss: 0.0909 - accuracy: 0.9670 - val_loss: 0.1189 - val_accuracy: 0.9581
 Epoch 3/3
-328/328 - 52s - loss: 0.0650 - accuracy: 0.9757 - val_loss: 0.1162 - val_accuracy: 0.9600
+328/328 - 52s - loss: 0.0696 - accuracy: 0.9729 - val_loss: 0.1105 - val_accuracy: 0.9641
 
-&lt;keras.callbacks.History object at 0x7f7ed75d5e10&gt;
+&lt;keras.callbacks.History object at 0x7f489d506f10&gt;
 </pre></div>
 </div>
 </div>
@@ -951,7 +951,7 @@ as intended.</p>
 <p>From here, we could modify the model to read live images from the camera - we have another
 Arduino tutorial for how to do that <a class="reference external" href="https://github.com/guberti/tvm-arduino-demos/tree/master/examples/person_detection">on GitHub</a>. Alternatively, we could also
 <a class="reference external" href="https://tvm.apache.org/docs/how_to/work_with_microtvm/micro_autotune.html">use TVM’s autotuning capabilities</a> to dramatically improve the model’s performance.</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 8 minutes  7.957 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 10 minutes  12.692 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-train-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/b52cec46baf4f78d6bcd94cbe269c8a6/micro_train.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">micro_train.py</span></code></a></p>
diff --git a/docs/how_to/work_with_microtvm/sg_execution_times.html b/docs/how_to/work_with_microtvm/sg_execution_times.html
index 4bffce807..6c977af94 100644
--- a/docs/how_to/work_with_microtvm/sg_execution_times.html
+++ b/docs/how_to/work_with_microtvm/sg_execution_times.html
@@ -322,7 +322,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-work-with-microtvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>08:52.919</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
+<p><strong>10:58.917</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 83%" />
@@ -331,15 +331,15 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="micro_train.html#sphx-glr-how-to-work-with-microtvm-micro-train-py"><span class="std std-ref">Training Vision Models for microTVM on Arduino</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_train.py</span></code>)</p></td>
-<td><p>08:07.957</p></td>
+<td><p>10:12.692</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="micro_autotune.html#sphx-glr-how-to-work-with-microtvm-micro-autotune-py"><span class="std std-ref">Autotuning with microTVM</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_autotune.py</span></code>)</p></td>
-<td><p>00:41.550</p></td>
+<td><p>00:42.778</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="micro_tflite.html#sphx-glr-how-to-work-with-microtvm-micro-tflite-py"><span class="std std-ref">microTVM with TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_tflite.py</span></code>)</p></td>
-<td><p>00:03.412</p></td>
+<td><p>00:03.447</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="micro_ethosu.html#sphx-glr-how-to-work-with-microtvm-micro-ethosu-py"><span class="std std-ref">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_ethosu.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_relay/sg_execution_times.html b/docs/how_to/work_with_relay/sg_execution_times.html
index 2bb2fe3d9..17e787639 100644
--- a/docs/how_to/work_with_relay/sg_execution_times.html
+++ b/docs/how_to/work_with_relay/sg_execution_times.html
@@ -322,7 +322,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-work-with-relay-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:11.361</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
+<p><strong>00:11.381</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 83%" />
@@ -331,11 +331,11 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="using_external_lib.html#sphx-glr-how-to-work-with-relay-using-external-lib-py"><span class="std std-ref">Using External Libraries in Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_external_lib.py</span></code>)</p></td>
-<td><p>00:09.853</p></td>
+<td><p>00:09.879</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="build_gcn.html#sphx-glr-how-to-work-with-relay-build-gcn-py"><span class="std std-ref">Building a Graph Convolutional Network</span></a> (<code class="docutils literal notranslate"><span class="pre">build_gcn.py</span></code>)</p></td>
-<td><p>00:01.502</p></td>
+<td><p>00:01.496</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="using_relay_viz.html#sphx-glr-how-to-work-with-relay-using-relay-viz-py"><span class="std std-ref">Use Relay Visualizer to Visualize Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_relay_viz.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_schedules/intrin_math.html b/docs/how_to/work_with_schedules/intrin_math.html
index 4d6c477b4..e5281259e 100644
--- a/docs/how_to/work_with_schedules/intrin_math.html
+++ b/docs/how_to/work_with_schedules/intrin_math.html
@@ -515,7 +515,7 @@ The following example customizes CUDA lowering rule for <code class="code docuti
 <a href="../../reference/api/python/ir.html#tvm.ir.register_intrin_lowering" title="tvm.ir.register_intrin_lowering" class="sphx-glr-backref-module-tvm-ir sphx-glr-backref-type-py-function"><span class="n">register_intrin_lowering</span></a><span class="p">(</span><span class="s2">&quot;tir.exp&quot;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&quot;cuda&quot;</span><span class="p">,</span> <span class="n">f</span><span class="o">= [...]
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>&lt;function my_cuda_math_rule at 0x7f7e3fb81440&gt;
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>&lt;function my_cuda_math_rule at 0x7f4810e0c950&gt;
 </pre></div>
 </div>
 <p>Register the rule to TVM with override option to override existing rule.
diff --git a/docs/how_to/work_with_schedules/sg_execution_times.html b/docs/how_to/work_with_schedules/sg_execution_times.html
index cc0a25541..c8f17193d 100644
--- a/docs/how_to/work_with_schedules/sg_execution_times.html
+++ b/docs/how_to/work_with_schedules/sg_execution_times.html
@@ -322,7 +322,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-work-with-schedules-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:04.002</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
+<p><strong>00:03.981</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 83%" />
@@ -331,19 +331,19 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="intrin_math.html#sphx-glr-how-to-work-with-schedules-intrin-math-py"><span class="std std-ref">Intrinsics and Math Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">intrin_math.py</span></code>)</p></td>
-<td><p>00:01.863</p></td>
+<td><p>00:01.854</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tensorize.html#sphx-glr-how-to-work-with-schedules-tensorize-py"><span class="std std-ref">Use Tensorize to Leverage Hardware Intrinsics</span></a> (<code class="docutils literal notranslate"><span class="pre">tensorize.py</span></code>)</p></td>
-<td><p>00:00.949</p></td>
+<td><p>00:00.930</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="reduction.html#sphx-glr-how-to-work-with-schedules-reduction-py"><span class="std std-ref">Reduction</span></a> (<code class="docutils literal notranslate"><span class="pre">reduction.py</span></code>)</p></td>
-<td><p>00:00.514</p></td>
+<td><p>00:00.522</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="scan.html#sphx-glr-how-to-work-with-schedules-scan-py"><span class="std std-ref">Scan and Recurrent Kernel</span></a> (<code class="docutils literal notranslate"><span class="pre">scan.py</span></code>)</p></td>
-<td><p>00:00.505</p></td>
+<td><p>00:00.503</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="extern_op.html#sphx-glr-how-to-work-with-schedules-extern-op-py"><span class="std std-ref">External Tensor Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">extern_op.py</span></code>)</p></td>
@@ -355,11 +355,11 @@
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="tedd.html#sphx-glr-how-to-work-with-schedules-tedd-py"><span class="std std-ref">Use Tensor Expression Debug Display (TEDD) for Visualization</span></a> (<code class="docutils literal notranslate"><span class="pre">tedd.py</span></code>)</p></td>
-<td><p>00:00.026</p></td>
+<td><p>00:00.027</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tuple_inputs.html#sphx-glr-how-to-work-with-schedules-tuple-inputs-py"><span class="std std-ref">Compute and Reduce with Tuple Inputs</span></a> (<code class="docutils literal notranslate"><span class="pre">tuple_inputs.py</span></code>)</p></td>
-<td><p>00:00.013</p></td>
+<td><p>00:00.012</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/work_with_schedules/tensorize.html b/docs/how_to/work_with_schedules/tensorize.html
index 57a0c20b0..4ae758169 100644
--- a/docs/how_to/work_with_schedules/tensorize.html
+++ b/docs/how_to/work_with_schedules/tensorize.html
@@ -571,7 +571,7 @@ The importing needs to happen before the tensorized GEMV being executed.</p>
              C: Buffer(C_2: Pointer(float32), float32, [524288], [])}
   buffer_map = {A_1: A, B_1: B, C_1: C}
   preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [1024, 64], []), B_1: B_3: Buffer(B_2, float32, [512, 64], []), C_1: C_3: Buffer(C_2, float32, [1024, 512], [])} {
-  attr [IterVar(i: int32, (nullptr), &quot;DataPar&quot;, &quot;&quot;)] &quot;pragma_import_llvm&quot; = &quot;; ModuleID = &#39;/tmp/tmpa_jm11ei/input0.cc&#39;\nsource_filename = \&quot;/tmp/tmpa_jm11ei/input0.cc\&quot;\ntarget datalayout = \&quot;e-m:e-i64:64-f80:128-n8:16:32:64-S128\&quot;\ntarget triple = \&quot;x86_64-pc-linux-gnu\&quot;\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = allo [...]
+  attr [IterVar(i: int32, (nullptr), &quot;DataPar&quot;, &quot;&quot;)] &quot;pragma_import_llvm&quot; = &quot;; ModuleID = &#39;/tmp/tmpbs1xqx8t/input0.cc&#39;\nsource_filename = \&quot;/tmp/tmpbs1xqx8t/input0.cc\&quot;\ntarget datalayout = \&quot;e-m:e-i64:64-f80:128-n8:16:32:64-S128\&quot;\ntarget triple = \&quot;x86_64-pc-linux-gnu\&quot;\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = allo [...]
   for (i, 0, 1024) {
     for (j.outer: int32, 0, 32) {
       @tir.call_extern(&quot;gemv_update&quot;, @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_2, ((i*512) + (j.outer*16)), 16, 2, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_2, (i*64), 64, 1, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_2, (j.outer*1024), 1024, 1, dtype=handle), 16, 64, 64, dtype=int32)
diff --git a/docs/reference/api/python/auto_scheduler.html b/docs/reference/api/python/auto_scheduler.html
index c1bb074da..2f779f220 100644
--- a/docs/reference/api/python/auto_scheduler.html
+++ b/docs/reference/api/python/auto_scheduler.html
@@ -1737,7 +1737,7 @@ Can be the a function or the function name.</p></li>
 
 <dl class="py function">
 <dt class="sig sig-object py" id="tvm.auto_scheduler.auto_schedule">
-<span class="sig-prename descclassname"><span class="pre">tvm.auto_scheduler.</span></span><span class="sig-name descname"><span class="pre">auto_schedule</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">task</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">search_policy</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em clas [...]
+<span class="sig-prename descclassname"><span class="pre">tvm.auto_scheduler.</span></span><span class="sig-name descname"><span class="pre">auto_schedule</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">task</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">search_policy</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em clas [...]
 <dd><p>THIS API IS DEPRECATED.</p>
 <p>Run auto scheduling search for a task.</p>
 <dl class="field-list simple">
@@ -1774,7 +1774,7 @@ the initial naive schedule (state).</p>
 
 <dl class="py class">
 <dt class="sig sig-object py" id="tvm.auto_scheduler.SketchPolicy">
-<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">tvm.auto_scheduler.</span></span><span class="sig-name descname"><span class="pre">SketchPolicy</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">task</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">program_cost_model</span></span><span class="o"><span class="pre">=</span></span><span class="defau [...]
+<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">tvm.auto_scheduler.</span></span><span class="sig-name descname"><span class="pre">SketchPolicy</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">task</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">program_cost_model</span></span><span class="o"><span class="pre">=</span></span><span class="defau [...]
 <dd><p>The search policy that searches in a hierarchical search space defined by sketches.
 The policy randomly samples programs from the space defined by sketches and use evolutionary
 search to fine-tune them.</p>
diff --git a/docs/reference/api/typedoc/classes/bytestreamreader.html b/docs/reference/api/typedoc/classes/bytestreamreader.html
index 17557ffc4..c5756b9bf 100644
--- a/docs/reference/api/typedoc/classes/bytestreamreader.html
+++ b/docs/reference/api/typedoc/classes/bytestreamreader.html
@@ -119,7 +119,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/rpc_server.ts#L43">rpc_server.ts:43</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/rpc_server.ts#L43">rpc_server.ts:43</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-parameters-title">Parameters</h4>
@@ -141,7 +141,7 @@
 					<div class="tsd-signature tsd-kind-icon">bytes<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">Uint8Array</span></div>
 					<aside class="tsd-sources">
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/rpc_server.ts#L43">rpc_server.ts:43</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/rpc_server.ts#L43">rpc_server.ts:43</a></li>
 						</ul>
 					</aside>
 				</section>
@@ -151,7 +151,7 @@
 					<div class="tsd-signature tsd-kind-icon">offset<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">number</span><span class="tsd-signature-symbol"> = 0</span></div>
 					<aside class="tsd-sources">
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/rpc_server.ts#L42">rpc_server.ts:42</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/rpc_server.ts#L42">rpc_server.ts:42</a></li>
 						</ul>
 					</aside>
 				</section>
@@ -168,7 +168,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/rpc_server.ts#L63">rpc_server.ts:63</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/rpc_server.ts#L63">rpc_server.ts:63</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">Uint8Array</span></h4>
@@ -185,7 +185,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/rpc_server.ts#L49">rpc_server.ts:49</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/rpc_server.ts#L49">rpc_server.ts:49</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">number</span></h4>
@@ -202,7 +202,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/rpc_server.ts#L57">rpc_server.ts:57</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/rpc_server.ts#L57">rpc_server.ts:57</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">number</span></h4>
diff --git a/docs/reference/api/typedoc/classes/cachedcallstack.html b/docs/reference/api/typedoc/classes/cachedcallstack.html
index f0d6660dc..80e16e813 100644
--- a/docs/reference/api/typedoc/classes/cachedcallstack.html
+++ b/docs/reference/api/typedoc/classes/cachedcallstack.html
@@ -144,7 +144,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L223">memory.ts:223</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L223">memory.ts:223</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-parameters-title">Parameters</h4>
@@ -172,7 +172,7 @@
 					<div class="tsd-signature tsd-kind-icon">temp<wbr>Args<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">Array</span><span class="tsd-signature-symbol">&lt;</span><a href="../interfaces/disposable.html" class="tsd-signature-type">Disposable</a><span class="tsd-signature-symbol">&gt;</span><span class="tsd-signature-symbol"> = []</span></div>
 					<aside class="tsd-sources">
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L208">memory.ts:208</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L208">memory.ts:208</a></li>
 						</ul>
 					</aside>
 					<div class="tsd-comment tsd-typography">
@@ -194,7 +194,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L312">memory.ts:312</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L312">memory.ts:312</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
@@ -226,7 +226,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L284">memory.ts:284</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L284">memory.ts:284</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
@@ -262,7 +262,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L388">memory.ts:388</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L388">memory.ts:388</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
@@ -300,7 +300,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L376">memory.ts:376</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L376">memory.ts:376</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
@@ -340,7 +340,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L267">memory.ts:267</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L267">memory.ts:267</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
@@ -373,7 +373,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L243">memory.ts:243</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L243">memory.ts:243</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">void</span></h4>
@@ -390,7 +390,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L321">memory.ts:321</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L321">memory.ts:321</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
@@ -422,7 +422,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L252">memory.ts:252</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L252">memory.ts:252</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
@@ -444,7 +444,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L359">memory.ts:359</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L359">memory.ts:359</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-parameters-title">Parameters</h4>
@@ -470,7 +470,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L342">memory.ts:342</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L342">memory.ts:342</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-parameters-title">Parameters</h4>
@@ -496,7 +496,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L350">memory.ts:350</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L350">memory.ts:350</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-parameters-title">Parameters</h4>
@@ -522,7 +522,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L326">memory.ts:326</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L326">memory.ts:326</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-parameters-title">Parameters</h4>
@@ -548,7 +548,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L363">memory.ts:363</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L363">memory.ts:363</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-parameters-title">Parameters</h4>
@@ -574,7 +574,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L346">memory.ts:346</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L346">memory.ts:346</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-parameters-title">Parameters</h4>
@@ -600,7 +600,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/memory.ts#L334">memory.ts:334</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/memory.ts#L334">memory.ts:334</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-parameters-title">Parameters</h4>
diff --git a/docs/reference/api/typedoc/classes/dldatatype.html b/docs/reference/api/typedoc/classes/dldatatype.html
index 9b32b4196..358d9e3c6 100644
--- a/docs/reference/api/typedoc/classes/dldatatype.html
+++ b/docs/reference/api/typedoc/classes/dldatatype.html
@@ -119,7 +119,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L262">runtime.ts:262</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L262">runtime.ts:262</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-parameters-title">Parameters</h4>
@@ -147,7 +147,7 @@
 					<div class="tsd-signature tsd-kind-icon">bits<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">number</span></div>
 					<aside class="tsd-sources">
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L260">runtime.ts:260</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L260">runtime.ts:260</a></li>
 						</ul>
 					</aside>
 					<div class="tsd-comment tsd-typography">
@@ -162,7 +162,7 @@
 					<div class="tsd-signature tsd-kind-icon">code<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">number</span></div>
 					<aside class="tsd-sources">
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L258">runtime.ts:258</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L258">runtime.ts:258</a></li>
 						</ul>
 					</aside>
 					<div class="tsd-comment tsd-typography">
@@ -177,7 +177,7 @@
 					<div class="tsd-signature tsd-kind-icon">lanes<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">number</span></div>
 					<aside class="tsd-sources">
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L262">runtime.ts:262</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L262">runtime.ts:262</a></li>
 						</ul>
 					</aside>
 					<div class="tsd-comment tsd-typography">
@@ -199,7 +199,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L279">runtime.ts:279</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L279">runtime.ts:279</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">number</span></h4>
@@ -216,7 +216,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L270">runtime.ts:270</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L270">runtime.ts:270</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">string</span></h4>
diff --git a/docs/reference/api/typedoc/classes/dldevice.html b/docs/reference/api/typedoc/classes/dldevice.html
index 0047e0215..ea2c78bcd 100644
--- a/docs/reference/api/typedoc/classes/dldevice.html
+++ b/docs/reference/api/typedoc/classes/dldevice.html
@@ -118,7 +118,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L202">runtime.ts:202</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L202">runtime.ts:202</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-parameters-title">Parameters</h4>
@@ -146,7 +146,7 @@
 					<div class="tsd-signature tsd-kind-icon">device<wbr>Id<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">number</span></div>
 					<aside class="tsd-sources">
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L200">runtime.ts:200</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L200">runtime.ts:200</a></li>
 						</ul>
 					</aside>
 					<div class="tsd-comment tsd-typography">
@@ -161,7 +161,7 @@
 					<div class="tsd-signature tsd-kind-icon">device<wbr>Type<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">number</span></div>
 					<aside class="tsd-sources">
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L198">runtime.ts:198</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L198">runtime.ts:198</a></li>
 						</ul>
 					</aside>
 					<div class="tsd-comment tsd-typography">
@@ -183,7 +183,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L223">runtime.ts:223</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L223">runtime.ts:223</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
@@ -205,7 +205,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L230">runtime.ts:230</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L230">runtime.ts:230</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">string</span></h4>
diff --git a/docs/reference/api/typedoc/classes/environment.html b/docs/reference/api/typedoc/classes/environment.html
index 48af489a1..6602a7e8b 100644
--- a/docs/reference/api/typedoc/classes/environment.html
+++ b/docs/reference/api/typedoc/classes/environment.html
@@ -125,7 +125,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/environment.ts#L86">environment.ts:86</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/environment.ts#L86">environment.ts:86</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-parameters-title">Parameters</h4>
@@ -169,7 +169,7 @@
 					<aside class="tsd-sources">
 						<p>Implementation of <a href="../interfaces/libraryprovider.html">LibraryProvider</a>.<a href="../interfaces/libraryprovider.html#imports">imports</a></p>
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/environment.ts#L70">environment.ts:70</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/environment.ts#L70">environment.ts:70</a></li>
 						</ul>
 					</aside>
 				</section>
@@ -179,7 +179,7 @@
 					<div class="tsd-signature tsd-kind-icon">logger<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-symbol">(</span>msg<span class="tsd-signature-symbol">: </span><span class="tsd-signature-type">string</span><span class="tsd-signature-symbol">)</span><span class="tsd-signature-symbol"> =&gt; </span><span class="tsd-signature-type">void</span></div>
 					<aside class="tsd-sources">
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/environment.ts#L69">environment.ts:69</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/environment.ts#L69">environment.ts:69</a></li>
 						</ul>
 					</aside>
 					<div class="tsd-type-declaration">
@@ -210,7 +210,7 @@
 					<div class="tsd-signature tsd-kind-icon">packedCFunc<wbr>Table<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">Array</span><span class="tsd-signature-symbol">&lt;</span><span class="tsd-signature-type">ctypes.FTVMWasmPackedCFunc</span><span class="tsd-signature-symbol"> | </span><span class="tsd-signature-type">undefined</span><span class="tsd-signature-symbol">&gt;</span><span class="tsd-signature-symbol"> = [undefined,]</span></div>
 					<aside class="tsd-sources">
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/environment.ts#L78">environment.ts:78</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/environment.ts#L78">environment.ts:78</a></li>
 						</ul>
 					</aside>
 					<div class="tsd-comment tsd-typography">
@@ -228,7 +228,7 @@
 					<div class="tsd-signature tsd-kind-icon">packedCFunc<wbr>Table<wbr>Free<wbr>Id<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">Array</span><span class="tsd-signature-symbol">&lt;</span><span class="tsd-signature-type">number</span><span class="tsd-signature-symbol">&gt;</span><span class="tsd-signature-symbol"> = []</span></div>
 					<aside class="tsd-sources">
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/environment.ts#L84">environment.ts:84</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/environment.ts#L84">environment.ts:84</a></li>
 						</ul>
 					</aside>
 					<div class="tsd-comment tsd-typography">
@@ -250,7 +250,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/environment.ts#L105">environment.ts:105</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/environment.ts#L105">environment.ts:105</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
diff --git a/docs/reference/api/typedoc/classes/ffilibrary.html b/docs/reference/api/typedoc/classes/ffilibrary.html
index 877be18ef..fdf81936f 100644
--- a/docs/reference/api/typedoc/classes/ffilibrary.html
+++ b/docs/reference/api/typedoc/classes/ffilibrary.html
@@ -131,7 +131,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L49">runtime.ts:49</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L49">runtime.ts:49</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-parameters-title">Parameters</h4>
@@ -156,7 +156,7 @@
 					<div class="tsd-signature tsd-kind-icon">exports<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">Record</span><span class="tsd-signature-symbol">&lt;</span><span class="tsd-signature-type">string</span><span class="tsd-signature-symbol">, </span><span class="tsd-signature-type">Function</span><span class="tsd-signature-symbol">&gt;</span></div>
 					<aside class="tsd-sources">
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L46">runtime.ts:46</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L46">runtime.ts:46</a></li>
 						</ul>
 					</aside>
 				</section>
@@ -166,7 +166,7 @@
 					<div class="tsd-signature tsd-kind-icon">memory<span class="tsd-signature-symbol">:</span> <a href="memory.html" class="tsd-signature-type">Memory</a></div>
 					<aside class="tsd-sources">
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L45">runtime.ts:45</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L45">runtime.ts:45</a></li>
 						</ul>
 					</aside>
 				</section>
@@ -176,7 +176,7 @@
 					<div class="tsd-signature tsd-kind-icon">wasm32<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">boolean</span></div>
 					<aside class="tsd-sources">
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L44">runtime.ts:44</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L44">runtime.ts:44</a></li>
 						</ul>
 					</aside>
 				</section>
@@ -186,7 +186,7 @@
 					<div class="tsd-signature tsd-kind-icon">webGPUContext<span class="tsd-signature-symbol">:</span> <a href="webgpucontext.html" class="tsd-signature-type">WebGPUContext</a></div>
 					<aside class="tsd-sources">
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L47">runtime.ts:47</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L47">runtime.ts:47</a></li>
 						</ul>
 					</aside>
 				</section>
@@ -203,7 +203,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L76">runtime.ts:76</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L76">runtime.ts:76</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-parameters-title">Parameters</h4>
@@ -226,7 +226,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L66">runtime.ts:66</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L66">runtime.ts:66</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">void</span></h4>
@@ -243,7 +243,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L84">runtime.ts:84</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L84">runtime.ts:84</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-returns-title">Returns <a href="cachedcallstack.html" class="tsd-signature-type">CachedCallStack</a></h4>
@@ -260,7 +260,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L95">runtime.ts:95</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L95">runtime.ts:95</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-parameters-title">Parameters</h4>
@@ -283,7 +283,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L72">runtime.ts:72</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L72">runtime.ts:72</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">number</span></h4>
diff --git a/docs/reference/api/typedoc/classes/graphexecutor.html b/docs/reference/api/typedoc/classes/graphexecutor.html
index aae61b0c6..fbb7419e4 100644
--- a/docs/reference/api/typedoc/classes/graphexecutor.html
+++ b/docs/reference/api/typedoc/classes/graphexecutor.html
@@ -130,7 +130,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L583">runtime.ts:583</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L583">runtime.ts:583</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
@@ -162,7 +162,7 @@
 					<div class="tsd-signature tsd-kind-icon">module<span class="tsd-signature-symbol">:</span> <a href="module.html" class="tsd-signature-type">Module</a></div>
 					<aside class="tsd-sources">
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L579">runtime.ts:579</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L579">runtime.ts:579</a></li>
 						</ul>
 					</aside>
 				</section>
@@ -179,7 +179,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L654">runtime.ts:654</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L654">runtime.ts:654</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
@@ -224,7 +224,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L597">runtime.ts:597</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L597">runtime.ts:597</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">void</span></h4>
@@ -241,7 +241,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L631">runtime.ts:631</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L631">runtime.ts:631</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
@@ -279,7 +279,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/caa0d59c3/web/src/runtime.ts#L644">runtime.ts:644</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/c334790bf/web/src/runtime.ts#L644">runtime.ts:644</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
@@ -310,7 +310,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
... 2373 lines suppressed ...