You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by tq...@apache.org on 2022/11/08 01:46:52 UTC

[tvm-site] branch asf-site updated: deploying docs (apache/tvm@b16a64d6edb9fd1a014fc51995dff7d0e2f4c84e)

This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/tvm-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
     new 92fd30d67f deploying docs (apache/tvm@b16a64d6edb9fd1a014fc51995dff7d0e2f4c84e)
92fd30d67f is described below

commit 92fd30d67ff673d73c2127d9ab63405d261cdc5c
Author: tvm-bot <95...@users.noreply.github.com>
AuthorDate: Tue Nov 8 01:46:43 2022 +0000

    deploying docs (apache/tvm@b16a64d6edb9fd1a014fc51995dff7d0e2f4c84e)
---
 docs/_images/sphx_glr_micro_train_001.png          |  Bin 298784 -> 332672 bytes
 docs/_images/sphx_glr_micro_train_thumb.png        |  Bin 22856 -> 24174 bytes
 .../how_to/compile_models/from_darknet.rst.txt     |    2 +-
 .../how_to/compile_models/from_keras.rst.txt       |    2 +-
 .../how_to/compile_models/from_mxnet.rst.txt       |    2 +-
 .../how_to/compile_models/from_oneflow.rst.txt     |    2 +-
 .../how_to/compile_models/from_pytorch.rst.txt     |    2 +-
 .../how_to/compile_models/from_tensorflow.rst.txt  |    2 +-
 .../compile_models/sg_execution_times.rst.txt      |   22 +-
 .../deploy_models/deploy_model_on_android.rst.txt  |    2 +-
 .../deploy_object_detection_pytorch.rst.txt        |    4 +-
 .../deploy_models/deploy_prequantized.rst.txt      |    6 +-
 .../deploy_prequantized_tflite.rst.txt             |    4 +-
 .../how_to/deploy_models/deploy_quantized.rst.txt  |    2 +-
 .../deploy_models/deploy_ssd_gluoncv.rst.txt       |    4 +-
 .../deploy_models/sg_execution_times.rst.txt       |   18 +-
 .../extend_tvm/bring_your_own_datatypes.rst.txt    |    2 +-
 .../how_to/extend_tvm/sg_execution_times.rst.txt   |   10 +-
 .../how_to/extend_tvm/use_pass_instrument.rst.txt  |   16 +-
 .../optimize_operators/opt_conv_cuda.rst.txt       |    2 +-
 .../optimize_operators/opt_conv_tensorcore.rst.txt |    2 +-
 .../how_to/optimize_operators/opt_gemm.rst.txt     |   16 +-
 .../optimize_operators/sg_execution_times.rst.txt  |    8 +-
 .../sg_execution_times.rst.txt                     |   14 +-
 .../tune_conv2d_layer_cuda.rst.txt                 |  323 +++-
 .../tune_network_cuda.rst.txt                      |    4 +-
 .../tune_network_x86.rst.txt                       |    4 +-
 .../tune_sparse_x86.rst.txt                        |   50 +-
 .../tune_with_autotvm/sg_execution_times.rst.txt   |    6 +-
 .../tune_with_autotvm/tune_conv2d_cuda.rst.txt     |  243 +--
 .../work_with_microtvm/micro_autotune.rst.txt      |   16 +-
 .../how_to/work_with_microtvm/micro_train.rst.txt  |   18 +-
 .../work_with_microtvm/sg_execution_times.rst.txt  |   10 +-
 .../work_with_relay/sg_execution_times.rst.txt     |    8 +-
 .../how_to/work_with_schedules/intrin_math.rst.txt |    2 +-
 .../work_with_schedules/sg_execution_times.rst.txt |   14 +-
 .../how_to/work_with_schedules/tensorize.rst.txt   |    2 +-
 .../tutorials/autotvm/sg_execution_times.rst.txt   |    4 +-
 .../frontend/deploy_classification.rst.txt         |    2 +-
 .../tutorials/frontend/deploy_detection.rst.txt    |    2 +-
 .../tutorials/frontend/sg_execution_times.rst.txt  |    6 +-
 .../tutorials/optimize/sg_execution_times.rst.txt  |    6 +-
 .../topic/vta/tutorials/sg_execution_times.rst.txt |    6 +-
 .../tutorial/auto_scheduler_matmul_x86.rst.txt     |    4 +-
 docs/_sources/tutorial/autotvm_matmul_x86.rst.txt  |   20 +-
 docs/_sources/tutorial/autotvm_relay_x86.rst.txt   |   59 +-
 .../tutorial/cross_compilation_and_rpc.rst.txt     |    2 +-
 docs/_sources/tutorial/intro_topi.rst.txt          |    2 +-
 docs/_sources/tutorial/sg_execution_times.rst.txt  |   24 +-
 .../tutorial/tensor_expr_get_started.rst.txt       |   44 +-
 docs/commit_hash                                   |    2 +-
 docs/genindex.html                                 |    8 +-
 docs/how_to/compile_models/from_darknet.html       |    2 +-
 docs/how_to/compile_models/from_keras.html         |    2 +-
 docs/how_to/compile_models/from_mxnet.html         |    2 +-
 docs/how_to/compile_models/from_oneflow.html       |   15 +-
 docs/how_to/compile_models/from_pytorch.html       |   11 +-
 docs/how_to/compile_models/from_tensorflow.html    |    2 +-
 docs/how_to/compile_models/sg_execution_times.html |   22 +-
 .../deploy_models/deploy_model_on_android.html     |    2 +-
 .../deploy_object_detection_pytorch.html           |   55 +-
 docs/how_to/deploy_models/deploy_prequantized.html |    9 +-
 .../deploy_models/deploy_prequantized_tflite.html  |    4 +-
 docs/how_to/deploy_models/deploy_quantized.html    |    2 +-
 docs/how_to/deploy_models/deploy_ssd_gluoncv.html  |   36 +-
 docs/how_to/deploy_models/sg_execution_times.html  |   18 +-
 .../extend_tvm/bring_your_own_datatypes.html       |    2 +-
 docs/how_to/extend_tvm/sg_execution_times.html     |   10 +-
 docs/how_to/extend_tvm/use_pass_instrument.html    |   16 +-
 docs/how_to/optimize_operators/opt_conv_cuda.html  |    2 +-
 .../optimize_operators/opt_conv_tensorcore.html    |    2 +-
 docs/how_to/optimize_operators/opt_gemm.html       |   16 +-
 .../optimize_operators/sg_execution_times.html     |    8 +-
 .../sg_execution_times.html                        |   14 +-
 .../tune_conv2d_layer_cuda.html                    |  323 +++-
 .../tune_with_autoscheduler/tune_network_cuda.html |    4 +-
 .../tune_with_autoscheduler/tune_network_x86.html  |    4 +-
 .../tune_with_autoscheduler/tune_sparse_x86.html   |   50 +-
 .../tune_with_autotvm/sg_execution_times.html      |    6 +-
 .../how_to/tune_with_autotvm/tune_conv2d_cuda.html |  243 +--
 docs/how_to/work_with_microtvm/micro_autotune.html |   16 +-
 docs/how_to/work_with_microtvm/micro_train.html    |   16 +-
 .../work_with_microtvm/sg_execution_times.html     |   10 +-
 .../how_to/work_with_relay/sg_execution_times.html |    8 +-
 docs/how_to/work_with_schedules/intrin_math.html   |    2 +-
 .../work_with_schedules/sg_execution_times.html    |   14 +-
 docs/how_to/work_with_schedules/tensorize.html     |    2 +-
 docs/objects.inv                                   |  Bin 23624 -> 23632 bytes
 docs/reference/api/doxygen/array_8h__dep__incl.svg |  232 +--
 docs/reference/api/doxygen/block__scope_8h.html    |    2 +-
 .../api/doxygen/block__scope_8h__dep__incl.svg     |  244 +--
 .../api/doxygen/c__runtime__api_8h__dep__incl.svg  |  560 +++----
 ..._1_1meta__schedule_1_1ScheduleRule-members.html |   42 +-
 ...classtvm_1_1meta__schedule_1_1ScheduleRule.html |   66 +-
 ...meta__schedule_1_1ScheduleRule__coll__graph.svg |  132 +-
 ...a__schedule_1_1ScheduleRule__inherit__graph.svg |   98 +-
 .../api/doxygen/data__type_8h__dep__incl.svg       |  508 +++----
 ...r_000003_000017.html => dir_000003_000020.html} |    0
 ...r_000003_000031.html => dir_000003_000034.html} |    0
 ...r_000003_000032.html => dir_000003_000035.html} |    0
 docs/reference/api/doxygen/dir_000004_000011.html  |    2 +-
 ...r_000004_000017.html => dir_000004_000020.html} |    0
 ...r_000004_000031.html => dir_000004_000034.html} |    0
 ...r_000005_000017.html => dir_000005_000020.html} |    0
 ...r_000005_000031.html => dir_000005_000034.html} |    0
 ...r_000005_000032.html => dir_000005_000035.html} |    0
 ...r_000006_000031.html => dir_000006_000034.html} |    0
 ...r_000006_000032.html => dir_000006_000035.html} |    0
 ...r_000007_000017.html => dir_000007_000020.html} |    0
 ...r_000007_000018.html => dir_000007_000021.html} |    0
 ...r_000007_000031.html => dir_000007_000034.html} |    0
 ...r_000011_000017.html => dir_000011_000020.html} |    0
 ...r_000011_000031.html => dir_000011_000034.html} |    0
 ...r_000014_000031.html => dir_000014_000034.html} |    0
 ...r_000015_000031.html => dir_000015_000034.html} |    0
 ...r_000016_000031.html => dir_000016_000034.html} |    0
 ...r_000016_000032.html => dir_000016_000035.html} |    0
 docs/reference/api/doxygen/dir_000017_000008.html  |   73 -
 ...r_000037_000038.html => dir_000017_000011.html} |    6 +-
 docs/reference/api/doxygen/dir_000018_000007.html  |   73 -
 ...r_000037_000036.html => dir_000018_000011.html} |    6 +-
 docs/reference/api/doxygen/dir_000019_000009.html  |   73 -
 ...r_000033_000007.html => dir_000019_000011.html} |    6 +-
 docs/reference/api/doxygen/dir_000020_000008.html  |    6 +-
 docs/reference/api/doxygen/dir_000021_000007.html  |    6 +-
 ...r_000018_000008.html => dir_000021_000008.html} |    0
 docs/reference/api/doxygen/dir_000022_000009.html  |    6 +-
 ...r_000020_000007.html => dir_000023_000007.html} |    0
 ...r_000020_000008.html => dir_000023_000008.html} |    0
 ...r_000020_000011.html => dir_000023_000011.html} |    0
 ...r_000020_000013.html => dir_000023_000013.html} |    0
 ...r_000020_000017.html => dir_000023_000020.html} |    0
 ...r_000021_000007.html => dir_000024_000007.html} |    0
 ...r_000021_000011.html => dir_000024_000011.html} |    0
 ...r_000021_000017.html => dir_000024_000020.html} |    0
 ...r_000022_000009.html => dir_000025_000009.html} |    0
 docs/reference/api/doxygen/dir_000028_000007.html  |   73 -
 docs/reference/api/doxygen/dir_000029_000011.html  |   73 -
 docs/reference/api/doxygen/dir_000030_000007.html  |   73 -
 docs/reference/api/doxygen/dir_000030_000013.html  |   73 -
 docs/reference/api/doxygen/dir_000031_000007.html  |    6 +-
 ...r_000028_000017.html => dir_000031_000020.html} |    0
 docs/reference/api/doxygen/dir_000032_000002.html  |   73 -
 docs/reference/api/doxygen/dir_000032_000011.html  |    6 +-
 ...r_000029_000028.html => dir_000032_000031.html} |    0
 docs/reference/api/doxygen/dir_000033_000007.html  |    6 +-
 ...r_000030_000008.html => dir_000033_000008.html} |    0
 docs/reference/api/doxygen/dir_000033_000013.html  |    6 +-
 ...r_000030_000017.html => dir_000033_000020.html} |    0
 ...r_000031_000007.html => dir_000034_000007.html} |    0
 docs/reference/api/doxygen/dir_000034_000008.html  |    6 +-
 ...r_000031_000011.html => dir_000034_000011.html} |    0
 ...r_000031_000013.html => dir_000034_000013.html} |    0
 ...r_000031_000017.html => dir_000034_000020.html} |    0
 docs/reference/api/doxygen/dir_000035_000002.html  |    6 +-
 ...r_000032_000008.html => dir_000035_000008.html} |    0
 docs/reference/api/doxygen/dir_000035_000011.html  |    6 +-
 ...r_000032_000013.html => dir_000035_000013.html} |    0
 docs/reference/api/doxygen/dir_000035_000032.html  |   73 -
 ...r_000033_000007.html => dir_000036_000007.html} |    0
 ...r_000033_000013.html => dir_000036_000013.html} |    0
 ...r_000033_000017.html => dir_000036_000020.html} |    0
 docs/reference/api/doxygen/dir_000036_000032.html  |   73 -
 docs/reference/api/doxygen/dir_000036_000035.html  |   73 -
 ...r_000034_000002.html => dir_000037_000002.html} |    0
 ...r_000034_000008.html => dir_000037_000008.html} |    0
 docs/reference/api/doxygen/dir_000037_000032.html  |   73 -
 docs/reference/api/doxygen/dir_000038_000002.html  |    6 +-
 ...r_000035_000011.html => dir_000038_000011.html} |    0
 docs/reference/api/doxygen/dir_000038_000032.html  |   73 -
 docs/reference/api/doxygen/dir_000038_000035.html  |    6 +-
 docs/reference/api/doxygen/dir_000039_000032.html  |   73 -
 docs/reference/api/doxygen/dir_000039_000035.html  |    6 +-
 docs/reference/api/doxygen/dir_000039_000038.html  |    6 +-
 ...r_000037_000031.html => dir_000040_000034.html} |    0
 docs/reference/api/doxygen/dir_000040_000035.html  |    6 +-
 ...r_000037_000035.html => dir_000040_000038.html} |    0
 ...r_000037_000036.html => dir_000040_000039.html} |    0
 ...r_000037_000038.html => dir_000040_000041.html} |    0
 ...r_000037_000040.html => dir_000040_000043.html} |    0
 ...r_000038_000002.html => dir_000041_000002.html} |    0
 docs/reference/api/doxygen/dir_000041_000035.html  |    6 +-
 ...r_000038_000035.html => dir_000041_000038.html} |    0
 ...r_000039_000031.html => dir_000042_000034.html} |    0
 docs/reference/api/doxygen/dir_000042_000035.html  |    6 +-
 ...r_000039_000035.html => dir_000042_000038.html} |    0
 ...r_000039_000036.html => dir_000042_000039.html} |    0
 ...r_000039_000037.html => dir_000042_000040.html} |    0
 ...r_000039_000038.html => dir_000042_000041.html} |    0
 ...r_000039_000040.html => dir_000042_000043.html} |    0
 ...r_000040_000031.html => dir_000043_000034.html} |    0
 ...r_000040_000032.html => dir_000043_000035.html} |    0
 ...r_000040_000035.html => dir_000043_000038.html} |    0
 ...r_000041_000031.html => dir_000044_000034.html} |    0
 ...r_000041_000032.html => dir_000044_000035.html} |    0
 ...r_000041_000035.html => dir_000044_000038.html} |    0
 ...r_000042_000032.html => dir_000045_000035.html} |    0
 ...r_000042_000035.html => dir_000045_000038.html} |    0
 .../dir_006b1f4ac353a18abb55f74cc4796db6_dep.svg   |    8 +-
 .../dir_02be2c9d68e402f80df60bd528724ee5_dep.svg   |   22 +-
 .../dir_05ffda4d144d7985f926507abde48dbb_dep.svg   |   12 +-
 .../dir_1f1b12d204a071c9e67e47fcbb552b86_dep.svg   |   10 +-
 .../dir_2b0ef9f1c86b565a92e96353e1195b2c_dep.svg   |    8 +-
 .../dir_3a038e7bfa2370c6aee2a5aecd5d3ef1_dep.svg   |   12 +-
 .../dir_404558507ed35459f0d7a6d81d8c508d_dep.svg   |    4 +-
 .../dir_4378f18824ae7d4ad48f8d7785cd7ac8.html      |    7 +-
 .../dir_4378f18824ae7d4ad48f8d7785cd7ac8_dep.svg   |  162 +-
 ...l => dir_437a885699bf6787e92bcac6040bb86f.html} |   22 +-
 ...> dir_437a885699bf6787e92bcac6040bb86f_dep.svg} |   36 +-
 .../dir_519be2d4a83a987dbf989f1de527b870_dep.svg   |   10 +-
 .../dir_536029070df27a3ee03a4230630922c5_dep.svg   |    2 +-
 .../dir_54983dd6d74c59f67ee9e8e5a50aafc4_dep.svg   |   42 +-
 .../dir_5baffeed82c1190bfdf7a4f918ab5ac6_dep.svg   |    2 +-
 .../dir_5da96592f3a7c442b838b075c58254c2_dep.svg   |   20 +-
 .../dir_63946bee875c6d52bce55e72a67a86ad_dep.svg   |   28 +-
 .../dir_67fdee7a5e0396034822418fa5baa4b4_dep.svg   |    4 +-
 .../dir_6cd4295f6ad5aa17e5b568d0e5b190e5_dep.svg   |    2 +-
 .../dir_72c2f11201cd7636dc7624de0754daa5_dep.svg   |   26 +-
 .../dir_8395ded0a3205c0748976a0d4487d38d_dep.svg   |    8 +-
 .../dir_84875704194fd544d29fe0c7fedd8939_dep.svg   |   20 +-
 .../dir_8e4e25e66b8623d88c5b5dd2040bca97_dep.svg   |   74 +-
 .../dir_9e615ec4a59e46584bcc4e2226e148a2_dep.svg   |   12 +-
 .../dir_a59a89c7dd2e4e6561fe59bf359ce2f3_dep.svg   |   12 +-
 .../dir_a98464176f1216e334ac3bbacd433085_dep.svg   |   22 +-
 .../dir_ac57496531ccbad72f774fa62e6de987_dep.svg   |   28 +-
 ...l => dir_af4961563c20a83bf971a498792e6dee.html} |   24 +-
 .../dir_af4961563c20a83bf971a498792e6dee_dep.svg   |   80 +
 .../dir_b4c7d8e826c599ba55146c099a14beb5_dep.svg   |   74 +-
 ...l => dir_c20c9fad5dedbc870b2ada04754d1b9b.html} |   22 +-
 ...> dir_c20c9fad5dedbc870b2ada04754d1b9b_dep.svg} |   36 +-
 .../dir_d331277d4303e21ded95616eb56c1a9e_dep.svg   |    6 +-
 .../dir_d3953cf7eb33eca56fc6850c0e98447d_dep.svg   |    6 +-
 .../dir_d4a54fa981698f72ef4cd62f8b9e1a8f_dep.svg   |    4 +-
 .../dir_dc867ff9a37cad1764f1670dc7eba6c1_dep.svg   |   24 +-
 .../dir_e4a1a856a30057b9b1543256279fc7a1_dep.svg   |    6 +-
 .../dir_f97d855a3173728370e632aa77170e34_dep.svg   |    8 +-
 .../dir_fafc18f54a755f417c55c769623cbfef_dep.svg   |    6 +-
 docs/reference/api/doxygen/files.html              |   39 +-
 docs/reference/api/doxygen/functions_a.html        |    9 +-
 docs/reference/api/doxygen/functions_func_a.html   |    7 +-
 docs/reference/api/doxygen/functions_func_i.html   |    7 +-
 docs/reference/api/doxygen/functions_func_s.html   |    2 +-
 docs/reference/api/doxygen/functions_func_t.html   |    2 +-
 docs/reference/api/doxygen/functions_func_u.html   |    2 +-
 docs/reference/api/doxygen/functions_i.html        |    9 +-
 docs/reference/api/doxygen/functions_s.html        |    2 +-
 docs/reference/api/doxygen/functions_t.html        |    8 +-
 docs/reference/api/doxygen/functions_u.html        |    2 +-
 docs/reference/api/doxygen/functions_v.html        |   10 +-
 .../api/doxygen/functor_8h__dep__incl.svg          |  428 +++---
 docs/reference/api/doxygen/index__map_8h.html      |    2 +-
 .../api/doxygen/index__map_8h__dep__incl.svg       |  944 ++++++------
 docs/reference/api/doxygen/instruction_8h.html     |    2 +-
 .../api/doxygen/instruction_8h__dep__incl.svg      |  240 +--
 .../api/doxygen/ir_2attrs_8h__dep__incl.svg        |  136 +-
 .../api/doxygen/ir_2expr_8h__dep__incl.svg         |  272 ++--
 .../api/doxygen/ir_2function_8h__dep__incl.svg     |  108 +-
 .../api/doxygen/ir_2span_8h__dep__incl.svg         |  344 ++---
 .../api/doxygen/ir_2type_8h__dep__incl.svg         |  376 ++---
 docs/reference/api/doxygen/map_8h__dep__incl.svg   |  340 ++---
 docs/reference/api/doxygen/namespacemembers_b.html |   12 +-
 .../api/doxygen/namespacemembers_func_b.html       |   12 +-
 .../api/doxygen/namespacemembers_func_g.html       |    5 +-
 .../api/doxygen/namespacemembers_func_m.html       |   15 +-
 docs/reference/api/doxygen/namespacemembers_g.html |    9 +-
 docs/reference/api/doxygen/namespacemembers_m.html |   13 +-
 .../doxygen/namespacetvm_1_1meta__schedule.html    |  204 ++-
 .../api/doxygen/ndarray_8h__dep__incl.svg          |  312 ++--
 docs/reference/api/doxygen/node_8h__dep__incl.svg  |  468 +++---
 .../reference/api/doxygen/object_8h__dep__incl.svg |  600 ++++----
 .../api/doxygen/object__path_8h__dep__incl.svg     |  460 +++---
 .../api/doxygen/optional_8h__dep__incl.svg         |  584 ++++----
 .../api/doxygen/packed__func_8h__dep__incl.svg     |  268 ++--
 docs/reference/api/doxygen/random__engine_8h.html  |    2 +-
 .../api/doxygen/random__engine_8h__dep__incl.svg   |  226 +--
 .../api/doxygen/reflection_8h__dep__incl.svg       |  296 ++--
 .../api/doxygen/registry_8h__dep__incl.svg         |   24 +-
 .../api/doxygen/repr__printer_8h__dep__incl.svg    |  460 +++---
 .../runtime_2container_2adt_8h__dep__incl.svg      |    8 +-
 .../runtime_2container_2base_8h__dep__incl.svg     |  632 ++++----
 .../api/doxygen/runtime_2memory_8h__dep__incl.svg  |  468 +++---
 .../api/doxygen/runtime_2module_8h__dep__incl.svg  |  248 ++--
 .../api/doxygen/schedule__rule_8h_source.html      |   14 +-
 docs/reference/api/doxygen/search/all_11.js        |    2 +-
 docs/reference/api/doxygen/search/all_13.js        |    6 +-
 docs/reference/api/doxygen/search/all_14.js        |    6 +-
 docs/reference/api/doxygen/search/all_15.js        |    9 +-
 docs/reference/api/doxygen/search/all_16.js        |    4 +-
 docs/reference/api/doxygen/search/all_17.js        |    4 +-
 docs/reference/api/doxygen/search/all_18.js        |    3 +-
 docs/reference/api/doxygen/search/all_2.js         |    1 +
 docs/reference/api/doxygen/search/all_3.js         |    2 +
 docs/reference/api/doxygen/search/all_8.js         |    1 +
 docs/reference/api/doxygen/search/all_a.js         |    3 +-
 docs/reference/api/doxygen/search/all_e.js         |    3 +-
 docs/reference/api/doxygen/search/files_12.js      |    1 +
 docs/reference/api/doxygen/search/files_f.js       |    1 +
 docs/reference/api/doxygen/search/functions_1.js   |    1 +
 docs/reference/api/doxygen/search/functions_10.js  |    2 +-
 docs/reference/api/doxygen/search/functions_12.js  |    2 +-
 docs/reference/api/doxygen/search/functions_13.js  |    4 +-
 docs/reference/api/doxygen/search/functions_14.js  |    2 +-
 docs/reference/api/doxygen/search/functions_15.js  |    2 +-
 docs/reference/api/doxygen/search/functions_16.js  |    2 +-
 docs/reference/api/doxygen/search/functions_17.js  |    2 +-
 docs/reference/api/doxygen/search/functions_2.js   |    2 +
 docs/reference/api/doxygen/search/functions_7.js   |    1 +
 docs/reference/api/doxygen/search/functions_9.js   |    3 +-
 docs/reference/api/doxygen/search/functions_d.js   |    3 +-
 docs/reference/api/doxygen/search/typedefs_f.js    |    2 +-
 .../api/doxygen/serializer_8h__dep__incl.svg       |  300 ++--
 .../api/doxygen/shape__tuple_8h__dep__incl.svg     |  308 ++--
 docs/reference/api/doxygen/state_8h.html           |    2 +-
 docs/reference/api/doxygen/state_8h__dep__incl.svg |  234 +--
 docs/reference/api/doxygen/stmt_8h__dep__incl.svg  |  140 +-
 .../reference/api/doxygen/string_8h__dep__incl.svg |  412 +++---
 .../doxygen/structural__equal_8h__dep__incl.svg    |  448 +++---
 .../api/doxygen/structural__hash_8h__dep__incl.svg |  448 +++---
 docs/reference/api/doxygen/thread__bind_8h.html    |  113 ++
 .../api/doxygen/thread__bind_8h__incl.svg          | 1559 ++++++++++++++++++++
 .../api/doxygen/thread__bind_8h_source.html        |   83 ++
 .../api/doxygen/tir_2expr_8h__dep__incl.svg        |   72 +-
 docs/reference/api/doxygen/tir_2function_8h.html   |    2 +-
 .../api/doxygen/tir_2function_8h__dep__incl.svg    |  712 +++++----
 .../api/doxygen/tir_2schedule_2schedule_8h.html    |    2 +-
 .../tir_2schedule_2schedule_8h__dep__incl.svg      |  224 +--
 docs/reference/api/doxygen/trace_8h.html           |    2 +-
 docs/reference/api/doxygen/trace_8h__dep__incl.svg |  230 +--
 docs/reference/api/doxygen/var_8h__dep__incl.svg   |   96 +-
 .../{random__engine_8h.html => winograd_8h.html}   |   47 +-
 docs/reference/api/doxygen/winograd_8h__incl.svg   | 1541 +++++++++++++++++++
 docs/reference/api/doxygen/winograd_8h_source.html |   80 +
 docs/reference/api/python/auto_scheduler.html      |    4 +-
 docs/reference/api/python/topi.html                |   61 +
 .../api/typedoc/classes/bytestreamreader.html      |   12 +-
 .../api/typedoc/classes/cachedcallstack.html       |   34 +-
 docs/reference/api/typedoc/classes/dldatatype.html |   12 +-
 docs/reference/api/typedoc/classes/dldevice.html   |   10 +-
 .../reference/api/typedoc/classes/environment.html |   12 +-
 docs/reference/api/typedoc/classes/ffilibrary.html |   20 +-
 .../api/typedoc/classes/graphexecutor.html         |   16 +-
 docs/reference/api/typedoc/classes/instance.html   |   40 +-
 docs/reference/api/typedoc/classes/memory.html     |   34 +-
 docs/reference/api/typedoc/classes/module.html     |   10 +-
 docs/reference/api/typedoc/classes/ndarray.html    |   22 +-
 .../api/typedoc/classes/packedfunccell.html        |    6 +-
 docs/reference/api/typedoc/classes/rpcserver.html  |   14 +-
 docs/reference/api/typedoc/classes/scalar.html     |    6 +-
 .../api/typedoc/classes/webgpucontext.html         |   12 +-
 docs/reference/api/typedoc/enums/argtypecode.html  |   30 +-
 .../api/typedoc/enums/aynccallbackcode.html        |    4 +-
 .../api/typedoc/enums/dldatatypecode.html          |    8 +-
 .../api/typedoc/enums/rpcserverstate.html          |   12 +-
 docs/reference/api/typedoc/enums/sizeof.html       |   18 +-
 docs/reference/api/typedoc/index.html              |  112 +-
 .../api/typedoc/interfaces/disposable.html         |    2 +-
 .../api/typedoc/interfaces/functioninfo.html       |    6 +-
 .../api/typedoc/interfaces/libraryprovider.html    |    4 +-
 docs/searchindex.js                                |    2 +-
 .../vta/tutorials/autotvm/sg_execution_times.html  |    4 +-
 .../tutorials/frontend/deploy_classification.html  |    2 +-
 .../vta/tutorials/frontend/deploy_detection.html   |    2 +-
 .../vta/tutorials/frontend/sg_execution_times.html |    6 +-
 .../vta/tutorials/optimize/sg_execution_times.html |    6 +-
 docs/topic/vta/tutorials/sg_execution_times.html   |    6 +-
 docs/tutorial/auto_scheduler_matmul_x86.html       |    4 +-
 docs/tutorial/autotvm_matmul_x86.html              |   20 +-
 docs/tutorial/autotvm_relay_x86.html               |  269 ++--
 docs/tutorial/cross_compilation_and_rpc.html       |    2 +-
 docs/tutorial/intro_topi.html                      |    2 +-
 docs/tutorial/sg_execution_times.html              |   28 +-
 docs/tutorial/tensor_expr_get_started.html         |   44 +-
 372 files changed, 12852 insertions(+), 9750 deletions(-)

diff --git a/docs/_images/sphx_glr_micro_train_001.png b/docs/_images/sphx_glr_micro_train_001.png
index fb3c2850a3..3f59a37dab 100644
Binary files a/docs/_images/sphx_glr_micro_train_001.png and b/docs/_images/sphx_glr_micro_train_001.png differ
diff --git a/docs/_images/sphx_glr_micro_train_thumb.png b/docs/_images/sphx_glr_micro_train_thumb.png
index 86defffe09..6746d44e45 100644
Binary files a/docs/_images/sphx_glr_micro_train_thumb.png and b/docs/_images/sphx_glr_micro_train_thumb.png differ
diff --git a/docs/_sources/how_to/compile_models/from_darknet.rst.txt b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
index 957e696aae..9632a6fb68 100644
--- a/docs/_sources/how_to/compile_models/from_darknet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
@@ -315,7 +315,7 @@ The process is no different from other examples.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  14.292 seconds)
+   **Total running time of the script:** ( 1 minutes  13.416 seconds)
 
 
 .. _sphx_glr_download_how_to_compile_models_from_darknet.py:
diff --git a/docs/_sources/how_to/compile_models/from_keras.rst.txt b/docs/_sources/how_to/compile_models/from_keras.rst.txt
index a32b27d349..f2759d859c 100644
--- a/docs/_sources/how_to/compile_models/from_keras.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_keras.rst.txt
@@ -228,7 +228,7 @@ Look up prediction top 1 index in 1000 class synset.
  .. code-block:: none
 
     Relay top-1 id: 285, class name: Egyptian cat
-
    1/1 [==============================] - ETA: 0s
    1/1 [==============================] - 1s 949ms/step
+
    1/1 [==============================] - ETA: 0s
    1/1 [==============================] - 1s 972ms/step
     Keras top-1 id: 285, class name: Egyptian cat
 
 
diff --git a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
index 05c6337a6f..ef8721d5d1 100644
--- a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
@@ -115,7 +115,7 @@ In this section, we download a pretrained imagenet model and classify an image.
 
  .. code-block:: none
 
-    Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip6bf3f2ad-c7cc-4b27-b0d1-8236805957de from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+    Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zipe397f8ff-e142-4742-9764-ba71e41f7e82 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
     x (1, 3, 224, 224)
 
 
diff --git a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
index f2c6875a7a..cfecf4177e 100644
--- a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
@@ -116,7 +116,7 @@ Load a pretrained OneFlow model and save model
  .. code-block:: none
 
     Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
-
      0%|          | 0.00/41.5M [00:00<?, ?B/s]
     15%|#5        | 6.33M/41.5M [00:00<00:00, 53.4MB/s]
     28%|##7       | 11.4M/41.5M [00:00<00:00, 43.1MB/s]
     38%|###7      | 15.6M/41.5M [00:00<00:00, 40.4MB/s]
     49%|####8     | 20.3M/41.5M [00:00<00:00, 43.5MB/s]
     59%|#####9    | 24.5M/41.5M [00:00<00:00, 34.9MB/s]
     77%|#######7  | 32.0M/41.5M [00:00<00:00, 41.3MB/s]
     92%|#########2| 38.3M/41.5M [00:00<00:00, 39.3MB/s]
    100%|##########| 41.5M/41.5M [00:01<00:00, 41.4MB/s]
+
      0%|          | 0.00/41.5M [00:00<?, ?B/s]
     15%|#5        | 6.33M/41.5M [00:00<00:01, 33.1MB/s]
     27%|##7       | 11.3M/41.5M [00:00<00:00, 41.0MB/s]
     39%|###8      | 16.0M/41.5M [00:00<00:00, 35.2MB/s]
     58%|#####7    | 24.0M/41.5M [00:00<00:00, 44.8MB/s]
     77%|#######7  | 32.0M/41.5M [00:00<00:00, 53.1MB/s]
     96%|#########6| 40.0M/41.5M [00:00<00:00, 58.4MB/s]
    100%|##########| 41.5M/41.5M [00:00<00:00, 51.3MB/s]
 
 
 
diff --git a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
index 7080a6b22c..7b26e6a633 100644
--- a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
@@ -98,7 +98,7 @@ Load a pretrained PyTorch model
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=ResNet18_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet18_Weights.DEFAULT` to get the most up-to-date weights.
       warnings.warn(msg)
     Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
-
      0%|          | 0.00/44.7M [00:00<?, ?B/s]
     18%|#7        | 7.99M/44.7M [00:00<00:00, 67.8MB/s]
     60%|######    | 26.9M/44.7M [00:00<00:00, 137MB/s] 
     91%|#########1| 40.7M/44.7M [00:00<00:00, 105MB/s]
    100%|##########| 44.7M/44.7M [00:00<00:00, 113MB/s]
+
      0%|          | 0.00/44.7M [00:00<?, ?B/s]
     14%|#4        | 6.42M/44.7M [00:00<00:00, 67.3MB/s]
     29%|##8       | 12.8M/44.7M [00:00<00:00, 56.9MB/s]
     41%|####1     | 18.4M/44.7M [00:00<00:00, 41.0MB/s]
     54%|#####3    | 24.0M/44.7M [00:00<00:00, 45.8MB/s]
     72%|#######1  | 32.0M/44.7M [00:00<00:00, 51.3MB/s]
     90%|########9 | 40.0M/44.7M [00:00<00:00, 56.3MB/s]
    100%|##########| 44.7M/44.7M [00:00<00:00, 57.6MB/s]
 
 
 
diff --git a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
index 0359ded3a5..e2e3d68b17 100644
--- a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
@@ -416,7 +416,7 @@ Run the corresponding model on tensorflow
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  13.151 seconds)
+   **Total running time of the script:** ( 1 minutes  12.002 seconds)
 
 
 .. _sphx_glr_download_how_to_compile_models_from_tensorflow.py:
diff --git a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
index 52bf048c35..622abaacf9 100644
--- a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
@@ -5,26 +5,26 @@
 
 Computation times
 =================
-**05:56.507** total execution time for **how_to_compile_models** files:
+**05:52.428** total execution time for **how_to_compile_models** files:
 
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``)       | 01:14.292 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``)       | 01:13.416 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:13.151 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:12.002 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``)         | 00:48.189 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``)         | 00:46.691 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``)       | 00:33.162 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``)       | 00:33.083 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``)           | 00:31.241 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``)           | 00:29.497 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``)         | 00:28.045 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``)         | 00:27.011 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``)         | 00:25.118 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``)         | 00:25.709 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``)       | 00:22.425 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``)       | 00:23.732 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``)           | 00:18.418 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``)           | 00:18.859 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``)             | 00:02.465 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``)             | 00:02.428 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
index 5407253dbd..8ddc019532 100644
--- a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
@@ -434,7 +434,7 @@ Execute on TVM
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      16.4190      16.4221      16.5149      16.3321       0.0615   
+      16.1611      16.1398      16.5247      15.9416       0.1550   
                
 
 
diff --git a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
index 619da85d98..aebadce19f 100644
--- a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
@@ -127,7 +127,7 @@ Load pre-trained maskrcnn from torchvision and do tracing
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=MaskRCNN_ResNet50_FPN_Weights.COCO_V1`. You can also use `weights=MaskRCNN_ResNet50_FPN_Weights.DEFAULT` to get the most up-to-date weights.
       warnings.warn(msg)
     Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
-
      0%|          | 0.00/170M [00:00<?, ?B/s]
      1%|1         | 2.05M/170M [00:00<00:10, 17.2MB/s]
      3%|3         | 5.18M/170M [00:00<00:08, 21.3MB/s]
      7%|6         | 11.4M/170M [00:00<00:04, 38.4MB/s]
      9%|9         | 16.0M/170M [00:00<00:04, 37.3MB/s]
     16%|#5        | 26.8M/170M [00:00<00:02, 61.2MB/s]
     24%|##3       | 40.0M/170M [00:00<00:01, 79.5MB/s]
     32%|###2      | 54.9M/170M [00:00<00:01, 102MB/s] 
     38%|###8      | 65.0M/170M [00:00<00:01, 99.9MB/s]
     44%|####4     | 74.8M/170M [00:01<00:01, 77.1MB/s]
     49%|####8     | 83.0M/170M [00:01<00:01, 74.9MB/s]
     55%|#####4    | 92.8M/170M [00:01<00:00, 81.6MB/s]
     60%|#####9    | 101M/170M [00:01<00:00, 82.1MB/s] 
     64%|######4   | 109M/170M [00:01<00:01, 60.2MB/s]
     68%|######8   | 116M/170M [00:01<00:00, 56.4MB/s]
     72%|#######1  | 122M/170M [00:01<00:00, 56.1MB/s]
     75%|#######5  | 128M/170M [00:02<00:00, 52.1MB/s]
     80%|########  | 136M/170M [00:02<00:00, 58.1MB/s]
  
    85%|########4 | 144M/170M [00:02<00:00, 62.7MB/s]
     92%|#########2| 156M/170M [00:02<00:00, 80.0MB/s]
     97%|#########6| 165M/170M [00:02<00:00, 69.5MB/s]
    100%|##########| 170M/170M [00:02<00:00, 67.5MB/s]
+
      0%|          | 0.00/170M [00:00<?, ?B/s]
      4%|3         | 6.30M/170M [00:00<00:02, 61.9MB/s]
      7%|7         | 12.2M/170M [00:00<00:03, 53.4MB/s]
     10%|#         | 17.4M/170M [00:00<00:04, 38.9MB/s]
     13%|#3        | 22.3M/170M [00:00<00:03, 41.5MB/s]
     16%|#5        | 26.5M/170M [00:00<00:03, 40.0MB/s]
     19%|#8        | 32.1M/170M [00:00<00:03, 45.2MB/s]
     24%|##3       | 40.0M/170M [00:00<00:02, 46.8MB/s]
     28%|##8       | 47.8M/170M [00:01<00:02, 55.8MB/s]
     31%|###1      | 53.4M/170M [00:01<00:02, 53.2MB/s]
     35%|###4      | 58.7M/170M [00:01<00:02, 52.2MB/s]
     38%|###7      | 63.8M/170M [00:01<00:03, 34.7MB/s]
     40%|###9      | 67.9M/170M [00:01<00:03, 31.2MB/s]
     43%|####2     | 72.5M/170M [00:01<00:02, 34.6MB/s]
     46%|####6     | 78.3M/170M [00:01<00:02, 36.1MB/s]
     48%|####8     | 82.1M/170M [00:02<00:02, 34.6MB/s]
     51%|#####     | 86.3M/170M [00:02<00:02, 34.1MB/s]
     53%|#####2    | 89.7M/170M [00:02<00:02, 31.9MB/
 s]
     56%|#####5    | 94.3M/170M [00:02<00:02, 32.5MB/s]
     57%|#####7    | 97.5M/170M [00:02<00:02, 32.6MB/s]
     60%|######    | 102M/170M [00:02<00:02, 27.4MB/s] 
     62%|######1   | 105M/170M [00:03<00:02, 24.8MB/s]
     66%|######5   | 112M/170M [00:03<00:01, 32.0MB/s]
     70%|######9   | 118M/170M [00:03<00:01, 38.5MB/s]
     72%|#######1  | 122M/170M [00:03<00:01, 38.2MB/s]
     75%|#######5  | 128M/170M [00:03<00:01, 36.7MB/s]
     80%|########  | 136M/170M [00:03<00:00, 39.3MB/s]
     85%|########4 | 144M/170M [00:03<00:00, 43.9MB/s]
     88%|########8 | 150M/170M [00:04<00:00, 31.6MB/s]
     91%|#########1| 155M/170M [00:04<00:00, 34.3MB/s]
     94%|#########4| 160M/170M [00:04<00:00, 34.3MB/s]
     99%|#########8| 168M/170M [00:04<00:00, 41.2MB/s]
    100%|##########| 170M/170M [00:04<00:00, 38.1MB/s]
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/nn/functional.py:3897: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
       for i in range(dim)
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/detection/anchor_utils.py:124: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
@@ -296,7 +296,7 @@ Get boxes with score larger than 0.9
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 3 minutes  19.999 seconds)
+   **Total running time of the script:** ( 3 minutes  22.367 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_object_detection_pytorch.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
index bbd3b18bb5..7f74bce157 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
@@ -236,7 +236,7 @@ training. Other models require a full post training calibration.
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=MobileNet_V2_Weights.IMAGENET1K_V1`. You can also use `weights=MobileNet_V2_Weights.DEFAULT` to get the most up-to-date weights.
       warnings.warn(msg)
     Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
-
      0%|          | 0.00/13.6M [00:00<?, ?B/s]
     48%|####7     | 6.46M/13.6M [00:00<00:00, 67.7MB/s]
     95%|#########5| 12.9M/13.6M [00:00<00:00, 55.8MB/s]
    100%|##########| 13.6M/13.6M [00:00<00:00, 59.2MB/s]
+
      0%|          | 0.00/13.6M [00:00<?, ?B/s]
     59%|#####8    | 7.99M/13.6M [00:00<00:00, 64.1MB/s]
    100%|##########| 13.6M/13.6M [00:00<00:00, 93.5MB/s]
 
 
 
@@ -418,7 +418,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      90.6124      90.5421      91.8263      90.1658       0.3243   
+      90.4649      90.2693      95.9496      90.0914       0.7106   
                
 
 
@@ -467,7 +467,7 @@ TODO
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  8.086 seconds)
+   **Total running time of the script:** ( 1 minutes  7.421 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_prequantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
index dcdea2bd3a..76e7b20de3 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
@@ -432,7 +432,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      121.2750     121.1947     123.1208     120.4836      0.4973   
+      120.8390     120.7152     124.1033     119.9997      0.6289   
                
 
 
@@ -469,7 +469,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 2 minutes  30.376 seconds)
+   **Total running time of the script:** ( 2 minutes  29.844 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_prequantized_tflite.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
index 34d9d0d9c1..f0afd913fe 100644
--- a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
@@ -253,7 +253,7 @@ We create a Relay VM to build and execute the model.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  31.056 seconds)
+   **Total running time of the script:** ( 1 minutes  55.880 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_quantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
index 4363690010..71af44846c 100644
--- a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
@@ -166,7 +166,7 @@ Convert and compile model for CPU.
             data: None
       input_sym_arg_type = in_param.infer_type()[0]
     Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
-
      0%|          | 0/132723 [00:00<?, ?KB/s]
      3%|2         | 3679/132723 [00:00<00:03, 36784.77KB/s]
      8%|8         | 11013/132723 [00:00<00:02, 58282.29KB/s]
     15%|#4        | 19714/132723 [00:00<00:01, 71395.71KB/s]
     21%|##1       | 28372/132723 [00:00<00:01, 77387.86KB/s]
     28%|##7       | 36976/132723 [00:00<00:01, 80506.26KB/s]
     34%|###4      | 45695/132723 [00:00<00:01, 82776.10KB/s]
     41%|####      | 54282/132723 [00:00<00:00, 83784.86KB/s]
     47%|####7     | 63028/132723 [00:00<00:00, 84951.26KB/s]
     54%|#####4    | 71712/132723 [00:00<00:00, 85539.59KB/s]
     61%|######    | 80417/132723 [00:01<00:00, 86002.08KB/s]
     67%|######7   | 89126/132723 [00:01<00:00, 86264.48KB/s]
     74%|#######3  | 97826/132723 [00:01<00:00, 86486.24KB/s]
     80%|########  | 106535/132723 [00:01<00:00, 86666.77KB/s]
     87%|########6 | 115202/132723 [00:01<00:00, 86522.76KB/s]
     93%|#########3| 123855/132723 [00:01<00:00, 86517.83KB/s]
    100%|########
 #9| 132701/132723 [00:01<00:00, 87099.53KB/s]
    100%|##########| 132723/132723 [00:01<00:00, 82750.86KB/s]
+
      0%|          | 0/132723 [00:00<?, ?KB/s]
      4%|3         | 4692/132723 [00:00<00:02, 46914.12KB/s]
     10%|#         | 13412/132723 [00:00<00:01, 70605.36KB/s]
     15%|#5        | 20473/132723 [00:00<00:01, 69856.79KB/s]
     22%|##1       | 29149/132723 [00:00<00:01, 76482.76KB/s]
     28%|##7       | 36803/132723 [00:00<00:01, 68852.31KB/s]
     34%|###4      | 45556/132723 [00:00<00:01, 74748.18KB/s]
     40%|####      | 53153/132723 [00:00<00:01, 60600.90KB/s]
     47%|####6     | 62015/132723 [00:00<00:01, 67990.43KB/s]
     52%|#####2    | 69278/132723 [00:01<00:01, 57875.78KB/s]
     58%|#####8    | 77196/132723 [00:01<00:00, 63120.16KB/s]
     63%|######3   | 84005/132723 [00:01<00:00, 55720.83KB/s]
     70%|######9   | 92761/132723 [00:01<00:00, 63467.05KB/s]
     75%|#######5  | 100009/132723 [00:01<00:00, 65789.42KB/s]
     82%|########1 | 108721/132723 [00:01<00:00, 71530.75KB/s]
     88%|########7 | 116221/132723 [00:01<00:00, 56441.08KB/s]
     94%|########
 #4| 125006/132723 [00:01<00:00, 63853.39KB/s]
    100%|##########| 132723/132723 [00:02<00:00, 65271.66KB/s]
 
 
 
@@ -242,7 +242,7 @@ Display result
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 3 minutes  5.790 seconds)
+   **Total running time of the script:** ( 3 minutes  5.709 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_ssd_gluoncv.py:
diff --git a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
index e16b076e73..0d9350ba8a 100644
--- a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
@@ -5,24 +5,24 @@
 
 Computation times
 =================
-**13:02.812** total execution time for **how_to_deploy_models** files:
+**13:29.609** total execution time for **how_to_deploy_models** files:
 
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 03:19.999 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 03:22.367 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``)                           | 03:05.790 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``)                           | 03:05.709 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``)           | 02:30.376 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``)           | 02:29.844 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``)                               | 01:31.056 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``)                               | 01:55.880 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``)                         | 01:08.086 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``)                         | 01:07.421 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``)                 | 00:37.059 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``)                 | 00:36.832 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_nano.py` (``deploy_model_on_nano.py``)                       | 00:25.535 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_nano.py` (``deploy_model_on_nano.py``)                       | 00:26.055 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``)                       | 00:24.905 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``)                       | 00:25.495 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_deploy_models_deploy_sparse.py` (``deploy_sparse.py``)                                     | 00:00.007 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
index cc69a150f5..a98753932a 100644
--- a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
@@ -472,7 +472,7 @@ First let us define two helper functions to get the mobilenet model and a cat im
 
  .. code-block:: none
 
-    Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip45149bf0-8821-4ad1-8130-cdb110608ac0 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+    Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip2692ce48-4cfd-490f-a4fe-279be8b87c67 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
 
 
 
diff --git a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
index 9e76d92bd2..af565cbe03 100644
--- a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
 
 Computation times
 =================
-**00:49.747** total execution time for **how_to_extend_tvm** files:
+**00:47.355** total execution time for **how_to_extend_tvm** files:
 
 +-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:46.247 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:43.952 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``)           | 00:02.438 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``)           | 00:02.367 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``)                     | 00:01.053 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``)                     | 00:01.028 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``)       | 00:00.009 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``)       | 00:00.008 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
index 93bba041ba..7772af5527 100644
--- a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
@@ -216,10 +216,10 @@ profile the execution time of each passes.
  .. code-block:: none
 
     Printing results of timing profile...
-    InferType: 6756us [6756us] (46.50%; 46.50%)
-    FoldScaleAxis: 7773us [6us] (53.50%; 53.50%)
-            FoldConstant: 7767us [1534us] (53.46%; 99.92%)
-                    InferType: 6233us [6233us] (42.90%; 80.25%)
+    InferType: 7305us [7305us] (47.99%; 47.99%)
+    FoldScaleAxis: 7916us [7us] (52.01%; 52.01%)
+            FoldConstant: 7909us [1631us] (51.96%; 99.92%)
+                    InferType: 6278us [6278us] (41.25%; 79.38%)
 
 
 
@@ -258,10 +258,10 @@ Refer to following sections and :py:func:`tvm.instrument.pass_instrument` for th
  .. code-block:: none
 
     Printing results of timing profile...
-    InferType: 6221us [6221us] (44.87%; 44.87%)
-    FoldScaleAxis: 7644us [5us] (55.13%; 55.13%)
-            FoldConstant: 7639us [1559us] (55.10%; 99.93%)
-                    InferType: 6081us [6081us] (43.86%; 79.59%)
+    InferType: 6272us [6272us] (44.65%; 44.65%)
+    FoldScaleAxis: 7775us [4us] (55.35%; 55.35%)
+            FoldConstant: 7771us [1613us] (55.32%; 99.94%)
+                    InferType: 6158us [6158us] (43.84%; 79.24%)
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
index 30f8046bca..e50d94a017 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
@@ -340,7 +340,7 @@ latency of convolution.
 
  .. code-block:: none
 
-    Convolution: 43.150497 ms
+    Convolution: 54.192222 ms
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
index 5ba96547e9..5883de3852 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
@@ -659,7 +659,7 @@ be able to run on our build server
 
  .. code-block:: none
 
-    conv2d with tensor core: 13.345908 ms
+    conv2d with tensor core: 13.365604 ms
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
index a6e5f9bab3..6f51889e02 100644
--- a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
@@ -143,8 +143,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
 
  .. code-block:: none
 
-    Numpy running time: 0.018938
-    Baseline: 3.437834
+    Numpy running time: 0.018726
+    Baseline: 3.452490
 
 
 
@@ -239,7 +239,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
 
  .. code-block:: none
 
-    Opt1: 0.312362
+    Opt1: 0.303881
 
 
 
@@ -342,7 +342,7 @@ In this tutorial, we chose to vectorize the inner loop row data since it is cach
 
  .. code-block:: none
 
-    Opt2: 0.337694
+    Opt2: 0.345987
 
 
 
@@ -438,7 +438,7 @@ the access pattern for A matrix is more cache friendly.
 
  .. code-block:: none
 
-    Opt3: 0.118729
+    Opt3: 0.118088
 
 
 
@@ -563,7 +563,7 @@ flattening.
 
  .. code-block:: none
 
-    Opt4: 0.109678
+    Opt4: 0.109866
 
 
 
@@ -685,7 +685,7 @@ write to C when all the block results are ready.
 
  .. code-block:: none
 
-    Opt5: 0.111738
+    Opt5: 0.111457
 
 
 
@@ -810,7 +810,7 @@ Furthermore, we can also utilize multi-core processors to do the thread-level pa
 
  .. code-block:: none
 
-    Opt6: 0.147516
+    Opt6: 0.147367
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
index 0997163598..af313b69ba 100644
--- a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
 
 Computation times
 =================
-**00:35.273** total execution time for **how_to_optimize_operators** files:
+**00:35.178** total execution time for **how_to_optimize_operators** files:
 
 +-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``)                       | 00:32.697 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``)                       | 00:32.664 | 0.0 MB |
 +-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.477 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.459 | 0.0 MB |
 +-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``)             | 00:01.099 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``)             | 00:01.055 | 0.0 MB |
 +-----------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
index 5cb35f5fed..a7111953a8 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
@@ -5,18 +5,18 @@
 
 Computation times
 =================
-**09:04.707** total execution time for **how_to_tune_with_autoscheduler** files:
+**09:11.143** total execution time for **how_to_tune_with_autoscheduler** files:
 
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 05:33.921 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 05:43.674 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``)             | 01:33.757 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``)             | 01:32.818 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``)           | 01:04.112 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``)           | 01:03.505 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``)               | 00:29.522 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``)               | 00:27.992 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``)             | 00:12.100 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``)             | 00:11.975 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``)           | 00:11.296 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``)           | 00:11.178 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
index d0571a34b7..a517282d54 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
@@ -240,45 +240,153 @@ cooperative fetching, unrolling and operator fusion.
                  compute: Buffer(compute_2: Pointer(float32), float32, [25088], [])}
       buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute}
       preflattened_buffer_map = {data_1: data_3: Buffer(data_2, float32, [1, 512, 7, 7], []), kernel_1: kernel_3: Buffer(kernel_2, float32, [512, 512, 3, 3], []), bias_1: bias_3: Buffer(bias_2, float32, [1, 512, 1, 1], []), compute_1: compute_3: Buffer(compute_2, float32, [1, 512, 7, 7], [])} {
-      attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 32;
-      allocate(conv2d_nchw: Pointer(local float32), float32, [4]), storage_scope = local;
-      allocate(pad_temp.shared: Pointer(shared float32), float32, [252]), storage_scope = shared;
-      allocate(kernel.shared: Pointer(shared float32), float32, [192]), storage_scope = shared;
-      attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196 {
-        for (ff.inner.init: int32, 0, 2) {
-          conv2d_nchw_1: Buffer(conv2d_nchw, float32, [4], [], scope="local", align=8)[ff.inner.init] = 0f32
-          conv2d_nchw_1[(ff.inner.init + 2)] = 0f32
-        }
-        for (rc.outer.outer: int32, 0, 128) {
+      attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 64;
+      allocate(conv2d_nchw: Pointer(local float32), float32, [7]), storage_scope = local;
+      allocate(pad_temp.shared: Pointer(shared float32), float32, [2016]), storage_scope = shared;
+      allocate(kernel.shared: Pointer(shared float32), float32, [768]), storage_scope = shared;
+      attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
+        conv2d_nchw_1: Buffer(conv2d_nchw, float32, [7], [], scope="local", align=16)[0] = 0f32
+        conv2d_nchw_1[1] = 0f32
+        conv2d_nchw_1[2] = 0f32
+        conv2d_nchw_1[3] = 0f32
+        conv2d_nchw_1[4] = 0f32
+        conv2d_nchw_1[5] = 0f32
+        conv2d_nchw_1[6] = 0f32
+        for (rc.outer.outer: int32, 0, 16) {
           for (rx.outer.outer: int32, 0, 3) {
-            for (ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer: int32, 0, 2) {
-              attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
-              if @tir.likely((((ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer*7) + floordiv(threadIdx.x_1, 28)) < 9), dtype=bool) {
-                pad_temp.shared_1: Buffer(pad_temp.shared, float32, [252], [], scope="shared")[((ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer*196) + threadIdx.x_1)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) [...]
+            let cse_var_2: int32 = (rc.outer.outer*1568)
+            let cse_var_1: int32 = (rc.outer.outer*288)
+             {
+              attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1: Buffer(pad_temp.shared, float32, [2016], [], scope="shared")[threadIdx.x_1] = @tir.if_then_else((((7 <= threadIdx.x_1) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((cse_var_2 + threadIdx.x_1) + rx.outer.outer) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 56)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 8), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 56), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 8), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 112)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 7), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 112), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 7), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 168)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 168), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 224)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 224), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 280)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 4), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 280), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 4), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 336)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 3), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 336), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 3), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 392)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 392), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 448)] = @tir.if_then_else((((threadIdx.x_1 < 49) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 448), 63)*49)) + ((floordiv(threadIdx.x_1, 7) + 1)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 504)] = @tir.if_then_else((((7 <= threadIdx.x_1) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((cse_var_2 + threadIdx.x_1) + rx.outer.outer) + 384)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 560)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 8), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 560), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 8), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 616)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 7), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 616), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 7), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 672)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 672), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 728)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 728), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 784)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 4), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 784), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 4), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 840)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 3), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 840), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 3), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 896)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 896), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 952)] = @tir.if_then_else((((threadIdx.x_1 < 49) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 952), 63)*49)) + ((floordiv(threadIdx.x_1, 7) + 1)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 1008)] = @tir.if_then_else((((7 <= threadIdx.x_1) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((cse_var_2 + threadIdx.x_1) + rx.outer.outer) + 776)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 1064)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 8), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1064), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 8), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 1120)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 7), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1120), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 7), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 1176)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1176), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 1232)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1232), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 1288)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 4), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1288), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 4), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 1344)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 3), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1344), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 3), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 1400)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1400), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 1456)] = @tir.if_then_else((((threadIdx.x_1 < 49) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1456), 63)*49)) + ((floordiv(threadIdx.x_1, 7) + 1)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 1512)] = @tir.if_then_else((((7 <= threadIdx.x_1) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((cse_var_2 + threadIdx.x_1) + rx.outer.outer) + 1168)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 1568)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 8), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1568), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 8), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 1624)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 7), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1624), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 7), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 1680)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1680), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 1736)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1736), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 1792)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 4), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1792), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 4), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 1848)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 3), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1848), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 3), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 1904)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1904), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              pad_temp.shared_1[(threadIdx.x_1 + 1960)] = @tir.if_then_else((((threadIdx.x_1 < 49) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1960), 63)*49)) + ((floordiv(threadIdx.x_1, 7) + 1)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              kernel.shared_1: Buffer(kernel.shared, float32, [768], [], scope="shared")[threadIdx.x_2] = kernel[((((blockIdx.x*36864) + cse_var_1) + (threadIdx.x_2*3)) + rx.outer.outer)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              kernel.shared_1[(threadIdx.x_2 + 56)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 56), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 56), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              kernel.shared_1[(threadIdx.x_2 + 112)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 112), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 96), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              kernel.shared_1[(threadIdx.x_2 + 168)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 168), 96)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 24), 32)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              kernel.shared_1[(threadIdx.x_2 + 224)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 224), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              kernel.shared_1[(threadIdx.x_2 + 280)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 280), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 88), 96), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              kernel.shared_1[(threadIdx.x_2 + 336)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 336), 96)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 16), 32)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              kernel.shared_1[(threadIdx.x_2 + 392)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 392), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 8), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              kernel.shared_1[(threadIdx.x_2 + 448)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 448), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 96), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              kernel.shared_1[(threadIdx.x_2 + 504)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 504), 96)*4608)) + cse_var_1) + ((floordiv(threadIdx.x_2, 3) + 8)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              kernel.shared_1[(threadIdx.x_2 + 560)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 560), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              kernel.shared_1[(threadIdx.x_2 + 616)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 616), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 40), 96), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              kernel.shared_1[(threadIdx.x_2 + 672)] = kernel[(((((blockIdx.x*36864) + cse_var_1) + (threadIdx.x_2*3)) + rx.outer.outer) + 32256)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
+              if @tir.likely((threadIdx.x_2 < 40), dtype=bool) {
+                kernel.shared_1[(threadIdx.x_2 + 728)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 728), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 56), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
               }
-            }
-            attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 196;
-            if @tir.likely((threadIdx.x_2 < 192), dtype=bool) {
-              kernel.shared_1: Buffer(kernel.shared, float32, [192], [], scope="shared")[threadIdx.x_2] = kernel[(((((blockIdx.x*73728) + (floordiv(threadIdx.x_2, 12)*4608)) + (rc.outer.outer*36)) + (floormod(threadIdx.x_2, 12)*3)) + rx.outer.outer)]
-            }
-            for (rc.outer.inner: int32, 0, 2) {
-              for (ry.outer.inner: int32, 0, 3) {
-                for (rc.inner: int32, 0, 2) {
-                  for (ff.inner: int32, 0, 2) {
-                    let cse_var_1: int32 = (ff.inner + 2)
-                     {
-                      conv2d_nchw_1[ff.inner] = (conv2d_nchw_1[ff.inner] + (pad_temp.shared_1[((((rc.outer.inner*126) + (rc.inner*63)) + (ry.outer.inner*7)) + floormod(threadIdx.x, 49))]*kernel.shared_1[(((((floordiv(threadIdx.x, 49)*24) + (ff.inner*12)) + (rc.outer.inner*6)) + (rc.inner*3)) + ry.outer.inner)]))
-                      conv2d_nchw_1[cse_var_1] = (conv2d_nchw_1[cse_var_1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (rc.inner*63)) + (ry.outer.inner*7)) + floormod(threadIdx.x, 49))]*kernel.shared_1[((((((floordiv(threadIdx.x, 49)*24) + (ff.inner*12)) + (rc.outer.inner*6)) + (rc.inner*3)) + ry.outer.inner) + 96)]))
-                    }
-                  }
-                }
+              for (rc.outer.inner: int32, 0, 32) {
+                conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7))]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3))]))
+                conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 1)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3))]))
+                conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 2)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3))]))
+                conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 3)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3))]))
+                conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 4)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3))]))
+                conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 5)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3))]))
+                conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 6)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3))]))
+                conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 7)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 1)]))
+                conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 8)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 1)]))
+                conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 9)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 1)]))
+                conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 10)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 1)]))
+                conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 1)]))
+                conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 12)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 1)]))
+                conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 13)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 1)]))
+                conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 2)]))
+                conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 15)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 2)]))
+                conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 16)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 2)]))
+                conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 17)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 2)]))
+                conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 18)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 2)]))
+                conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 2)]))
+                conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 2)]))
               }
             }
           }
         }
-        for (i1.inner: int32, 0, 2) {
-          compute[((((blockIdx.x*784) + (floordiv(threadIdx.x, 49)*98)) + (i1.inner*49)) + floormod(threadIdx.x, 49))] = max((conv2d_nchw_1[i1.inner] + bias[(((blockIdx.x*16) + (floordiv(threadIdx.x, 49)*2)) + i1.inner)]), 0f32)
-          compute[(((((blockIdx.x*784) + (floordiv(threadIdx.x, 49)*98)) + (i1.inner*49)) + floormod(threadIdx.x, 49)) + 392)] = max((conv2d_nchw_1[(i1.inner + 2)] + bias[((((blockIdx.x*16) + (floordiv(threadIdx.x, 49)*2)) + i1.inner) + 8)]), 0f32)
+        for (i3.inner: int32, 0, 7) {
+          compute[(((blockIdx.x*392) + (threadIdx.x*7)) + i3.inner)] = max((conv2d_nchw_1[i3.inner] + bias[((blockIdx.x*8) + floordiv(threadIdx.x, 7))]), 0f32)
         }
       }
     }
@@ -333,7 +441,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 0.349 ms
+    Execution time of this operator: 0.368 ms
 
 
 
@@ -381,20 +489,20 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     conv2d_nchw_nn_o_o_i, conv2d_nchw_nn_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_i, factor=1)
     conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1)
     conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
-    conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=2)
+    conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
     conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
-    conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=4)
-    conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=2)
+    conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=8)
+    conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
     conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
     conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
     conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7)
     conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
     conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
-    conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
-    conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
+    conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
+    conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
     conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
-    conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
-    conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=2)
+    conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=1)
+    conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=32)
     conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
     conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=3)
     conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
@@ -403,14 +511,14 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
     compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
     compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
-    compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
-    compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=4)
-    compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=2)
+    compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=1)
+    compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=8)
+    compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
     compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
     compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7)
     compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
-    compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
-    compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
+    compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
+    compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
     compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
     s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
     s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -430,14 +538,14 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
     kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
     s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=196)
+    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=56)
     s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
     pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
     pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
     s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=196)
+    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=56)
     s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
-    s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 0)
+    s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 64)
     s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
 
     CUDA source code:
@@ -455,41 +563,100 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
       #define int64_t long long
       #define uint64_t unsigned long long
     #endif
-    extern "C" __global__ void __launch_bounds__(196) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
-      float conv2d_nchw[4];
-      __shared__ float pad_temp_shared[252];
-      __shared__ float kernel_shared[192];
-      for (int ff_inner_init = 0; ff_inner_init < 2; ++ff_inner_init) {
-        conv2d_nchw[ff_inner_init] = 0.000000e+00f;
-        conv2d_nchw[(ff_inner_init + 2)] = 0.000000e+00f;
-      }
-      for (int rc_outer_outer = 0; rc_outer_outer < 128; ++rc_outer_outer) {
+    extern "C" __global__ void __launch_bounds__(56) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+      float conv2d_nchw[7];
+      __shared__ float pad_temp_shared[2016];
+      __shared__ float kernel_shared[768];
+      conv2d_nchw[0] = 0.000000e+00f;
+      conv2d_nchw[1] = 0.000000e+00f;
+      conv2d_nchw[2] = 0.000000e+00f;
+      conv2d_nchw[3] = 0.000000e+00f;
+      conv2d_nchw[4] = 0.000000e+00f;
+      conv2d_nchw[5] = 0.000000e+00f;
+      conv2d_nchw[6] = 0.000000e+00f;
+      for (int rc_outer_outer = 0; rc_outer_outer < 16; ++rc_outer_outer) {
         for (int rx_outer_outer = 0; rx_outer_outer < 3; ++rx_outer_outer) {
           __syncthreads();
-          for (int ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = 0; ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer < 2; ++ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer) {
-            if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) / 28)) < 9) {
-              pad_temp_shared[((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 196) + ((int)threadIdx.x))] = (((((1 <= (((((int)threadIdx.x) / 7) + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer) % 9)) && ((((((int)threadIdx.x) / 7) + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 196) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2 [...]
-            }
-          }
-          if (((int)threadIdx.x) < 192) {
-            kernel_shared[((int)threadIdx.x)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 36)) + ((((int)threadIdx.x) % 12) * 3)) + rx_outer_outer)];
+          pad_temp_shared[((int)threadIdx.x)] = ((((7 <= ((int)threadIdx.x)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((rc_outer_outer * 1568) + ((int)threadIdx.x)) + rx_outer_outer) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 56)] = (((((1 <= (((((int)threadIdx.x) / 7) + 8) % 9)) && ((((((int)threadIdx.x) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 56) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 8) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 112)] = (((((1 <= (((((int)threadIdx.x) / 7) + 7) % 9)) && ((((((int)threadIdx.x) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 112) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 7) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 168)] = (((((1 <= (((((int)threadIdx.x) / 7) + 6) % 9)) && ((((((int)threadIdx.x) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 168) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 224)] = (((((1 <= (((((int)threadIdx.x) / 7) + 5) % 9)) && ((((((int)threadIdx.x) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 224) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 280)] = (((((1 <= (((((int)threadIdx.x) / 7) + 4) % 9)) && ((((((int)threadIdx.x) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 280) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 4) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 336)] = (((((1 <= (((((int)threadIdx.x) / 7) + 3) % 9)) && ((((((int)threadIdx.x) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 336) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 3) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 392)] = (((((1 <= (((((int)threadIdx.x) / 7) + 2) % 9)) && ((((((int)threadIdx.x) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 392) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 448)] = ((((((int)threadIdx.x) < 49) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 448) / 63) * 49)) + ((int)threadIdx.x)) + rx_outer_outer) - 1)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 504)] = ((((7 <= ((int)threadIdx.x)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((rc_outer_outer * 1568) + ((int)threadIdx.x)) + rx_outer_outer) + 384)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 560)] = (((((1 <= (((((int)threadIdx.x) / 7) + 8) % 9)) && ((((((int)threadIdx.x) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 560) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 8) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 616)] = (((((1 <= (((((int)threadIdx.x) / 7) + 7) % 9)) && ((((((int)threadIdx.x) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 616) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 7) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 672)] = (((((1 <= (((((int)threadIdx.x) / 7) + 6) % 9)) && ((((((int)threadIdx.x) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 672) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 728)] = (((((1 <= (((((int)threadIdx.x) / 7) + 5) % 9)) && ((((((int)threadIdx.x) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 728) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 784)] = (((((1 <= (((((int)threadIdx.x) / 7) + 4) % 9)) && ((((((int)threadIdx.x) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 784) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 4) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 840)] = (((((1 <= (((((int)threadIdx.x) / 7) + 3) % 9)) && ((((((int)threadIdx.x) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 840) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 3) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 896)] = (((((1 <= (((((int)threadIdx.x) / 7) + 2) % 9)) && ((((((int)threadIdx.x) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 896) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 952)] = ((((((int)threadIdx.x) < 49) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 952) / 63) * 49)) + ((int)threadIdx.x)) + rx_outer_outer) - 1)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1008)] = ((((7 <= ((int)threadIdx.x)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((rc_outer_outer * 1568) + ((int)threadIdx.x)) + rx_outer_outer) + 776)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1064)] = (((((1 <= (((((int)threadIdx.x) / 7) + 8) % 9)) && ((((((int)threadIdx.x) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1064) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 8) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1120)] = (((((1 <= (((((int)threadIdx.x) / 7) + 7) % 9)) && ((((((int)threadIdx.x) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1120) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 7) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1176)] = (((((1 <= (((((int)threadIdx.x) / 7) + 6) % 9)) && ((((((int)threadIdx.x) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1176) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1232)] = (((((1 <= (((((int)threadIdx.x) / 7) + 5) % 9)) && ((((((int)threadIdx.x) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1232) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1288)] = (((((1 <= (((((int)threadIdx.x) / 7) + 4) % 9)) && ((((((int)threadIdx.x) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1288) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 4) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1344)] = (((((1 <= (((((int)threadIdx.x) / 7) + 3) % 9)) && ((((((int)threadIdx.x) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1344) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 3) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1400)] = (((((1 <= (((((int)threadIdx.x) / 7) + 2) % 9)) && ((((((int)threadIdx.x) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1400) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1456)] = ((((((int)threadIdx.x) < 49) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1456) / 63) * 49)) + ((int)threadIdx.x)) + rx_outer_outer) - 1)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1512)] = ((((7 <= ((int)threadIdx.x)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((rc_outer_outer * 1568) + ((int)threadIdx.x)) + rx_outer_outer) + 1168)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1568)] = (((((1 <= (((((int)threadIdx.x) / 7) + 8) % 9)) && ((((((int)threadIdx.x) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1568) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 8) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1624)] = (((((1 <= (((((int)threadIdx.x) / 7) + 7) % 9)) && ((((((int)threadIdx.x) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1624) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 7) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1680)] = (((((1 <= (((((int)threadIdx.x) / 7) + 6) % 9)) && ((((((int)threadIdx.x) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1680) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1736)] = (((((1 <= (((((int)threadIdx.x) / 7) + 5) % 9)) && ((((((int)threadIdx.x) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1736) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1792)] = (((((1 <= (((((int)threadIdx.x) / 7) + 4) % 9)) && ((((((int)threadIdx.x) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1792) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 4) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1848)] = (((((1 <= (((((int)threadIdx.x) / 7) + 3) % 9)) && ((((((int)threadIdx.x) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1848) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 3) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1904)] = (((((1 <= (((((int)threadIdx.x) / 7) + 2) % 9)) && ((((((int)threadIdx.x) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1904) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) + 1960)] = ((((((int)threadIdx.x) < 49) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1960) / 63) * 49)) + ((int)threadIdx.x)) + rx_outer_outer) - 1)] : 0.000000e+00f);
+          kernel_shared[((int)threadIdx.x)] = kernel[((((((int)blockIdx.x) * 36864) + (rc_outer_outer * 288)) + (((int)threadIdx.x) * 3)) + rx_outer_outer)];
+          kernel_shared[(((int)threadIdx.x) + 56)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 56) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 56) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+          kernel_shared[(((int)threadIdx.x) + 112)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 112) / 96) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 16) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+          kernel_shared[(((int)threadIdx.x) + 168)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 168) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) / 3) + 24) & 31) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+          kernel_shared[(((int)threadIdx.x) + 224)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 224) / 96) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 32) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+          kernel_shared[(((int)threadIdx.x) + 280)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 280) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 88) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+          kernel_shared[(((int)threadIdx.x) + 336)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 336) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) / 3) + 16) & 31) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+          kernel_shared[(((int)threadIdx.x) + 392)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 392) / 96) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 8) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+          kernel_shared[(((int)threadIdx.x) + 448)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 448) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 64) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+          kernel_shared[(((int)threadIdx.x) + 504)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 504) / 96) * 4608)) + (rc_outer_outer * 288)) + (((int)threadIdx.x) * 3)) + rx_outer_outer) + 72)];
+          kernel_shared[(((int)threadIdx.x) + 560)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 560) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 80) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+          kernel_shared[(((int)threadIdx.x) + 616)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 616) / 96) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 40) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+          kernel_shared[(((int)threadIdx.x) + 672)] = kernel[(((((((int)blockIdx.x) * 36864) + (rc_outer_outer * 288)) + (((int)threadIdx.x) * 3)) + rx_outer_outer) + 32256)];
+          if (((int)threadIdx.x) < 40) {
+            kernel_shared[(((int)threadIdx.x) + 728)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 728) / 96) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 56) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
           }
           __syncthreads();
-          for (int rc_outer_inner = 0; rc_outer_inner < 2; ++rc_outer_inner) {
-            for (int ry_outer_inner = 0; ry_outer_inner < 3; ++ry_outer_inner) {
-              for (int rc_inner = 0; rc_inner < 2; ++rc_inner) {
-                for (int ff_inner = 0; ff_inner < 2; ++ff_inner) {
-                  conv2d_nchw[ff_inner] = (conv2d_nchw[ff_inner] + (pad_temp_shared[((((rc_outer_inner * 126) + (rc_inner * 63)) + (ry_outer_inner * 7)) + (((int)threadIdx.x) % 49))] * kernel_shared[((((((((int)threadIdx.x) / 49) * 24) + (ff_inner * 12)) + (rc_outer_inner * 6)) + (rc_inner * 3)) + ry_outer_inner)]));
-                  conv2d_nchw[(ff_inner + 2)] = (conv2d_nchw[(ff_inner + 2)] + (pad_temp_shared[((((rc_outer_inner * 126) + (rc_inner * 63)) + (ry_outer_inner * 7)) + (((int)threadIdx.x) % 49))] * kernel_shared[(((((((((int)threadIdx.x) / 49) * 24) + (ff_inner * 12)) + (rc_outer_inner * 6)) + (rc_inner * 3)) + ry_outer_inner) + 96)]));
-                }
-              }
-            }
+          for (int rc_outer_inner = 0; rc_outer_inner < 32; ++rc_outer_inner) {
+            conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7))] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3))]));
+            conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 1)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3))]));
+            conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3))]));
+            conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3))]));
+            conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 4)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3))]));
+            conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 5)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3))]));
+            conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 6)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3))]));
+            conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 7)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 1)]));
+            conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 8)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 1)]));
+            conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 9)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 1)]));
+            conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 1)]));
+            conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 1)]));
+            conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 12)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 1)]));
+            conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 13)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 1)]));
+            conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 2)]));
+            conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 15)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 2)]));
+            conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 16)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 2)]));
+            conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 17)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 2)]));
+            conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 2)]));
+            conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 2)]));
+            conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 2)]));
           }
         }
       }
-      for (int i1_inner = 0; i1_inner < 2; ++i1_inner) {
-        compute[((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 49) * 98)) + (i1_inner * 49)) + (((int)threadIdx.x) % 49))] = max((conv2d_nchw[i1_inner] + bias[(((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 49) * 2)) + i1_inner)]), 0.000000e+00f);
-        compute[(((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 49) * 98)) + (i1_inner * 49)) + (((int)threadIdx.x) % 49)) + 392)] = max((conv2d_nchw[(i1_inner + 2)] + bias[((((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 49) * 2)) + i1_inner) + 8)]), 0.000000e+00f);
+      for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
+        compute[(((((int)blockIdx.x) * 392) + (((int)threadIdx.x) * 7)) + i3_inner)] = max((conv2d_nchw[i3_inner] + bias[((((int)blockIdx.x) * 8) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
       }
     }
 
@@ -551,7 +718,7 @@ In the example below we resume the status and do more 5 trials.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 5 minutes  33.921 seconds)
+   **Total running time of the script:** ( 5 minutes  43.674 seconds)
 
 
 .. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
index 597d3175cb..78fd1a61a5 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
@@ -643,7 +643,7 @@ so we can read the log file and load the best schedules.
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-       8.2136       8.2150       8.2171       8.2088       0.0035   
+       8.1730       8.1716       8.1796       8.1679       0.0049   
                
 
 
@@ -671,7 +671,7 @@ Other Tips
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  4.112 seconds)
+   **Total running time of the script:** ( 1 minutes  3.505 seconds)
 
 
 .. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_cuda.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
index 033ffd5144..6c9a6573a2 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
@@ -662,7 +662,7 @@ so we can read the log file and load the best schedules.
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      760.2186     760.4327     761.2765     758.9464      0.9632   
+      755.8821     755.1885     757.4651     754.9928      1.1222   
                
 
 
@@ -690,7 +690,7 @@ Other Tips
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  33.757 seconds)
+   **Total running time of the script:** ( 1 minutes  32.818 seconds)
 
 
 .. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_x86.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
index 1a7188f31c..62ea8ccc7d 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
@@ -386,13 +386,13 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
                  placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [65536], []),
                  compute: Buffer(compute_2: Pointer(float32), float32, [65536], [])}
       buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute}
-      preflattened_buffer_map = {placeholder_8: placeholder_15: Buffer(placeholder_13, int32, [33], []), placeholder_7: placeholder_16: Buffer(placeholder_12, int32, [4916], []), placeholder_5: placeholder_17: Buffer(placeholder_10, float32, [128, 256], []), placeholder_9: placeholder_18: Buffer(placeholder_14, float32, [128, 512], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_6: placeholder_19: Buffer(placeholder_11, float32, [4916, 16, 1], [])} {
-      for (i0.outer.i1.outer.fused: int32, 0, 512) "parallel" {
-        allocate(compute_4: Pointer(global float32), float32, [128]), storage_scope = global {
-          for (i.inner.init: int32, 0, 8) {
+      preflattened_buffer_map = {compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_6: placeholder_15: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_5: placeholder_16: Buffer(placeholder_10, float32, [128, 256], []), placeholder_7: placeholder_17: Buffer(placeholder_12, int32, [4916], []), placeholder_8: placeholder_18: Buffer(placeholder_13, int32, [33], []), placeholder_9: placeholder_19: Buffer(placeholder_14, float32, [128, 512], [])} {
+      for (i0.outer.i1.outer.fused: int32, 0, 64) "parallel" {
+        allocate(compute_4: Pointer(global float32), float32, [1024]), storage_scope = global {
+          for (i.inner.init: int32, 0, 64) {
             let cse_var_1: int32 = (i.inner.init*16)
              {
-              compute_5: Buffer(compute_4, float32, [128], [])[cse_var_1] = 0f32
+              compute_5: Buffer(compute_4, float32, [1024], [])[cse_var_1] = 0f32
               compute_5[(cse_var_1 + 1)] = 0f32
               compute_5[(cse_var_1 + 2)] = 0f32
               compute_5[(cse_var_1 + 3)] = 0f32
@@ -411,78 +411,78 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
             }
           }
           for (elem_idx: int32, 0, let cse_var_2: int32 = floormod(i0.outer.i1.outer.fused, 32) in (placeholder_3[(cse_var_2 + 1)] - placeholder_3[cse_var_2])) {
-            for (i.inner: int32, 0, 8) {
+            for (i.inner: int32, 0, 64) {
               let cse_var_3: int32 = floormod(i0.outer.i1.outer.fused, 32)
                {
                 if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
                   let cse_var_4: int32 = (i.inner*16)
-                  compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[((placeholder_3[cse_var_3]*16) + (elem_idx*16))]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+                  compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[((placeholder_3[cse_var_3]*16) + (elem_idx*16))]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
                 }
                 if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
                   let cse_var_5: int32 = ((i.inner*16) + 1)
-                  compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 1)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+                  compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 1)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
                 }
                 if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
                   let cse_var_6: int32 = ((i.inner*16) + 2)
-                  compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 2)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+                  compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 2)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
                 }
                 if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
                   let cse_var_7: int32 = ((i.inner*16) + 3)
-                  compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 3)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+                  compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 3)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
                 }
                 if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
                   let cse_var_8: int32 = ((i.inner*16) + 4)
-                  compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 4)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+                  compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 4)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
                 }
                 if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
                   let cse_var_9: int32 = ((i.inner*16) + 5)
-                  compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 5)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+                  compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 5)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
                 }
                 if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
                   let cse_var_10: int32 = ((i.inner*16) + 6)
-                  compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 6)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+                  compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 6)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
                 }
                 if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
                   let cse_var_11: int32 = ((i.inner*16) + 7)
-                  compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 7)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+                  compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 7)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
                 }
                 if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
                   let cse_var_12: int32 = ((i.inner*16) + 8)
-                  compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 8)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+                  compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 8)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
                 }
                 if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
                   let cse_var_13: int32 = ((i.inner*16) + 9)
-                  compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 9)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+                  compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 9)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
                 }
                 if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
                   let cse_var_14: int32 = ((i.inner*16) + 10)
-                  compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 10)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+                  compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 10)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
                 }
                 if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
                   let cse_var_15: int32 = ((i.inner*16) + 11)
-                  compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 11)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+                  compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 11)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
                 }
                 if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
                   let cse_var_16: int32 = ((i.inner*16) + 12)
-                  compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 12)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+                  compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 12)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
                 }
                 if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
                   let cse_var_17: int32 = ((i.inner*16) + 13)
-                  compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 13)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+                  compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 13)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
                 }
                 if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
                   let cse_var_18: int32 = ((i.inner*16) + 14)
-                  compute_5[cse_var_18] = (compute_5[cse_var_18] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 14)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+                  compute_5[cse_var_18] = (compute_5[cse_var_18] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 14)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
                 }
                 if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
                   let cse_var_19: int32 = ((i.inner*16) + 15)
-                  compute_5[cse_var_19] = (compute_5[cse_var_19] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 15)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+                  compute_5[cse_var_19] = (compute_5[cse_var_19] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 15)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
                 }
               }
             }
           }
-          for (i0.inner: int32, 0, 8) {
-            let cse_var_20: int32 = (((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 32)*16))
+          for (i0.inner: int32, 0, 64) {
+            let cse_var_20: int32 = (((floordiv(i0.outer.i1.outer.fused, 32)*32768) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 32)*16))
             compute[ramp(cse_var_20, 1, 16)] = max((compute_5[ramp((i0.inner*16), 1, 16)] + placeholder_4[ramp(cse_var_20, 1, 16)]), broadcast(0f32, 16))
           }
         }
@@ -539,7 +539,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 1.868 ms
+    Execution time of this operator: 1.844 ms
 
 
 
diff --git a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
index c578ee91b2..8488b444b3 100644
--- a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
 
 Computation times
 =================
-**00:37.796** total execution time for **how_to_tune_with_autotvm** files:
+**00:42.071** total execution time for **how_to_tune_with_autotvm** files:
 
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``)           | 00:37.760 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``)           | 00:42.036 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``)               | 00:00.021 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``)               | 00:00.020 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``)             | 00:00.005 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
index 0047e70fac..71316b47da 100644
--- a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
@@ -265,7 +265,9 @@ for this template
     waiting for device...
     device available
     Get devices for measurement successfully!
-    No: 1   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
+    No: 1   GFLOPS: 1.43/1.43       result: MeasureResult(costs=(0.16210441574999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=9.74929690361023, timestamp=1667866553.1932828)  [('tile_f', [-1, 32, 4, 2]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9682937
+    No: 2   GFLOPS: 148.64/148.64   result: MeasureResult(costs=(0.0015574340776699027,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.8080589771270752, timestamp=1667866554.1134543)      [('tile_f', [-1, 1, 16, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1677314
+    No: 3   GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -387,8 +389,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 8, 8, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 512]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8128642
-    No: 2   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 32, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 64, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1027441
+    No: 4   GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -510,8 +512,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 16, 4, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 64, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10348919
-    No: 3   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 2, 64]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 64]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5775206
+    No: 5   GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -633,9 +635,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 16, 8, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 512]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7549163
-    No: 4   GFLOPS: 248.27/248.27   result: MeasureResult(costs=(0.0009324433863636363,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.7833104133605957, timestamp=1667865851.5936587)      [('tile_f', [-1, 1, 64, 2]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 4]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,653714
-    No: 5   GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 32, 4, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2203420
+    No: 6   GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -757,9 +758,9 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 8, 8]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 512]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6581895
-    No: 6   GFLOPS: 108.35/248.27   result: MeasureResult(costs=(0.002136543255319149,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.1016724109649658, timestamp=1667865852.8906536)       [('tile_f', [-1, 2, 1, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,293536
-    No: 7   GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 64, 8]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6348043
+    No: 7   GFLOPS: 50.89/148.64    result: MeasureResult(costs=(0.004549476272727272,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.3591265678405762, timestamp=1667866558.40512) [('tile_f', [-1, 4, 32, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2326762
+    No: 8   GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -881,8 +882,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 8, 16]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 32, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2183021
-    No: 8   GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 512, 1, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 512]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9292369
+    No: 9   GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1004,8 +1005,26 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 8, 1, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 8, 64]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5592898
-    No: 9   GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 128, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 512]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2710231
+    No: 10  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 142, in build
+        res = future.result()
+      File "/usr/lib/python3.7/concurrent/futures/_base.py", line 435, in result
+        return self.__get_result()
+      File "/usr/lib/python3.7/concurrent/futures/_base.py", line 384, in __get_result
+        raise self._exception
+      File "/usr/lib/python3.7/concurrent/futures/thread.py", line 57, in run
+        result = self.fn(*self.args, **self.kwargs)
+      File "/workspace/python/tvm/contrib/popen_pool.py", line 432, in <lambda>
+        worker = lambda *args: self._worker_run(*args)
+      File "/workspace/python/tvm/contrib/popen_pool.py", line 401, in _worker_run
+        return proc.recv()
+      File "/workspace/python/tvm/contrib/popen_pool.py", line 309, in recv
+        raise TimeoutError()
+    TimeoutError
+
+            [('tile_f', [-1, 16, 2, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7400252
+    No: 11  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1127,8 +1146,9 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 128, 1, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2612342
-    No: 10  GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 2, 16]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10339952
+    No: 12  GFLOPS: 86.81/148.64    result: MeasureResult(costs=(0.0026666176739130435,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.2049615383148193, timestamp=1667866569.3942392)      [('tile_f', [-1, 1, 2, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5228584
+    No: 13  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1250,8 +1270,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 2, 64]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 16, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9344264
-    No: 11  GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 64, 2, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,55114
+    No: 14  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1373,8 +1393,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 16, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 128, 4]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4160547
-    No: 12  GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 8, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 256, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7193023
+    No: 15  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1496,8 +1516,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 8, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 16, 32]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5575709
-    No: 13  GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 64, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 512]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3096765
+    No: 16  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1619,8 +1639,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 512, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 128]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3076534
-    No: 14  GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 128, 2, 2]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 64]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3648331
+    No: 17  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1742,161 +1762,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 32, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 64]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10235852
-    No: 15  GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
-      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 738, in __call__
-        yield remote, remote.load_module(os.path.split(build_result.filename)[1])
-      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 702, in run_through_rpc
-        costs = time_f(*args).results
-      File "/workspace/python/tvm/runtime/module.py", line 357, in evaluator
-        blob = feval(*args)
-      File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
-      File "tvm/_ffi/_cython/./packed_func.pxi", line 262, in tvm._ffi._cy3.core.FuncCall
-      File "tvm/_ffi/_cython/./packed_func.pxi", line 251, in tvm._ffi._cy3.core.FuncCall3
-      File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
-    tvm._ffi.base.TVMError: Traceback (most recent call last):
-      4: TVMFuncCall
-            at ../src/runtime/c_runtime_api.cc:477
-      3: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
-            at ../include/tvm/runtime/packed_func.h:1217
-      2: tvm::runtime::RPCWrappedFunc::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
-            at ../src/runtime/rpc/rpc_module.cc:129
-      1: tvm::runtime::RPCClientSession::CallFunc(void*, TVMValue const*, int const*, int, std::function<void (tvm::runtime::TVMArgs)> const&)
-            at ../src/runtime/rpc/rpc_endpoint.cc:1012
-      0: tvm::runtime::RPCEndpoint::CallFunc(void*, TVMValue const*, int const*, int, std::function<void (tvm::runtime::TVMArgs)>)
-            at ../src/runtime/rpc/rpc_endpoint.cc:804
-      File "../src/runtime/rpc/rpc_endpoint.cc", line 804
-    TVMError: 
-    ---------------------------------------------------------------
-    An error occurred during the execution of TVM.
-    For more information, please see: https://tvm.apache.org/docs/errors.html
-    ---------------------------------------------------------------
-      Check failed: (code == RPCCode::kReturn) is false: code=kShutdown
-
-    During handling of the above exception, another exception occurred:
-
-    Traceback (most recent call last):
-      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 702, in run_through_rpc
-        costs = time_f(*args).results
-      File "/usr/lib/python3.7/contextlib.py", line 130, in __exit__
-        self.gen.throw(type, value, traceback)
-      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 742, in __call__
-        remote.remove(build_result.filename)
-      File "/workspace/python/tvm/rpc/client.py", line 144, in remove
-        self._remote_funcs["remove"] = self.get_function("tvm.rpc.server.remove")
-      File "/workspace/python/tvm/rpc/client.py", line 72, in get_function
-        return self._sess.get_function(name)
-      File "/workspace/python/tvm/runtime/module.py", line 171, in get_function
-        self.handle, c_str(name), ctypes.c_int(query_imports), ctypes.byref(ret_handle)
-      File "/workspace/python/tvm/_ffi/base.py", line 348, in check_call
-        raise get_last_ffi_error()
-    tvm._ffi.base.TVMError: Traceback (most recent call last):
-      52: 0xffffffffffffffff
-      51: _start
-      50: __libc_start_main
-      49: _Py_UnixMain
-      48: 0x0000000000650da0
-      47: 0x0000000000650afa
-      46: _PyFunction_FastCallDict
-      45: _PyEval_EvalCodeWithName
-      44: _PyEval_EvalFrameDefault
-      43: _PyFunction_FastCallKeywords
-      42: _PyEval_EvalCodeWithName
-      41: _PyEval_EvalFrameDefault
-      40: _PyMethodDef_RawFastCallKeywords
-      39: 0x0000000000546369
-      38: _PyEval_EvalCodeWithName
-      37: _PyEval_EvalFrameDefault
-      36: _PyFunction_FastCallKeywords
-      35: _PyEval_EvalCodeWithName
-      34: _PyEval_EvalFrameDefault
-      33: _PyFunction_FastCallDict
-      32: _PyEval_EvalCodeWithName
-      31: _PyEval_EvalFrameDefault
-      30: _PyObject_FastCallDict
-      29: 0x00000000004c06e1
-      28: _PyFunction_FastCallDict
-      27: _PyEval_EvalFrameDefault
-      26: _PyMethodDescr_FastCallKeywords
-      25: 0x00000000005dcb58
-      24: 0x00000000005dc83f
-      23: 0x00000000004ba127
-      22: _PyEval_EvalFrameDefault
-      21: _PyFunction_FastCallKeywords
-      20: _PyEval_EvalFrameDefault
-      19: _PyFunction_FastCallKeywords
-      18: _PyEval_EvalFrameDefault
-      17: _PyFunction_FastCallKeywords
-      16: _PyEval_EvalCodeWithName
-      15: _PyEval_EvalFrameDefault
-      14: 0x0000000000537c30
-      13: _PyObject_FastCallKeywords
-      12: 0x00007f8879914fa2
-      11: _ctypes_callproc
-      10: ffi_call
-      9: ffi_call_unix64
-      8: TVMModGetFunction
-            at ../src/runtime/c_runtime_api.cc:408
-      7: tvm::runtime::ModuleNode::GetFunction(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool)
-            at ../src/runtime/module.cc:66
-      6: tvm::runtime::RPCModuleNode::GetFunction(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)
-            at ../src/runtime/rpc/rpc_module.cc:185
-      5: tvm::runtime::RPCClientSession::GetFunction(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)
-            at ../src/runtime/rpc/rpc_endpoint.cc:1007
-      4: tvm::runtime::TVMRetValue tvm::runtime::RPCEndpoint::SysCallRemote<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>(tvm::runtime::RPCCode, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)
-            at ../src/runtime/rpc/rpc_endpoint.h:223
-      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>(int&&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) const
-            at ../include/tvm/runtime/packed_func.h:1618
-      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
-            at ../include/tvm/runtime/packed_func.h:1217
-      1: Call
-            at ../include/tvm/runtime/packed_func.h:1213
-      0: operator()
-            at ../src/runtime/rpc/rpc_endpoint.cc:684
-      File "../src/runtime/rpc/rpc_endpoint.cc", line 684
-    TVMError: 
-    ---------------------------------------------------------------
-    An error occurred during the execution of TVM.
-    For more information, please see: https://tvm.apache.org/docs/errors.html
-    ---------------------------------------------------------------
-      Check failed: (code == RPCCode::kReturn) is false: code=1
-
-    Traceback (most recent call last):
-      52: 0xffffffffffffffff
-      51: _start
-      50: __libc_start_main
-      49: _Py_UnixMain
-      48: 0x0000000000650da0
-      47: 0x0000000000650afa
-      46: _PyFunction_FastCallDict
-      45: _PyEval_EvalCodeWithName
-      44: _PyEval_EvalFrameDefault
-      43: _PyFunction_FastCallKeywords
-      42: _PyEval_EvalCodeWithName
-      41: _PyEval_EvalFrameDefault
-      40: _PyMethodDef_RawFastCallKeywords
-      39: 0x0000000000546369
-      38: _PyEval_EvalCodeWithName
-      37: _PyEval_EvalFrameDefault
-      36: _PyFunction_FastCallKeywords
-      35: _PyEval_EvalCodeWithName
-      34: _PyEval_EvalFrameDefault
-      33: _PyFunction_FastCallDict
-      32: _PyEval_EvalCodeWithName
-      31: _PyEval_EvalFrameDefault
-      30: _PyObject_FastCallDict
-      29: 0x00000000004c06e1
-      28: _PyFunction_FastCallDict
-      27: _PyEval_EvalFrameDefault
-      26: _PyMethodDescr_FastCallKeywords
-      25: 0x00000000005dcb58
-      24: 0x00000000005dc83f
-      23: 0x00000000004ba127
-      22: _PyEval_EvalFrameDefault
-      21: _PyFunction_FastCallKeywords
-      20: _PyEval_EvalFrameDefault
-      19: _PyFunction_FastCall      [('tile_f', [-1, 2, 1, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7068077
-    No: 16  GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 64, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 8, 64]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4430156
+    No: 18  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2018,9 +1885,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 64, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 32, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9120014
-    No: 17  GFLOPS: 73.22/248.27    result: MeasureResult(costs=(0.0031619162285714286,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.0219228267669678, timestamp=1667865870.0502481)      [('tile_f', [-1, 4, 16, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 4]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4137008
-    No: 18  GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 16, 1, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 128, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7962299
+    No: 19  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2142,9 +2008,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 128, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4947191
-    No: 19  GFLOPS: 182.70/248.27   result: MeasureResult(costs=(0.001267116582278481,), error_no=MeasureErrorNo.NO_ERROR, all_cost=7.761154413223267, timestamp=1667865870.6874251)        [('tile_f', [-1, 1, 2, 8]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 64, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9122443
-    No: 20  GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 32, 8, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 8, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2142884
+    No: 20  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2266,7 +2131,7 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 8, 1, 64]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 256]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,190063
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 32, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8647701
 
 
 
@@ -2321,9 +2186,9 @@ and measure running time.
     Finish loading 20 records
 
     Best config:
-    [('tile_f', [-1, 1, 64, 2]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 4]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,653714
+    [('tile_f', [-1, 1, 16, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1677314
     Finish loading 20 records
-    Time cost of this operator: 0.001150
+    Time cost of this operator: 0.001912
 
 
 
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
index 310fbdfa5f..e213853b41 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
@@ -327,10 +327,10 @@ Timing the untuned program
     ########## Build without Autotuning ##########
     Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  Measurements(us)  
     ---------                                     ---                                           --------  -------  -----              ------  -------  ----------------  
-    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  310.4     98.711   (1, 2, 10, 10, 3)  2       1        [310.4]           
-    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.087     0.982    (1, 6, 10, 10)     1       1        [3.087]           
-    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.966     0.307    (1, 1, 10, 10, 3)  1       1        [0.966]           
-    Total_time                                    -                                             314.453   -        -                  -       -        -                 
+    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  311.3     98.729   (1, 2, 10, 10, 3)  2       1        [311.3]           
+    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.017     0.957    (1, 6, 10, 10)     1       1        [3.017]           
+    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.991     0.314    (1, 1, 10, 10, 3)  1       1        [0.991]           
+    Total_time                                    -                                             315.308   -        -                  -       -        -                 
 
 
 
@@ -394,10 +394,10 @@ Timing the tuned program
     ########## Build with Autotuning ##########
     Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  Measurements(us)  
     ---------                                     ---                                           --------  -------  -----              ------  -------  ----------------  
-    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  104.9     97.537   (1, 6, 10, 10, 1)  2       1        [104.9]           
-    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.795     1.669    (1, 6, 10, 10)     1       1        [1.795]           
-    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.854     0.794    (1, 3, 10, 10, 1)  1       1        [0.854]           
-    Total_time                                    -                                             107.549   -        -                  -       -        -                 
+    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  104.8     97.581   (1, 6, 10, 10, 1)  2       1        [104.8]           
+    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.759     1.638    (1, 6, 10, 10)     1       1        [1.759]           
+    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.839     0.782    (1, 3, 10, 10, 1)  1       1        [0.839]           
+    Total_time                                    -                                             107.398   -        -                  -       -        -                 
 
 
 
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
index cece0aa9b1..09cc043115 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
@@ -225,7 +225,7 @@ take about **2 minutes** to download the Stanford Cars, while COCO 2017 validati
  .. code-block:: none
 
 
-    '/tmp/tmp1s4ra1nk/images/random'
+    '/tmp/tmpc_z4g3ef/images/random'
 
 
 
@@ -316,7 +316,7 @@ objects to other stuff? We can display some examples from our datasets using ``m
 
 
 .. image-sg:: /how_to/work_with_microtvm/images/sphx_glr_micro_train_001.png
-   :alt: [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]
+   :alt: [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]
    :srcset: /how_to/work_with_microtvm/images/sphx_glr_micro_train_001.png
    :class: sphx-glr-single-img
 
@@ -325,8 +325,8 @@ objects to other stuff? We can display some examples from our datasets using ``m
 
  .. code-block:: none
 
-    /tmp/tmp1s4ra1nk/images/target contains 8144 images
-    /tmp/tmp1s4ra1nk/images/random contains 5000 images
+    /tmp/tmpc_z4g3ef/images/target contains 8144 images
+    /tmp/tmpc_z4g3ef/images/random contains 5000 images
 
 
 
@@ -501,13 +501,13 @@ the time on our validation set).
  .. code-block:: none
 
     Epoch 1/3
-    328/328 - 48s - loss: 0.2227 - accuracy: 0.9201 - val_loss: 0.1678 - val_accuracy: 0.9411 - 48s/epoch - 147ms/step
+    328/328 - 47s - loss: 0.2142 - accuracy: 0.9254 - val_loss: 0.1351 - val_accuracy: 0.9539 - 47s/epoch - 142ms/step
     Epoch 2/3
-    328/328 - 43s - loss: 0.1024 - accuracy: 0.9609 - val_loss: 0.1075 - val_accuracy: 0.9573 - 43s/epoch - 132ms/step
+    328/328 - 43s - loss: 0.0911 - accuracy: 0.9666 - val_loss: 0.1195 - val_accuracy: 0.9622 - 43s/epoch - 132ms/step
     Epoch 3/3
-    328/328 - 43s - loss: 0.0729 - accuracy: 0.9735 - val_loss: 0.1108 - val_accuracy: 0.9600 - 43s/epoch - 132ms/step
+    328/328 - 43s - loss: 0.0675 - accuracy: 0.9757 - val_loss: 0.1146 - val_accuracy: 0.9607 - 43s/epoch - 131ms/step
 
-    <keras.callbacks.History object at 0x7fd5ccb95e10>
+    <keras.callbacks.History object at 0x7fb56c504290>
 
 
 
@@ -864,7 +864,7 @@ Arduino tutorial for how to do that `on GitHub <https://github.com/guberti/tvm-a
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 4 minutes  56.709 seconds)
+   **Total running time of the script:** ( 4 minutes  54.885 seconds)
 
 
 .. _sphx_glr_download_how_to_work_with_microtvm_micro_train.py:
diff --git a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
index 6dc9fcd887..11decf7792 100644
--- a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
@@ -5,16 +5,16 @@
 
 Computation times
 =================
-**05:58.685** total execution time for **how_to_work_with_microtvm** files:
+**05:56.380** total execution time for **how_to_work_with_microtvm** files:
 
 +---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``)               | 04:56.709 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``)               | 04:54.885 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``)         | 00:49.857 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``)         | 00:49.642 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py` (``micro_aot.py``)                   | 00:08.347 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py` (``micro_aot.py``)                   | 00:08.090 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``)             | 00:03.771 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``)             | 00:03.762 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_work_with_microtvm_micro_ethosu.py` (``micro_ethosu.py``)             | 00:00.001 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
index 927a58037c..2ddf71567b 100644
--- a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
 
 Computation times
 =================
-**00:44.507** total execution time for **how_to_work_with_relay** files:
+**00:43.969** total execution time for **how_to_work_with_relay** files:
 
 +----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_pipeline_executor.py` (``using_pipeline_executor.py``) | 00:32.626 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_pipeline_executor.py` (``using_pipeline_executor.py``) | 00:32.087 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``)           | 00:10.275 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``)           | 00:10.306 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``)                             | 00:01.598 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``)                             | 00:01.569 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_work_with_relay_using_relay_viz.py` (``using_relay_viz.py``)                 | 00:00.007 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
index 807daf5c16..27ecd85ddc 100644
--- a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
@@ -261,7 +261,7 @@ The following example customizes CUDA lowering rule for :code:`exp`.
  .. code-block:: none
 
 
-    <function my_cuda_math_rule at 0x7fd5c5158a70>
+    <function my_cuda_math_rule at 0x7fb51ba909e0>
 
 
 
diff --git a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
index 5ce19500eb..6b3208f82b 100644
--- a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
@@ -5,20 +5,20 @@
 
 Computation times
 =================
-**00:07.545** total execution time for **how_to_work_with_schedules** files:
+**00:06.612** total execution time for **how_to_work_with_schedules** files:
 
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``)                 | 00:05.160 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``)                 | 00:04.187 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``)                     | 00:01.050 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``)                     | 00:01.127 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``)                     | 00:00.573 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``)                     | 00:00.552 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``)                               | 00:00.548 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``)                               | 00:00.532 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``)                     | 00:00.115 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``)                     | 00:00.117 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``) | 00:00.051 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``) | 00:00.050 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``)                               | 00:00.029 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
index e5baa932a0..b7750c9272 100644
--- a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
@@ -347,7 +347,7 @@ The importing needs to happen before the tensorized GEMV being executed.
                  C: Buffer(C_2: Pointer(float32), float32, [524288], [])}
       buffer_map = {A_1: A, B_1: B, C_1: C}
       preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [1024, 64], []), B_1: B_3: Buffer(B_2, float32, [512, 64], []), C_1: C_3: Buffer(C_2, float32, [1024, 512], [])} {
-      attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmp5h3amj8c/input0.cc'\nsource_filename = \"/tmp/tmp5h3amj8c/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = alloca float*, align 8\n  %8 = alloca float*, align 8\n  %9 = alloca floa [...]
+      attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmphe7nb_dg/input0.cc'\nsource_filename = \"/tmp/tmphe7nb_dg/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = alloca float*, align 8\n  %8 = alloca float*, align 8\n  %9 = alloca floa [...]
       for (i, 0, 1024) {
         for (j.outer: int32, 0, 32) {
           @tir.call_extern("gemv_update", @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_2, ((i*512) + (j.outer*16)), 16, 2, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_2, (i*64), 64, 1, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_2, (j.outer*1024), 1024, 1, dtype=handle), 16, 64, 64, dtype=int32)
diff --git a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
index a6b3c778a1..5f5af721fc 100644
--- a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:26.920** total execution time for **topic_vta_tutorials_autotvm** files:
+**00:26.376** total execution time for **topic_vta_tutorials_autotvm** files:
 
 +---------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:26.914 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:26.370 | 0.0 MB |
 +---------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``)     | 00:00.006 | 0.0 MB |
 +---------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
index bc724bdedb..2da9ed211b 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
@@ -289,7 +289,7 @@ The compilation steps are:
       DeprecationWarning,
     /workspace/vta/tutorials/frontend/deploy_classification.py:213: DeprecationWarning: legacy graph executor behavior of producing json / lib / params will be removed in the next release. Please see documents of tvm.contrib.graph_executor.GraphModule for the  new recommended usage.
       relay_prog, target=tvm.target.Target(target, host=env.target_host), params=params
-    resnet18_v1 inference graph built in 30.17s!
+    resnet18_v1 inference graph built in 29.21s!
 
 
 
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
index b3dc91ee00..d92700020e 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
@@ -333,7 +333,7 @@ The compilation steps are:
 
     /workspace/python/tvm/relay/build_module.py:348: DeprecationWarning: Please use input parameter mod (tvm.IRModule) instead of deprecated parameter mod (tvm.relay.function.Function)
       DeprecationWarning,
-    yolov3-tiny inference graph built in 20.17s!
+    yolov3-tiny inference graph built in 19.68s!
 
 
 
diff --git a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
index aa032ce639..f4dc037116 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**01:42.906** total execution time for **topic_vta_tutorials_frontend** files:
+**01:41.256** total execution time for **topic_vta_tutorials_frontend** files:
 
 +------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``)           | 00:52.783 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``)           | 00:52.033 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:50.123 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:49.223 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
index 6883b75667..96444fd24f 100644
--- a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:03.132** total execution time for **topic_vta_tutorials_optimize** files:
+**00:03.077** total execution time for **topic_vta_tutorials_optimize** files:
 
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``)         | 00:02.690 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``)         | 00:02.641 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.442 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.436 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
index 71744607a6..8a35bdacd7 100644
--- a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:00.783** total execution time for **topic_vta_tutorials** files:
+**00:00.774** total execution time for **topic_vta_tutorials** files:
 
 +---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.420 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.415 | 0.0 MB |
 +---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.363 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.359 | 0.0 MB |
 +---------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
index a7675d577f..7fbeb3b0eb 100644
--- a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
@@ -326,7 +326,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 97.253 ms
+    Execution time of this operator: 96.644 ms
 
 
 
@@ -444,7 +444,7 @@ operations.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  29.291 seconds)
+   **Total running time of the script:** ( 1 minutes  14.656 seconds)
 
 
 .. _sphx_glr_download_tutorial_auto_scheduler_matmul_x86.py:
diff --git a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
index bc0603c5c1..36e4db9afd 100644
--- a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
@@ -450,16 +450,16 @@ reduce variance, we take 5 measurements and average them.
     waiting for device...
     device available
     Get devices for measurement successfully!
-    No: 1   GFLOPS: 1.91/1.91       result: MeasureResult(costs=(0.140509768,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.391103744506836, timestamp=1667864448.602197)  [('tile_y', [-1, 2]), ('tile_x', [-1, 2])],None,11
-    No: 2   GFLOPS: 13.32/13.32     result: MeasureResult(costs=(0.0201473346,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5983037948608398, timestamp=1667864449.1324494)       [('tile_y', [-1, 1]), ('tile_x', [-1, 64])],None,60
-    No: 3   GFLOPS: 3.91/13.32      result: MeasureResult(costs=(0.0686360044,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.286271333694458, timestamp=1667864451.1571248)        [('tile_y', [-1, 32]), ('tile_x', [-1, 16])],None,45
-    No: 4   GFLOPS: 14.34/14.34     result: MeasureResult(costs=(0.018723097799999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.48362231254577637, timestamp=1667864451.6207647)      [('tile_y', [-1, 32]), ('tile_x', [-1, 64])],None,65
-    No: 5   GFLOPS: 14.57/14.57     result: MeasureResult(costs=(0.0184214082,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7201747894287109, timestamp=1667864452.4545915)       [('tile_y', [-1, 64]), ('tile_x', [-1, 64])],None,66
-    No: 6   GFLOPS: 3.64/14.57      result: MeasureResult(costs=(0.073792032,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.3395512104034424, timestamp=1667864454.5698469)        [('tile_y', [-1, 128]), ('tile_x', [-1, 16])],None,47
-    No: 7   GFLOPS: 0.51/14.57      result: MeasureResult(costs=(0.5291343239999999,), error_no=MeasureErrorNo.NO_ERROR, all_cost=8.613178014755249, timestamp=1667864463.9545631)  [('tile_y', [-1, 128]), ('tile_x', [-1, 1])],None,7
-    No: 8   GFLOPS: 11.94/14.57     result: MeasureResult(costs=(0.0224736606,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5735018253326416, timestamp=1667864464.5261083)       [('tile_y', [-1, 32]), ('tile_x', [-1, 256])],None,85
-    No: 9   GFLOPS: 1.93/14.57      result: MeasureResult(costs=(0.1392146204,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.3448517322540283, timestamp=1667864466.9855354)       [('tile_y', [-1, 8]), ('tile_x', [-1, 2])],None,13
-    No: 10  GFLOPS: 9.37/14.57      result: MeasureResult(costs=(0.028638337799999997,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5769240856170654, timestamp=1667864467.6060805)       [('tile_y', [-1, 2]), ('tile_x', [-1, 64])],None,61
+    No: 1   GFLOPS: 10.31/10.31     result: MeasureResult(costs=(0.026031847199999996,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6412060260772705, timestamp=1667865136.0527933)       [('tile_y', [-1, 8]), ('tile_x', [-1, 64])],None,63
+    No: 2   GFLOPS: 8.59/10.31      result: MeasureResult(costs=(0.031239819000000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.8587596416473389, timestamp=1667865137.47197) [('tile_y', [-1, 16]), ('tile_x', [-1, 64])],None,64
+    No: 3   GFLOPS: 3.91/10.31      result: MeasureResult(costs=(0.0687364968,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.3036270141601562, timestamp=1667865138.7363503)       [('tile_y', [-1, 64]), ('tile_x', [-1, 16])],None,46
+    No: 4   GFLOPS: 2.08/10.31      result: MeasureResult(costs=(0.12894091820000003,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.21873140335083, timestamp=1667865141.7276092)  [('tile_y', [-1, 256]), ('tile_x', [-1, 4])],None,28
+    No: 5   GFLOPS: 0.50/10.31      result: MeasureResult(costs=(0.5374213431999999,), error_no=MeasureErrorNo.NO_ERROR, all_cost=8.70977234840393, timestamp=1667865150.6200793)   [('tile_y', [-1, 32]), ('tile_x', [-1, 1])],None,5
+    No: 6   GFLOPS: 9.04/10.31      result: MeasureResult(costs=(0.0297041796,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6853735446929932, timestamp=1667865151.26007) [('tile_y', [-1, 16]), ('tile_x', [-1, 32])],None,54
+    No: 7   GFLOPS: 12.79/12.79     result: MeasureResult(costs=(0.020995929599999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.47757792472839355, timestamp=1667865152.530675)       [('tile_y', [-1, 32]), ('tile_x', [-1, 512])],None,95
+    No: 8   GFLOPS: 2.11/12.79      result: MeasureResult(costs=(0.12734770759999997,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.240013360977173, timestamp=1667865154.7778563) [('tile_y', [-1, 128]), ('tile_x', [-1, 4])],None,27
+    No: 9   GFLOPS: 1.30/12.79      result: MeasureResult(costs=(0.2070525312,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.417884349822998, timestamp=1667865158.3209898)        [('tile_y', [-1, 2]), ('tile_x', [-1, 1])],None,1
+    No: 10  GFLOPS: 10.02/12.79     result: MeasureResult(costs=(0.026785170199999996,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5488584041595459, timestamp=1667865158.915134)        [('tile_y', [-1, 2]), ('tile_x', [-1, 64])],None,61
 
 
 
diff --git a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
index ea43bd7bba..7f5c2527f5 100644
--- a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
@@ -320,7 +320,7 @@ standard deviation.
 
  .. code-block:: none
 
-    {'mean': 516.7721257999995, 'median': 516.1846908499967, 'std': 2.6758526756169507}
+    {'mean': 519.3600799600063, 'median': 519.3016042000181, 'std': 1.4946292799988907}
 
 
 
@@ -554,30 +554,31 @@ the tuning data to.
 
  .. code-block:: none
 
-
    [Task  1/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  1/25]  Current/Best:   15.00/  22.98 GFLOPS | Progress: (4/20) | 6.86 s
    [Task  1/25]  Current/Best:   15.07/  23.06 GFLOPS | Progress: (8/20) | 9.95 s
    [Task  1/25]  Current/Best:   22.51/  23.07 GFLOPS | Progress: (12/20) | 11.62 s
    [Task  1/25]  Current/Best:   15.96/  23.53 GFLOPS | Progress: (16/20) | 14.93 s
    [Task  1/25]  Current/Best:   13.46/  23.53 GFLOPS | Progress: (20/20) | 17.76 s Done.
-
    [Task  2/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  2/25]  Current/Best:   19.40/  19.40 GFLOPS | Progress: (4/20) | 2.54 s
    [Task  2/25]  Current/Best:   12.19/  19.40 GFLOPS | Progress: (8/20) | 3.89 s
    [Task  2/25]  Current/Best:   17.75/  19.40 GFLOPS | Progress: (12/20) | 5.11 s
    [Task  2/25]  Current/Best:   16.87/  19.40 GFLOPS | Progress: (16/20) | 6.18 s
    [Task  2/25]  Current/Best:    7.93/  19.40 GFLOPS | Progress: (20/20) | 7.89 s Done.
-
    [Task  3/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  3/25]  Current/Best:   12.49/  17.46 GFLOPS | Progress: (4/20) | 3.49 s
    [Task  3/25]  Current/Best:   21.00/  21.00 GFLOPS | Progress: (8/20) | 5.55 s
    [Task  3/25]  Current/Best:   15.38/  21.00 GFLOPS | Progress: (12/20) | 7.43 s
    [Task  3/25]  Current/Best:    6.93/  21.00 GFLOPS | Progress: (16/20) | 9.89 s
    [Task  3/25]  Current/Best:   12.79/  22.53 GFLOPS | Progress: (20/20) | 11.83 s Done.
-
    [Task  4/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  4/25]  Current/Best:    7.82/  13.17 GFLOPS | Progress: (4/20) | 4.45 s
    [Task  4/25]  Current/Best:    5.61/  16.02 GFLOPS | Progress: (8/20) | 15.36 s
    [Task  4/25]  Current/Best:    9.46/  17.21 GFLOPS | Progress: (12/20) | 21.78 s
    [Task  4/25]  Current/Best:   12.14/  17.46 GFLOPS | Progress: (16/20) | 24.33 s
    [Task  4/25]  Current/Best:   12.05/  17.46 GFLOPS | Progress: (20/20) | 26.56 s Done.
-
    [Task  5/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  5/25]  Current/Best:   14.40/  17.49 GFLOPS | Progress: (4/20) | 3.19 s
    [Task  5/25]  Current/Best:   11.16/  17.49 GFLOPS | Progress: (8/20) | 5.57 s
    [Task  5/25]  Current/Best:    5.75/  17.49 GFLOPS | Progress: (12/20) | 7.56 s
    [Task  5/25]  Current/Best:    3.21/  17.49 GFLOPS | Progress: (16/20) | 9.45 s
    [Task  5/25]  Current/Best:    3.33/  17.49 GFLOPS | Progress: (20/20) | 11.27 s Done.
-
    [Task  6/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  6/25]  Current/Best:   12.03/  20.59 GFLOPS | Progress: (4/20) | 3.85 s
    [Task  6/25]  Current/Best:   10.54/  20.59 GFLOPS | Progress: (8/20) | 8.30 s
    [Task  6/25]  Current/Best:    6.99/  20.59 GFLOPS | Progress: (12/20) | 10.98 s
    [Task  6/25]  Current/Best:    6.00/  20.59 GFLOPS | Progress: (16/20) | 13.90 s
    [Task  6/25]  Current/Best:    5.88/  20.59 GFLOPS | Progress: (20/20) | 16.70 s Done.
-
    [Task  7/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  7/25]  Current/Best:   12.32/  17.42 GFLOPS | Progress: (4/20) | 3.58 s
    [Task  7/25]  Current/Best:    6.08/  17.42 GFLOPS | Progress: (8/20) | 6.23 s
    [Task  7/25]  Current/Best:   18.48/  18.48 GFLOPS | Progress: (12/20) | 8.20 s
    [Task  7/25]  Current/Best:    3.14/  18.48 GFLOPS | Progress: (16/20) | 10.67 s
    [Task  7/25]  Current/Best:    9.86/  20.39 GFLOPS | Progress: (20/20) | 14.07 s Done.
-
    [Task  8/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  8/25]  Current/Best:   10.08/  17.27 GFLOPS | Progress: (4/20) | 11.21 s
    [Task  8/25]  Current/Best:   16.55/  17.27 GFLOPS | Progress: (8/20) | 14.58 s
    [Task  8/25]  Current/Best:    4.16/  17.27 GFLOPS | Progress: (12/20) | 17.95 s
    [Task  8/25]  Current/Best:   10.39/  19.45 GFLOPS | Progress: (16/20) | 21.56 s
    [Task  8/25]  Current/Best:    4.99/  19.45 GFLOPS | Progress: (20/20) | 28.00 s Done.
-
    [Task  9/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  9/25]  Current/Best:    3.20/  18.30 GFLOPS | Progress: (4/20) | 12.64 s
    [Task  9/25]  Current/Best:   11.30/  18.30 GFLOPS | Progress: (8/20) | 23.72 s
    [Task  9/25]  Current/Best:   12.27/  18.30 GFLOPS | Progress: (12/20) | 25.70 s
    [Task  9/25]  Current/Best:    5.14/  18.30 GFLOPS | Progress: (16/20) | 30.74 s
    [Task  9/25]  Current/Best:   17.64/  20.37 GFLOPS | Progress: (20/20) | 32.37 s
    [Task 10/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 10/25]  Current/Best:    5.60/  16.53 GFLOPS | Progress: (4/20) | 3.07 s
    [Task 10/25]  Current/Best:   11.21/  17.76 GFLOPS | Progress: (8/20) | 5.00 s
    [Task 10/25]  Current/Best:    3.08/  17.76 GFLOPS | Progress: (12/20) | 6.83 s
    [Task 10/25]  Current/Best:   18.12/  18.12 GFLOPS | Progress: (16/20) | 9.15 s
    [Task 10/25]  Current/Best:   10.62/  18.12 GFLOPS | Progress: (20/20
 ) | 10.72 s Done.
-
    [Task 11/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 11/25]  Current/Best:    9.97/  12.24 GFLOPS | Progress: (4/20) | 4.01 s
    [Task 11/25]  Current/Best:    6.09/  18.27 GFLOPS | Progress: (8/20) | 6.74 s
    [Task 11/25]  Current/Best:   10.63/  18.77 GFLOPS | Progress: (12/20) | 9.07 s
    [Task 11/25]  Current/Best:   18.00/  18.77 GFLOPS | Progress: (16/20) | 11.21 s
    [Task 11/25]  Current/Best:   18.88/  21.74 GFLOPS | Progress: (20/20) | 13.25 s Done.
-
    [Task 12/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 12/25]  Current/Best:   11.31/  16.17 GFLOPS | Progress: (4/20) | 3.45 s
    [Task 12/25]  Current/Best:   11.08/  17.90 GFLOPS | Progress: (8/20) | 6.80 s
    [Task 12/25]  Current/Best:    4.94/  17.90 GFLOPS | Progress: (12/20) | 9.74 s
    [Task 12/25]  Current/Best:   10.68/  19.64 GFLOPS | Progress: (16/20) | 11.43 s
    [Task 12/25]  Current/Best:   13.29/  19.64 GFLOPS | Progress: (20/20) | 14.07 s Done.
-
    [Task 13/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 13/25]  Current/Best:   12.16/  23.05 GFLOPS | Progress: (4/20) | 3.61 s
    [Task 13/25]  Current/Best:   12.12/  23.05 GFLOPS | Progress: (8/20) | 5.75 s
    [Task 13/25]  Current/Best:   16.11/  23.05 GFLOPS | Progress: (12/20) | 7.47 s
    [Task 13/25]  Current/Best:   12.86/  23.05 GFLOPS | Progress: (16/20) | 9.31 s
    [Task 13/25]  Current/Best:   16.21/  23.05 GFLOPS | Progress: (20/20) | 11.26 s Done.
-
    [Task 14/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 14/25]  Current/Best:   12.00/  18.35 GFLOPS | Progress: (4/20) | 3.45 s
    [Task 14/25]  Current/Best:   14.21/  18.35 GFLOPS | Progress: (8/20) | 5.87 s Done.
-
    [Task 14/25]  Current/Best:   12.75/  18.74 GFLOPS | Progress: (12/20) | 12.60 s
    [Task 14/25]  Current/Best:   19.86/  22.43 GFLOPS | Progress: (16/20) | 14.27 s
    [Task 14/25]  Current/Best:   15.73/  22.43 GFLOPS | Progress: (20/20) | 17.46 s Done.
-
    [Task 15/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 15/25]  Current/Best:   19.18/  19.18 GFLOPS | Progress: (4/20) | 4.30 s
    [Task 15/25]  Current/Best:    3.00/  19.66 GFLOPS | Progress: (8/20) | 6.04 s
    [Task 15/25]  Current/Best:   14.32/  19.66 GFLOPS | Progress: (12/20) | 8.57 s
    [Task 15/25]  Current/Best:   18.86/  21.22 GFLOPS | Progress: (16/20) | 10.16 s
    [Task 15/25]  Current/Best:   10.19/  21.53 GFLOPS | Progress: (20/20) | 15.23 s
    [Task 16/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 16/25]  Current/Best:   20.12/  20.12 GFLOPS | Progress: (4/20) | 4.06 s
    [Task 16/25]  Current/Best:   14.73/  21.59 GFLOPS | Progress: (8/20) | 5.22 s
    [Task 16/25]  Current/Best:    7.70/  21.59 GFLOPS | Progress: (12/20) | 6.64 s
    [Task 16/25]  Current/Best:   17.01/  21.59 GFLOPS | Progress: (16/20) | 8.40 s
    [Task 16/25]  Current/Best:   18.59/  21.59 GFLOPS | Progress: (20/20) |
  10.11 s Done.
-
    [Task 17/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 17/25]  Current/Best:    3.10/  17.24 GFLOPS | Progress: (4/20) | 4.34 s
    [Task 17/25]  Current/Best:    3.06/  17.24 GFLOPS | Progress: (8/20) | 8.38 s
    [Task 17/25]  Current/Best:   10.17/  17.24 GFLOPS | Progress: (12/20) | 11.22 s
    [Task 17/25]  Current/Best:   16.16/  22.52 GFLOPS | Progress: (16/20) | 14.13 s
    [Task 17/25]  Current/Best:   12.15/  22.52 GFLOPS | Progress: (20/20) | 16.18 s Done.
-
    [Task 18/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 18/25]  Current/Best:    7.09/  12.39 GFLOPS | Progress: (4/20) | 5.98 s
    [Task 18/25]  Current/Best:   14.53/  16.77 GFLOPS | Progress: (8/20) | 7.78 s
    [Task 18/25]  Current/Best:   11.53/  20.96 GFLOPS | Progress: (12/20) | 11.24 s
    [Task 18/25]  Current/Best:   15.89/  20.96 GFLOPS | Progress: (16/20) | 13.48 s
    [Task 18/25]  Current/Best:    7.45/  20.96 GFLOPS | Progress: (20/20) | 16.93 s Done.
-
    [Task 19/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 19/25]  Current/Best:   19.24/  19.24 GFLOPS | Progress: (4/20) | 4.41 s
    [Task 19/25]  Current/Best:    9.82/  19.24 GFLOPS | Progress: (8/20) | 7.92 s
    [Task 19/25]  Current/Best:   10.45/  19.65 GFLOPS | Progress: (12/20) | 10.50 s
    [Task 19/25]  Current/Best:    9.48/  19.65 GFLOPS | Progress: (16/20) | 14.60 s
    [Task 19/25]  Current/Best:    2.69/  19.70 GFLOPS | Progress: (20/20) | 17.29 s Done.
-
    [Task 20/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 20/25]  Current/Best:   10.51/  12.50 GFLOPS | Progress: (4/20) | 3.58 s
    [Task 20/25]  Current/Best:   17.11/  19.48 GFLOPS | Progress: (8/20) | 6.00 s
    [Task 20/25]  Current/Best:   16.63/  19.48 GFLOPS | Progress: (12/20) | 8.95 s
    [Task 20/25]  Current/Best:   14.56/  19.48 GFLOPS | Progress: (16/20) | 10.87 s
    [Task 20/25]  Current/Best:    5.19/  19.48 GFLOPS | Progress: (20/20) | 13.61 s
    [Task 21/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 21/25]  Current/Best:   17.94/  19.73 GFLOPS | Progress: (4/20) | 4.71 s
    [Task 21/25]  Current/Best:   10.15/  20.31 GFLOPS | Progress: (8/20) | 7.32 s
    [Task 21/25]  Current/Best:   15.52/  20.31 GFLOPS | Progress: (12/20) | 9.22 s Done.
+
    [Task  1/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  1/25]  Current/Best:    8.34/  19.23 GFLOPS | Progress: (4/20) | 8.26 s
    [Task  1/25]  Current/Best:    3.20/  19.23 GFLOPS | Progress: (8/20) | 10.55 s
    [Task  1/25]  Current/Best:   12.66/  22.00 GFLOPS | Progress: (12/20) | 12.46 s
    [Task  1/25]  Current/Best:    7.73/  22.00 GFLOPS | Progress: (16/20) | 15.63 s
    [Task  1/25]  Current/Best:    7.11/  22.00 GFLOPS | Progress: (20/20) | 17.76 s Done.
+
    [Task  2/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  2/25]  Current/Best:   14.99/  22.08 GFLOPS | Progress: (4/20) | 2.71 s
    [Task  2/25]  Current/Best:    5.49/  22.08 GFLOPS | Progress: (8/20) | 4.51 s
    [Task  2/25]  Current/Best:    5.59/  22.08 GFLOPS | Progress: (12/20) | 6.34 s
    [Task  2/25]  Current/Best:   13.66/  22.08 GFLOPS | Progress: (16/20) | 8.09 s
    [Task  2/25]  Current/Best:    6.43/  22.08 GFLOPS | Progress: (20/20) | 10.96 s Done.
+
    [Task  3/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  3/25]  Current/Best:   22.83/  22.83 GFLOPS | Progress: (4/20) | 3.55 s
    [Task  3/25]  Current/Best:   17.22/  22.83 GFLOPS | Progress: (8/20) | 5.41 s
    [Task  3/25]  Current/Best:   23.95/  23.95 GFLOPS | Progress: (12/20) | 7.13 s
    [Task  3/25]  Current/Best:    9.21/  23.95 GFLOPS | Progress: (16/20) | 9.19 s
    [Task  3/25]  Current/Best:   16.91/  23.95 GFLOPS | Progress: (20/20) | 11.55 s Done.
+
    [Task  4/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  4/25]  Current/Best:   12.96/  17.43 GFLOPS | Progress: (4/20) | 3.38 s
    [Task  4/25]  Current/Best:    3.40/  17.43 GFLOPS | Progress: (8/20) | 8.10 s
    [Task  4/25]  Current/Best:   14.59/  17.43 GFLOPS | Progress: (12/20) | 14.43 s
    [Task  4/25]  Current/Best:    6.79/  17.43 GFLOPS | Progress: (16/20) | 17.60 s
    [Task  4/25]  Current/Best:    6.47/  17.43 GFLOPS | Progress: (20/20) | 19.18 s Done.
+
    [Task  5/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  5/25]  Current/Best:   11.06/  18.53 GFLOPS | Progress: (4/20) | 3.24 s
    [Task  5/25]  Current/Best:   11.17/  18.53 GFLOPS | Progress: (8/20) | 5.84 s
    [Task  5/25]  Current/Best:    4.63/  18.53 GFLOPS | Progress: (12/20) | 7.84 s
    [Task  5/25]  Current/Best:    4.23/  18.53 GFLOPS | Progress: (16/20) | 9.35 s
    [Task  5/25]  Current/Best:    7.71/  18.53 GFLOPS | Progress: (20/20) | 11.20 s Done.
+
    [Task  6/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  6/25]  Current/Best:   18.09/  19.91 GFLOPS | Progress: (4/20) | 4.07 s
    [Task  6/25]  Current/Best:   20.52/  20.52 GFLOPS | Progress: (8/20) | 6.56 s
    [Task  6/25]  Current/Best:    5.78/  20.52 GFLOPS | Progress: (12/20) | 9.14 s
    [Task  6/25]  Current/Best:   13.86/  20.52 GFLOPS | Progress: (16/20) | 11.56 s
    [Task  6/25]  Current/Best:   10.93/  20.52 GFLOPS | Progress: (20/20) | 13.46 s Done.
+
    [Task  7/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  7/25]  Current/Best:   12.78/  12.78 GFLOPS | Progress: (4/20) | 4.22 s
    [Task  7/25]  Current/Best:   14.16/  14.16 GFLOPS | Progress: (8/20) | 8.59 s
    [Task  7/25]  Current/Best:   15.25/  15.25 GFLOPS | Progress: (12/20) | 11.06 s
    [Task  7/25]  Current/Best:   15.47/  15.55 GFLOPS | Progress: (16/20) | 13.77 s
    [Task  7/25]  Current/Best:   21.98/  21.98 GFLOPS | Progress: (20/20) | 15.44 s Done.
+
    [Task  8/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  8/25]  Current/Best:    5.14/  13.49 GFLOPS | Progress: (4/20) | 8.14 s
    [Task  8/25]  Current/Best:   13.04/  14.40 GFLOPS | Progress: (8/20) | 11.55 s
    [Task  8/25]  Current/Best:   12.79/  14.40 GFLOPS | Progress: (12/20) | 20.72 s
    [Task  8/25]  Current/Best:   15.98/  15.98 GFLOPS | Progress: (16/20) | 25.84 s
    [Task  8/25]  Current/Best:   11.34/  15.98 GFLOPS | Progress: (20/20) | 31.30 s Done.
+
    [Task  9/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  9/25]  Current/Best:   10.55/  17.01 GFLOPS | Progress: (4/20) | 4.50 s
    [Task  9/25]  Current/Best:   14.12/  17.01 GFLOPS | Progress: (8/20) | 9.30 s
    [Task  9/25]  Current/Best:   11.30/  21.35 GFLOPS | Progress: (12/20) | 13.20 s
    [Task  9/25]  Current/Best:    6.81/  21.35 GFLOPS | Progress: (16/20) | 15.12 s
    [Task  9/25]  Current/Best:   20.31/  21.35 GFLOPS | Progress: (20/20) | 26.17 s
    [Task 10/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 10/25]  Current/Best:   10.20/  19.15 GFLOPS | Progress: (4/20) | 4.53 s
    [Task 10/25]  Current/Best:   12.40/  19.15 GFLOPS | Progress: (8/20) | 6.09 s
    [Task 10/25]  Current/Best:    9.38/  19.15 GFLOPS | Progress: (12/20) | 8.94 s
    [Task 10/25]  Current/Best:    5.15/  19.15 GFLOPS | Progress: (16/20) | 10.99 s
    [Task 10/25]  Current/Best:    2.64/  19.15 GFLOPS | Progress: (20/20)
  | 12.97 s Done.
+
    [Task 11/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 11/25]  Current/Best:   11.79/  16.96 GFLOPS | Progress: (4/20) | 4.29 s
    [Task 11/25]  Current/Best:    3.13/  16.96 GFLOPS | Progress: (8/20) | 6.92 s
    [Task 11/25]  Current/Best:    3.11/  18.21 GFLOPS | Progress: (12/20) | 10.05 s
    [Task 11/25]  Current/Best:   12.27/  18.21 GFLOPS | Progress: (16/20) | 12.73 s
    [Task 11/25]  Current/Best:   17.76/  18.34 GFLOPS | Progress: (20/20) | 15.43 s Done.
+
    [Task 12/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 12/25]  Current/Best:   20.30/  20.30 GFLOPS | Progress: (4/20) | 5.06 s
    [Task 12/25]  Current/Best:    9.67/  20.30 GFLOPS | Progress: (8/20) | 6.65 s
    [Task 12/25]  Current/Best:   12.71/  20.30 GFLOPS | Progress: (12/20) | 9.97 s
    [Task 12/25]  Current/Best:   14.50/  20.30 GFLOPS | Progress: (16/20) | 14.31 s
    [Task 12/25]  Current/Best:   15.98/  20.30 GFLOPS | Progress: (20/20) | 16.18 s Done.
+
    [Task 13/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 13/25]  Current/Best:   17.27/  17.27 GFLOPS | Progress: (4/20) | 3.91 s
    [Task 13/25]  Current/Best:    9.94/  19.57 GFLOPS | Progress: (8/20) | 6.54 s
    [Task 13/25]  Current/Best:    6.11/  21.67 GFLOPS | Progress: (12/20) | 9.63 s
    [Task 13/25]  Current/Best:    5.71/  21.67 GFLOPS | Progress: (16/20) | 12.93 s
    [Task 13/25]  Current/Best:   15.75/  21.67 GFLOPS | Progress: (20/20) | 14.86 s Done.
+
    [Task 14/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 14/25]  Current/Best:   12.36/  18.32 GFLOPS | Progress: (4/20) | 4.92 s
    [Task 14/25]  Current/Best:    9.27/  18.32 GFLOPS | Progress: (8/20) | 6.95 s
    [Task 14/25]  Current/Best:   14.14/  18.32 GFLOPS | Progress: (12/20) | 10.35 s
    [Task 14/25]  Current/Best:    5.02/  18.32 GFLOPS | Progress: (16/20) | 13.09 s
    [Task 14/25]  Current/Best:   15.90/  18.32 GFLOPS | Progress: (20/20) | 15.03 s
    [Task 15/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s Done.
      Done.
-
    [Task 21/25]  Current/Best:    9.08/  20.31 GFLOPS | Progress: (16/20) | 14.61 s
    [Task 21/25]  Current/Best:   11.79/  20.31 GFLOPS | Progress: (20/20) | 17.50 s
    [Task 22/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 22/25]  Current/Best:   10.31/  13.35 GFLOPS | Progress: (4/20) | 3.70 s
    [Task 22/25]  Current/Best:    8.88/  15.88 GFLOPS | Progress: (8/20) | 5.81 s
    [Task 22/25]  Current/Best:   10.31/  15.88 GFLOPS | Progress: (12/20) | 7.37 s
    [Task 22/25]  Current/Best:   10.38/  17.76 GFLOPS | Progress: (16/20) | 9.40 s
    [Task 22/25]  Current/Best:   15.43/  17.76 GFLOPS | Progress: (20/20) | 11.33 s Done.
-
    [Task 23/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 23/25]  Current/Best:   15.40/  15.40 GFLOPS | Progress: (4/20) | 9.43 s
    [Task 23/25]  Current/Best:    8.35/  18.50 GFLOPS | Progress: (8/20) | 12.46 s
    [Task 23/25]  Current/Best:    2.65/  18.50 GFLOPS | Progress: (12/20) | 17.38 s
    [Task 23/25]  Current/Best:   19.32/  21.58 GFLOPS | Progress: (16/20) | 19.25 s
    [Task 23/25]  Current/Best:   21.26/  21.58 GFLOPS | Progress: (20/20) | 21.76 s Done.
-
    [Task 24/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 24/25]  Current/Best:    1.71/   3.02 GFLOPS | Progress: (4/20) | 12.29 s
    [Task 24/25]  Current/Best:   10.23/  10.23 GFLOPS | Progress: (8/20) | 22.75 s
    [Task 24/25]  Current/Best:    9.36/  10.23 GFLOPS | Progress: (12/20) | 24.68 s
    [Task 24/25]  Current/Best:    8.32/  10.23 GFLOPS | Progress: (16/20) | 35.39 s
    [Task 24/25]  Current/Best:    3.73/  10.23 GFLOPS | Progress: (20/20) | 46.18 s
    [Task 25/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s Done.
-
    [Task 25/25]  Current/Best:    8.96/   8.96 GFLOPS | Progress: (4/20) | 12.24 s
    [Task 25/25]  Current/Best:    1.55/   9.20 GFLOPS | Progress: (8/20) | 22.97 s
    [Task 25/25]  Current/Best:    5.72/   9.20 GFLOPS | Progress: (12/20) | 25.69 s
    [Task 25/25]  Current/Best:    6.04/   9.20 GFLOPS | Progress: (16/20) | 28.45 s
    [Task 25/25]  Current/Best:    5.80/   9.23 GFLOPS | Progress: (20/20) | 39.22 s
+
    [Task 15/25]  Current/Best:   14.44/  17.40 GFLOPS | Progress: (4/20) | 3.84 s
    [Task 15/25]  Current/Best:    5.83/  18.82 GFLOPS | Progress: (8/20) | 8.84 s
    [Task 15/25]  Current/Best:   13.70/  18.82 GFLOPS | Progress: (12/20) | 10.89 s
    [Task 15/25]  Current/Best:   14.27/  18.82 GFLOPS | Progress: (16/20) | 13.60 s
    [Task 15/25]  Current/Best:   16.16/  22.41 GFLOPS | Progress: (20/20) | 14.89 s
    [Task 16/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 16/25]  Current/Best:   11.80/  18.54 GFLOPS | Progress: (4/20) | 3.09 s
    [Task 16/25]  Current/Best:   12.91/  18.54 GFLOPS | Progress: (8/20) | 4.84 s
    [Task 16/25]  Current/Best:   13.46/  20.12 GFLOPS | Progress: (12/20) | 6.82 s
    [Task 16/25]  Current/Best:   16.15/  20.12 GFLOPS | Progress: (16/20) | 8.31 s
    [Task 16/25]  Current/Best:   15.13/  20.12 GFLOPS | Progress: (20/20) | 9.69 s Done.
+
    [Task 17/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 17/25]  Current/Best:    9.27/  18.62 GFLOPS | Progress: (4/20) | 5.14 s
    [Task 17/25]  Current/Best:   16.37/  18.62 GFLOPS | Progress: (8/20) | 7.58 s
    [Task 17/25]  Current/Best:   12.77/  18.63 GFLOPS | Progress: (12/20) | 10.27 s
    [Task 17/25]  Current/Best:   11.99/  18.63 GFLOPS | Progress: (16/20) | 14.19 s
    [Task 17/25]  Current/Best:   18.82/  18.82 GFLOPS | Progress: (20/20) | 16.34 s Done.
+
    [Task 18/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 18/25]  Current/Best:   10.42/  18.29 GFLOPS | Progress: (4/20) | 4.43 s
    [Task 18/25]  Current/Best:    9.82/  18.29 GFLOPS | Progress: (8/20) | 6.98 s
    [Task 18/25]  Current/Best:   17.65/  20.30 GFLOPS | Progress: (12/20) | 9.26 s
    [Task 18/25]  Current/Best:   14.26/  20.30 GFLOPS | Progress: (16/20) | 15.38 s
    [Task 18/25]  Current/Best:    9.85/  20.30 GFLOPS | Progress: (20/20) | 18.55 s Done.
+
    [Task 19/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 19/25]  Current/Best:   16.00/  21.17 GFLOPS | Progress: (4/20) | 3.76 s
    [Task 19/25]  Current/Best:   10.49/  21.17 GFLOPS | Progress: (8/20) | 6.36 s
    [Task 19/25]  Current/Best:    4.90/  21.17 GFLOPS | Progress: (12/20) | 11.79 s
    [Task 19/25]  Current/Best:    7.65/  21.17 GFLOPS | Progress: (16/20) | 13.87 s
    [Task 19/25]  Current/Best:    9.34/  21.17 GFLOPS | Progress: (20/20) | 17.76 s Done.
+
    [Task 20/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 20/25]  Current/Best:   11.32/  11.32 GFLOPS | Progress: (4/20) | 4.14 s
    [Task 20/25]  Current/Best:    3.10/  16.60 GFLOPS | Progress: (8/20) | 6.81 s
    [Task 20/25]  Current/Best:   10.63/  16.60 GFLOPS | Progress: (12/20) | 8.47 s
    [Task 20/25]  Current/Best:   11.75/  20.00 GFLOPS | Progress: (16/20) | 11.10 s
    [Task 20/25]  Current/Best:   13.61/  20.00 GFLOPS | Progress: (20/20) | 14.49 s
    [Task 21/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 21/25]  Current/Best:   12.69/  17.47 GFLOPS | Progress: (4/20) | 2.84 s
    [Task 21/25]  Current/Best:   15.23/  20.09 GFLOPS | Progress: (8/20) | 5.06 s
    [Task 21/25]  Current/Best:    9.60/  20.09 GFLOPS | Progress: (12/20) | 6.54 s
    [Task 21/25]  Current/Best:   20.69/  20.69 GFLOPS | Progress: (16/20) | 11.99 s Done.
+     Done.
+
    [Task 21/25]  Current/Best:   17.53/  20.69 GFLOPS | Progress: (20/20) | 15.14 s Done.
+
    [Task 22/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 22/25]  Current/Best:    2.68/  18.79 GFLOPS | Progress: (4/20) | 3.48 s
    [Task 22/25]  Current/Best:   10.88/  18.79 GFLOPS | Progress: (8/20) | 6.74 s
    [Task 22/25]  Current/Best:   16.40/  18.79 GFLOPS | Progress: (12/20) | 10.06 s
    [Task 22/25]  Current/Best:    6.70/  18.79 GFLOPS | Progress: (16/20) | 13.25 s
    [Task 22/25]  Current/Best:   17.72/  19.62 GFLOPS | Progress: (20/20) | 15.28 s Done.
+
    [Task 23/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 23/25]  Current/Best:   19.78/  19.78 GFLOPS | Progress: (4/20) | 5.91 s
    [Task 23/25]  Current/Best:   18.81/  19.78 GFLOPS | Progress: (8/20) | 9.33 s
    [Task 23/25]  Current/Best:   17.94/  19.78 GFLOPS | Progress: (12/20) | 12.42 s
    [Task 23/25]  Current/Best:   13.09/  19.78 GFLOPS | Progress: (16/20) | 16.08 s
    [Task 23/25]  Current/Best:   19.67/  19.78 GFLOPS | Progress: (20/20) | 19.93 s Done.
+
    [Task 24/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 24/25]  Current/Best:    3.00/   3.00 GFLOPS | Progress: (4/20) | 12.07 s
    [Task 24/25]  Current/Best:    2.23/   7.47 GFLOPS | Progress: (8/20) | 23.95 s
    [Task 24/25]  Current/Best:    8.21/   8.21 GFLOPS | Progress: (12/20) | 33.25 s
    [Task 24/25]  Current/Best:    7.03/   8.21 GFLOPS | Progress: (16/20) | 43.50 s
    [Task 24/25]  Current/Best:    2.94/   8.26 GFLOPS | Progress: (20/20) | 45.81 s
    [Task 25/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s Done.
+
    [Task 25/25]  Current/Best:    5.81/   5.81 GFLOPS | Progress: (4/20) | 12.27 s
    [Task 25/25]  Current/Best:    9.19/   9.19 GFLOPS | Progress: (8/20) | 23.56 s
    [Task 25/25]  Current/Best:    3.83/   9.19 GFLOPS | Progress: (12/20) | 27.37 s
    [Task 25/25]  Current/Best:    8.59/   9.19 GFLOPS | Progress: (16/20) | 38.06 s
    [Task 25/25]  Current/Best:    5.66/   9.19 GFLOPS | Progress: (20/20) | 49.55 s
 
 
 
@@ -673,8 +674,8 @@ Verify that the optimized model runs and produces the same results:
 
  .. code-block:: none
 
-    class='n02123045 tabby, tabby cat' with probability=0.621104
-    class='n02123159 tiger cat' with probability=0.356377
+    class='n02123045 tabby, tabby cat' with probability=0.621102
+    class='n02123159 tiger cat' with probability=0.356380
     class='n02124075 Egyptian cat' with probability=0.019712
     class='n02129604 tiger, Panthera tigris' with probability=0.001215
     class='n04040759 radiator' with probability=0.000262
@@ -731,8 +732,8 @@ improvement in comparing the optimized model to the unoptimized model.
 
  .. code-block:: none
 
-    optimized: {'mean': 426.7246240199984, 'median': 427.6279031000058, 'std': 3.6335058826692914}
-    unoptimized: {'mean': 516.7721257999995, 'median': 516.1846908499967, 'std': 2.6758526756169507}
+    optimized: {'mean': 432.2289605100286, 'median': 430.22511945005135, 'std': 4.6967404645537965}
+    unoptimized: {'mean': 519.3600799600063, 'median': 519.3016042000181, 'std': 1.4946292799988907}
 
 
 
@@ -755,7 +756,7 @@ profiling/benchmarking.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 11 minutes  11.498 seconds)
+   **Total running time of the script:** ( 11 minutes  22.803 seconds)
 
 
 .. _sphx_glr_download_tutorial_autotvm_relay_x86.py:
diff --git a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
index 7d7e32aa6a..ec3331d84f 100644
--- a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
+++ b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
@@ -270,7 +270,7 @@ device and returns the measured cost. Network overhead is excluded.
 
  .. code-block:: none
 
-    1.261e-07 secs/op
+    1.19e-07 secs/op
 
 
 
diff --git a/docs/_sources/tutorial/intro_topi.rst.txt b/docs/_sources/tutorial/intro_topi.rst.txt
index 260881e12c..554f8e6e34 100644
--- a/docs/_sources/tutorial/intro_topi.rst.txt
+++ b/docs/_sources/tutorial/intro_topi.rst.txt
@@ -263,7 +263,7 @@ As you can see, scheduled stages of computation have been accumulated and we can
 
  .. code-block:: none
 
-    [stage(a, placeholder(a, 0x22d7f790)), stage(b, placeholder(b, 0xe42ade0)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min [...]
+    [stage(a, placeholder(a, 0x493c220)), stage(b, placeholder(b, 0x7b8a520)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min= [...]
 
 
 
diff --git a/docs/_sources/tutorial/sg_execution_times.rst.txt b/docs/_sources/tutorial/sg_execution_times.rst.txt
index f54bcaefed..b946754a04 100644
--- a/docs/_sources/tutorial/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorial/sg_execution_times.rst.txt
@@ -5,32 +5,32 @@
 
 Computation times
 =================
-**14:46.737** total execution time for **tutorial** files:
+**14:43.164** total execution time for **tutorial** files:
 
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``)                 | 11:11.498 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``)                 | 11:22.803 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 01:29.291 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 01:14.656 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``)     | 01:01.634 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``)     | 01:00.625 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``)                 | 00:36.329 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``)                 | 00:36.114 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``)               | 00:25.643 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``)               | 00:27.481 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``)       | 00:01.383 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``)                               | 00:00.769 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``)                               | 00:00.775 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``)       | 00:00.539 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.174 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.164 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_introduction.py` (``introduction.py``)                           | 00:00.006 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_introduction.py` (``introduction.py``)                           | 00:00.009 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_tutorial_uma.py` (``uma.py``)                                             | 00:00.001 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_install.py` (``install.py``)                                     | 00:00.001 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``)   | 00:00.001 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_tutorial_tvmc_python.py` (``tvmc_python.py``)                             | 00:00.001 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``)   | 00:00.001 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_install.py` (``install.py``)                                     | 00:00.001 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
index 735f2b8270..930870f699 100644
--- a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
+++ b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
@@ -295,7 +295,7 @@ helper function to run a profile of the TVM generated code.
  .. code-block:: none
 
     Numpy running time: 0.000007
-    naive: 0.000007
+    naive: 0.000008
 
 
 
@@ -394,7 +394,7 @@ compile and run this new schedule with the parallel operation applied:
 
  .. code-block:: none
 
-    parallel: 0.000007
+    parallel: 0.000008
 
 
 
@@ -501,10 +501,10 @@ We can now compare the different schedules
  .. code-block:: none
 
                 Operator                  Timing             Performance
-                   numpy    6.8515499992827245e-06                   1.0
-                   naive              6.7141e-06      0.9799388460571529
-                parallel              6.9646e-06      1.0164999161837993
-                  vector             2.45354e-05       3.580999920101081
+                   numpy    7.082329993863823e-06                    1.0
+                   naive    7.593499999999999e-06     1.0721754008326438
+                parallel               8.116e-06       1.145950556812766
+                  vector             2.46192e-05      3.4761441533125734
 
 
 
@@ -925,7 +925,7 @@ matrix multiplication.
 
  .. code-block:: none
 
-    Numpy running time: 0.018920
+    Numpy running time: 0.018744
 
 
 
@@ -983,7 +983,7 @@ optimizations.
 
  .. code-block:: none
 
-    none: 3.434763
+    none: 3.346230
 
 
 
@@ -1086,7 +1086,7 @@ schedule.
 
  .. code-block:: none
 
-    blocking: 0.308684
+    blocking: 0.316099
 
 
 
@@ -1182,7 +1182,7 @@ already cache friendly from our previous optimizations.
 
  .. code-block:: none
 
-    vectorization: 0.350372
+    vectorization: 0.351356
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1256,7 +1256,7 @@ more cache friendly.
 
  .. code-block:: none
 
-    loop permutation: 0.117353
+    loop permutation: 0.122902
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1355,7 +1355,7 @@ optimized schedule.
 
  .. code-block:: none
 
-    array packing: 0.109938
+    array packing: 0.109751
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1448,7 +1448,7 @@ to `C` when all the block results are ready.
 
  .. code-block:: none
 
-    block caching: 0.111174
+    block caching: 0.110485
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1534,7 +1534,7 @@ of thread-level parallelization.
 
  .. code-block:: none
 
-    parallelization: 0.147097
+    parallelization: 0.146882
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1615,13 +1615,13 @@ working, we can compare the results.
  .. code-block:: none
 
                 Operator                  Timing             Performance
-                    none            3.4347627511                     1.0
-                blocking            0.3086840032       0.089870545818963
-           vectorization            0.3503720796     0.10200765088878164
-        loop permutation            0.1173534316     0.03416638647383054
-           array packing            0.1099380713     0.03200747162661869
-           block caching     0.11117442880000002      0.0323674258911757
-         parallelization            0.1470974565     0.04282608935737739
+                    none            3.3462296834                     1.0
+                blocking     0.31609916180000003     0.09446427523134679
+           vectorization            0.3513558408     0.10500051521956444
+        loop permutation     0.12290175079999999     0.03672842644654426
+           array packing             0.109750898    0.032798375599993344
+           block caching            0.1104851169    0.033017792367360604
+         parallelization            0.1468822499    0.043894849964619745
 
 
 
@@ -1663,7 +1663,7 @@ the computation for specific platforms.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  1.634 seconds)
+   **Total running time of the script:** ( 1 minutes  0.625 seconds)
 
 
 .. _sphx_glr_download_tutorial_tensor_expr_get_started.py:
diff --git a/docs/commit_hash b/docs/commit_hash
index 1d83f46a99..7f87f78b9a 100644
--- a/docs/commit_hash
+++ b/docs/commit_hash
@@ -1 +1 @@
-ce777fde18bb4c1ef23a856a998c50606c7947f8
+b16a64d6edb9fd1a014fc51995dff7d0e2f4c84e
diff --git a/docs/genindex.html b/docs/genindex.html
index c69497bee2..07a5a64bab 100644
--- a/docs/genindex.html
+++ b/docs/genindex.html
@@ -985,10 +985,10 @@
 </li>
       <li><a href="reference/api/python/relay/nn.html#tvm.relay.nn.contrib_conv3d_winograd_weight_transform">contrib_conv3d_winograd_weight_transform() (in module tvm.relay.nn)</a>
 </li>
-  </ul></td>
-  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="reference/api/python/relay/nn.html#tvm.relay.nn.contrib_conv3d_winograd_without_weight_transform">contrib_conv3d_winograd_without_weight_transform() (in module tvm.relay.nn)</a>
 </li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="reference/api/python/relay/nn.html#tvm.relay.nn.contrib_dense_pack">contrib_dense_pack() (in module tvm.relay.nn)</a>
 </li>
       <li><a href="reference/api/python/relay/nn.html#tvm.relay.nn.contrib_depthwise_conv2d_nchwc">contrib_depthwise_conv2d_nchwc() (in module tvm.relay.nn)</a>
@@ -1044,6 +1044,10 @@
       <li><a href="reference/api/python/topi.html#tvm.topi.nn.conv2d_transpose_nchw">conv2d_transpose_nchw() (in module tvm.topi.nn)</a>
 </li>
       <li><a href="reference/api/python/topi.html#tvm.topi.nn.conv2d_transpose_nchw_preprocess">conv2d_transpose_nchw_preprocess() (in module tvm.topi.nn)</a>
+</li>
+      <li><a href="reference/api/python/topi.html#tvm.topi.nn.conv2d_winograd_nchw">conv2d_winograd_nchw() (in module tvm.topi.nn)</a>
+</li>
+      <li><a href="reference/api/python/topi.html#tvm.topi.nn.conv2d_winograd_nchw_without_weight_transform">conv2d_winograd_nchw_without_weight_transform() (in module tvm.topi.nn)</a>
 </li>
       <li><a href="reference/api/python/topi.html#tvm.topi.nn.conv2d_winograd_nhwc">conv2d_winograd_nhwc() (in module tvm.topi.nn)</a>
 </li>
diff --git a/docs/how_to/compile_models/from_darknet.html b/docs/how_to/compile_models/from_darknet.html
index b34776a2a3..83da127f58 100644
--- a/docs/how_to/compile_models/from_darknet.html
+++ b/docs/how_to/compile_models/from_darknet.html
@@ -585,7 +585,7 @@ class:[&#39;truck 0.9266&#39;] left:471 top:83 right:689 bottom:169
 class:[&#39;bicycle 0.9984&#39;] left:111 top:113 right:577 bottom:447
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  14.292 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  13.416 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-darknet-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7716f96385bd5abb6e822041e285be54/from_darknet.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_darknet.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/from_keras.html b/docs/how_to/compile_models/from_keras.html
index 5a92628a61..3e8e14895d 100644
--- a/docs/how_to/compile_models/from_keras.html
+++ b/docs/how_to/compile_models/from_keras.html
@@ -506,7 +506,7 @@ pip install -U tensorflow --user
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Relay top-1 id: 285, class name: Egyptian cat
 
 1/1 [==============================] - ETA: 0s
-1/1 [==============================] - 1s 949ms/step
+1/1 [==============================] - 1s 972ms/step
 Keras top-1 id: 285, class name: Egyptian cat
 </pre></div>
 </div>
diff --git a/docs/how_to/compile_models/from_mxnet.html b/docs/how_to/compile_models/from_mxnet.html
index 1d229d9240..478cb498e0 100644
--- a/docs/how_to/compile_models/from_mxnet.html
+++ b/docs/how_to/compile_models/from_mxnet.html
@@ -440,7 +440,7 @@ to download the full example code</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;x&quot;</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#tuple" title="builtins.tuple" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">x</span><span class="o">.</span><span class="n">shape</span></a><span class="p">)</span>
 </pre></div>
 </div>
-<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip6bf3f2ad-c7cc-4b27-b0d1-8236805957de from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zipe397f8ff-e142-4742-9764-ba71e41f7e82 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
 x (1, 3, 224, 224)
 </pre></div>
 </div>
diff --git a/docs/how_to/compile_models/from_oneflow.html b/docs/how_to/compile_models/from_oneflow.html
index 3184d549cd..8d9ae7f4a4 100644
--- a/docs/how_to/compile_models/from_oneflow.html
+++ b/docs/how_to/compile_models/from_oneflow.html
@@ -448,14 +448,13 @@ Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdo
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: &quot;https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip&quot; to /workspace/.oneflow/flowvision_cache/resnet18.zip
 
   0%|          | 0.00/41.5M [00:00&lt;?, ?B/s]
- 15%|#5        | 6.33M/41.5M [00:00&lt;00:00, 53.4MB/s]
- 28%|##7       | 11.4M/41.5M [00:00&lt;00:00, 43.1MB/s]
- 38%|###7      | 15.6M/41.5M [00:00&lt;00:00, 40.4MB/s]
- 49%|####8     | 20.3M/41.5M [00:00&lt;00:00, 43.5MB/s]
- 59%|#####9    | 24.5M/41.5M [00:00&lt;00:00, 34.9MB/s]
- 77%|#######7  | 32.0M/41.5M [00:00&lt;00:00, 41.3MB/s]
- 92%|#########2| 38.3M/41.5M [00:00&lt;00:00, 39.3MB/s]
-100%|##########| 41.5M/41.5M [00:01&lt;00:00, 41.4MB/s]
+ 15%|#5        | 6.33M/41.5M [00:00&lt;00:01, 33.1MB/s]
+ 27%|##7       | 11.3M/41.5M [00:00&lt;00:00, 41.0MB/s]
+ 39%|###8      | 16.0M/41.5M [00:00&lt;00:00, 35.2MB/s]
+ 58%|#####7    | 24.0M/41.5M [00:00&lt;00:00, 44.8MB/s]
+ 77%|#######7  | 32.0M/41.5M [00:00&lt;00:00, 53.1MB/s]
+ 96%|#########6| 40.0M/41.5M [00:00&lt;00:00, 58.4MB/s]
+100%|##########| 41.5M/41.5M [00:00&lt;00:00, 51.3MB/s]
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/compile_models/from_pytorch.html b/docs/how_to/compile_models/from_pytorch.html
index e25539e300..0d95d14e9f 100644
--- a/docs/how_to/compile_models/from_pytorch.html
+++ b/docs/how_to/compile_models/from_pytorch.html
@@ -431,10 +431,13 @@ be unstable.</p>
 Downloading: &quot;https://download.pytorch.org/models/resnet18-f37072fd.pth&quot; to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
 
   0%|          | 0.00/44.7M [00:00&lt;?, ?B/s]
- 18%|#7        | 7.99M/44.7M [00:00&lt;00:00, 67.8MB/s]
- 60%|######    | 26.9M/44.7M [00:00&lt;00:00, 137MB/s]
- 91%|#########1| 40.7M/44.7M [00:00&lt;00:00, 105MB/s]
-100%|##########| 44.7M/44.7M [00:00&lt;00:00, 113MB/s]
+ 14%|#4        | 6.42M/44.7M [00:00&lt;00:00, 67.3MB/s]
+ 29%|##8       | 12.8M/44.7M [00:00&lt;00:00, 56.9MB/s]
+ 41%|####1     | 18.4M/44.7M [00:00&lt;00:00, 41.0MB/s]
+ 54%|#####3    | 24.0M/44.7M [00:00&lt;00:00, 45.8MB/s]
+ 72%|#######1  | 32.0M/44.7M [00:00&lt;00:00, 51.3MB/s]
+ 90%|########9 | 40.0M/44.7M [00:00&lt;00:00, 56.3MB/s]
+100%|##########| 44.7M/44.7M [00:00&lt;00:00, 57.6MB/s]
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/compile_models/from_tensorflow.html b/docs/how_to/compile_models/from_tensorflow.html
index 82d570c46d..d0047577ad 100644
--- a/docs/how_to/compile_models/from_tensorflow.html
+++ b/docs/how_to/compile_models/from_tensorflow.html
@@ -645,7 +645,7 @@ banana (score = 0.00022)
 desk (score = 0.00019)
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  13.151 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  12.002 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-tensorflow-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7f1d3d1b878694c201c614c807cdebc8/from_tensorflow.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_tensorflow.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/sg_execution_times.html b/docs/how_to/compile_models/sg_execution_times.html
index 6bb32ee2a8..8d516892bf 100644
--- a/docs/how_to/compile_models/sg_execution_times.html
+++ b/docs/how_to/compile_models/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-compile-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>05:56.507</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
+<p><strong>05:52.428</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 81%" />
@@ -349,43 +349,43 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_darknet.html#sphx-glr-how-to-compile-models-from-darknet-py"><span class="std std-ref">Compile YOLO-V2 and YOLO-V3 in DarkNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_darknet.py</span></code>)</p></td>
-<td><p>01:14.292</p></td>
+<td><p>01:13.416</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_tensorflow.html#sphx-glr-how-to-compile-models-from-tensorflow-py"><span class="std std-ref">Compile Tensorflow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tensorflow.py</span></code>)</p></td>
-<td><p>01:13.151</p></td>
+<td><p>01:12.002</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_paddle.html#sphx-glr-how-to-compile-models-from-paddle-py"><span class="std std-ref">Compile PaddlePaddle Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_paddle.py</span></code>)</p></td>
-<td><p>00:48.189</p></td>
+<td><p>00:46.691</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_oneflow.html#sphx-glr-how-to-compile-models-from-oneflow-py"><span class="std std-ref">Compile OneFlow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_oneflow.py</span></code>)</p></td>
-<td><p>00:33.162</p></td>
+<td><p>00:33.083</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></td>
-<td><p>00:31.241</p></td>
+<td><p>00:29.497</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_coreml.html#sphx-glr-how-to-compile-models-from-coreml-py"><span class="std std-ref">Compile CoreML Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_coreml.py</span></code>)</p></td>
-<td><p>00:28.045</p></td>
+<td><p>00:27.011</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_tflite.html#sphx-glr-how-to-compile-models-from-tflite-py"><span class="std std-ref">Compile TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tflite.py</span></code>)</p></td>
-<td><p>00:25.118</p></td>
+<td><p>00:25.709</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_pytorch.html#sphx-glr-how-to-compile-models-from-pytorch-py"><span class="std std-ref">Compile PyTorch Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_pytorch.py</span></code>)</p></td>
-<td><p>00:22.425</p></td>
+<td><p>00:23.732</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_keras.html#sphx-glr-how-to-compile-models-from-keras-py"><span class="std std-ref">Compile Keras Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_keras.py</span></code>)</p></td>
-<td><p>00:18.418</p></td>
+<td><p>00:18.859</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_onnx.html#sphx-glr-how-to-compile-models-from-onnx-py"><span class="std std-ref">Compile ONNX Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_onnx.py</span></code>)</p></td>
-<td><p>00:02.465</p></td>
+<td><p>00:02.428</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/deploy_models/deploy_model_on_android.html b/docs/how_to/deploy_models/deploy_model_on_android.html
index 6cf6253056..36e61e92a7 100644
--- a/docs/how_to/deploy_models/deploy_model_on_android.html
+++ b/docs/how_to/deploy_models/deploy_model_on_android.html
@@ -662,7 +662,7 @@ to the remote android device.</p>
 Evaluate inference time cost...
 Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  16.4190      16.4221      16.5149      16.3321       0.0615
+  16.1611      16.1398      16.5247      15.9416       0.1550
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
index 401ab5c388..34879c84af 100644
--- a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
+++ b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
@@ -453,27 +453,38 @@ be unstable.</p>
 Downloading: &quot;https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth&quot; to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
 
   0%|          | 0.00/170M [00:00&lt;?, ?B/s]
-  1%|1         | 2.05M/170M [00:00&lt;00:10, 17.2MB/s]
-  3%|3         | 5.18M/170M [00:00&lt;00:08, 21.3MB/s]
-  7%|6         | 11.4M/170M [00:00&lt;00:04, 38.4MB/s]
-  9%|9         | 16.0M/170M [00:00&lt;00:04, 37.3MB/s]
- 16%|#5        | 26.8M/170M [00:00&lt;00:02, 61.2MB/s]
- 24%|##3       | 40.0M/170M [00:00&lt;00:01, 79.5MB/s]
- 32%|###2      | 54.9M/170M [00:00&lt;00:01, 102MB/s]
- 38%|###8      | 65.0M/170M [00:00&lt;00:01, 99.9MB/s]
- 44%|####4     | 74.8M/170M [00:01&lt;00:01, 77.1MB/s]
- 49%|####8     | 83.0M/170M [00:01&lt;00:01, 74.9MB/s]
- 55%|#####4    | 92.8M/170M [00:01&lt;00:00, 81.6MB/s]
- 60%|#####9    | 101M/170M [00:01&lt;00:00, 82.1MB/s]
- 64%|######4   | 109M/170M [00:01&lt;00:01, 60.2MB/s]
- 68%|######8   | 116M/170M [00:01&lt;00:00, 56.4MB/s]
- 72%|#######1  | 122M/170M [00:01&lt;00:00, 56.1MB/s]
- 75%|#######5  | 128M/170M [00:02&lt;00:00, 52.1MB/s]
- 80%|########  | 136M/170M [00:02&lt;00:00, 58.1MB/s]
- 85%|########4 | 144M/170M [00:02&lt;00:00, 62.7MB/s]
- 92%|#########2| 156M/170M [00:02&lt;00:00, 80.0MB/s]
- 97%|#########6| 165M/170M [00:02&lt;00:00, 69.5MB/s]
-100%|##########| 170M/170M [00:02&lt;00:00, 67.5MB/s]
+  4%|3         | 6.30M/170M [00:00&lt;00:02, 61.9MB/s]
+  7%|7         | 12.2M/170M [00:00&lt;00:03, 53.4MB/s]
+ 10%|#         | 17.4M/170M [00:00&lt;00:04, 38.9MB/s]
+ 13%|#3        | 22.3M/170M [00:00&lt;00:03, 41.5MB/s]
+ 16%|#5        | 26.5M/170M [00:00&lt;00:03, 40.0MB/s]
+ 19%|#8        | 32.1M/170M [00:00&lt;00:03, 45.2MB/s]
+ 24%|##3       | 40.0M/170M [00:00&lt;00:02, 46.8MB/s]
+ 28%|##8       | 47.8M/170M [00:01&lt;00:02, 55.8MB/s]
+ 31%|###1      | 53.4M/170M [00:01&lt;00:02, 53.2MB/s]
+ 35%|###4      | 58.7M/170M [00:01&lt;00:02, 52.2MB/s]
+ 38%|###7      | 63.8M/170M [00:01&lt;00:03, 34.7MB/s]
+ 40%|###9      | 67.9M/170M [00:01&lt;00:03, 31.2MB/s]
+ 43%|####2     | 72.5M/170M [00:01&lt;00:02, 34.6MB/s]
+ 46%|####6     | 78.3M/170M [00:01&lt;00:02, 36.1MB/s]
+ 48%|####8     | 82.1M/170M [00:02&lt;00:02, 34.6MB/s]
+ 51%|#####     | 86.3M/170M [00:02&lt;00:02, 34.1MB/s]
+ 53%|#####2    | 89.7M/170M [00:02&lt;00:02, 31.9MB/s]
+ 56%|#####5    | 94.3M/170M [00:02&lt;00:02, 32.5MB/s]
+ 57%|#####7    | 97.5M/170M [00:02&lt;00:02, 32.6MB/s]
+ 60%|######    | 102M/170M [00:02&lt;00:02, 27.4MB/s]
+ 62%|######1   | 105M/170M [00:03&lt;00:02, 24.8MB/s]
+ 66%|######5   | 112M/170M [00:03&lt;00:01, 32.0MB/s]
+ 70%|######9   | 118M/170M [00:03&lt;00:01, 38.5MB/s]
+ 72%|#######1  | 122M/170M [00:03&lt;00:01, 38.2MB/s]
+ 75%|#######5  | 128M/170M [00:03&lt;00:01, 36.7MB/s]
+ 80%|########  | 136M/170M [00:03&lt;00:00, 39.3MB/s]
+ 85%|########4 | 144M/170M [00:03&lt;00:00, 43.9MB/s]
+ 88%|########8 | 150M/170M [00:04&lt;00:00, 31.6MB/s]
+ 91%|#########1| 155M/170M [00:04&lt;00:00, 34.3MB/s]
+ 94%|#########4| 160M/170M [00:04&lt;00:00, 34.3MB/s]
+ 99%|#########8| 168M/170M [00:04&lt;00:00, 41.2MB/s]
+100%|##########| 170M/170M [00:04&lt;00:00, 38.1MB/s]
 /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/nn/functional.py:3897: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
   for i in range(dim)
 /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/detection/anchor_utils.py:124: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the &#39;trunc&#39; function NOT &#39;floor&#39;). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode=&#39;trunc&#39;), or for actual floor division, use torch.div(a, b, rounding_mode=& [...]
@@ -571,7 +582,7 @@ torchvision rcnn models.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Get 9 valid boxes
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  19.999 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  22.367 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-object-detection-pytorch-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7795da4b258c8feff986668b95ef57ad/deploy_object_detection_pytorch.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_object_detection_pytorch.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized.html b/docs/how_to/deploy_models/deploy_prequantized.html
index d79085f746..b8d63f55f8 100644
--- a/docs/how_to/deploy_models/deploy_prequantized.html
+++ b/docs/how_to/deploy_models/deploy_prequantized.html
@@ -497,9 +497,8 @@ training. Other models require a full post training calibration.</p>
 Downloading: &quot;https://download.pytorch.org/models/mobilenet_v2-b0353104.pth&quot; to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
 
   0%|          | 0.00/13.6M [00:00&lt;?, ?B/s]
- 48%|####7     | 6.46M/13.6M [00:00&lt;00:00, 67.7MB/s]
- 95%|#########5| 12.9M/13.6M [00:00&lt;00:00, 55.8MB/s]
-100%|##########| 13.6M/13.6M [00:00&lt;00:00, 59.2MB/s]
+ 59%|#####8    | 7.99M/13.6M [00:00&lt;00:00, 64.1MB/s]
+100%|##########| 13.6M/13.6M [00:00&lt;00:00, 93.5MB/s]
 </pre></div>
 </div>
 </div>
@@ -590,7 +589,7 @@ output values are identical out of 1000 outputs from mobilenet v2.</p>
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  90.6124      90.5421      91.8263      90.1658       0.3243
+  90.4649      90.2693      95.9496      90.0914       0.7106
 </pre></div>
 </div>
 <div class="admonition note">
@@ -629,7 +628,7 @@ This includes support for the VNNI 8 bit dot product instruction (CascadeLake or
 <div class="section" id="deploy-a-quantized-tflite-model">
 <h2>Deploy a quantized TFLite Model<a class="headerlink" href="#deploy-a-quantized-tflite-model" title="Permalink to this headline">¶</a></h2>
 <p>TODO</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  8.086 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  7.421 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/fb8217c13f4351224c6cf3aacf1a87fc/deploy_prequantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized_tflite.html b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
index 751e236266..e6659647f8 100644
--- a/docs/how_to/deploy_models/deploy_prequantized_tflite.html
+++ b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
@@ -582,7 +582,7 @@ TFLite Top-5 labels: [387 102 386 341 349]
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  121.2750     121.1947     123.1208     120.4836      0.4973
+  120.8390     120.7152     124.1033     119.9997      0.6289
 </pre></div>
 </div>
 <div class="admonition note">
@@ -610,7 +610,7 @@ network for ARM CPU</span></a>.</p></li>
 </ul>
 </div></blockquote>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  30.376 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  29.844 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-tflite-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/56691c7a27d45da61d112276334640d3/deploy_prequantized_tflite.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized_tflite.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_quantized.html b/docs/how_to/deploy_models/deploy_quantized.html
index 834e8d62c0..d5a1956fe0 100644
--- a/docs/how_to/deploy_models/deploy_quantized.html
+++ b/docs/how_to/deploy_models/deploy_quantized.html
@@ -520,7 +520,7 @@ for calibration. But the accuracy might be impacted.</p>
   DeprecationWarning,
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  31.056 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  55.880 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-quantized-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7810ecf51bfc05f7d5e8a400ac3e815d/deploy_quantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_quantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
index ffca6f430d..c91631b5a8 100644
--- a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
+++ b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
@@ -462,23 +462,23 @@ to your device.</p>
 Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
 
   0%|          | 0/132723 [00:00&lt;?, ?KB/s]
-  3%|2         | 3679/132723 [00:00&lt;00:03, 36784.77KB/s]
-  8%|8         | 11013/132723 [00:00&lt;00:02, 58282.29KB/s]
- 15%|#4        | 19714/132723 [00:00&lt;00:01, 71395.71KB/s]
- 21%|##1       | 28372/132723 [00:00&lt;00:01, 77387.86KB/s]
- 28%|##7       | 36976/132723 [00:00&lt;00:01, 80506.26KB/s]
- 34%|###4      | 45695/132723 [00:00&lt;00:01, 82776.10KB/s]
- 41%|####      | 54282/132723 [00:00&lt;00:00, 83784.86KB/s]
- 47%|####7     | 63028/132723 [00:00&lt;00:00, 84951.26KB/s]
- 54%|#####4    | 71712/132723 [00:00&lt;00:00, 85539.59KB/s]
- 61%|######    | 80417/132723 [00:01&lt;00:00, 86002.08KB/s]
- 67%|######7   | 89126/132723 [00:01&lt;00:00, 86264.48KB/s]
- 74%|#######3  | 97826/132723 [00:01&lt;00:00, 86486.24KB/s]
- 80%|########  | 106535/132723 [00:01&lt;00:00, 86666.77KB/s]
- 87%|########6 | 115202/132723 [00:01&lt;00:00, 86522.76KB/s]
- 93%|#########3| 123855/132723 [00:01&lt;00:00, 86517.83KB/s]
-100%|#########9| 132701/132723 [00:01&lt;00:00, 87099.53KB/s]
-100%|##########| 132723/132723 [00:01&lt;00:00, 82750.86KB/s]
+  4%|3         | 4692/132723 [00:00&lt;00:02, 46914.12KB/s]
+ 10%|#         | 13412/132723 [00:00&lt;00:01, 70605.36KB/s]
+ 15%|#5        | 20473/132723 [00:00&lt;00:01, 69856.79KB/s]
+ 22%|##1       | 29149/132723 [00:00&lt;00:01, 76482.76KB/s]
+ 28%|##7       | 36803/132723 [00:00&lt;00:01, 68852.31KB/s]
+ 34%|###4      | 45556/132723 [00:00&lt;00:01, 74748.18KB/s]
+ 40%|####      | 53153/132723 [00:00&lt;00:01, 60600.90KB/s]
+ 47%|####6     | 62015/132723 [00:00&lt;00:01, 67990.43KB/s]
+ 52%|#####2    | 69278/132723 [00:01&lt;00:01, 57875.78KB/s]
+ 58%|#####8    | 77196/132723 [00:01&lt;00:00, 63120.16KB/s]
+ 63%|######3   | 84005/132723 [00:01&lt;00:00, 55720.83KB/s]
+ 70%|######9   | 92761/132723 [00:01&lt;00:00, 63467.05KB/s]
+ 75%|#######5  | 100009/132723 [00:01&lt;00:00, 65789.42KB/s]
+ 82%|########1 | 108721/132723 [00:01&lt;00:00, 71530.75KB/s]
+ 88%|########7 | 116221/132723 [00:01&lt;00:00, 56441.08KB/s]
+ 94%|#########4| 125006/132723 [00:01&lt;00:00, 63853.39KB/s]
+100%|##########| 132723/132723 [00:02&lt;00:00, 65271.66KB/s]
 </pre></div>
 </div>
 <p>Create TVM runtime and do inference
@@ -517,7 +517,7 @@ Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from h
 <span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
 </pre></div>
 </div>
-<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  5.790 seconds)</p>
+<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  5.709 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-ssd-gluoncv-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/cccb17d28e5e8b2e94ea8cd5ec59f6ed/deploy_ssd_gluoncv.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_ssd_gluoncv.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/sg_execution_times.html b/docs/how_to/deploy_models/sg_execution_times.html
index 52f6dafe76..c28e43f1ad 100644
--- a/docs/how_to/deploy_models/sg_execution_times.html
+++ b/docs/how_to/deploy_models/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-deploy-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>13:02.812</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
+<p><strong>13:29.609</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 86%" />
@@ -349,35 +349,35 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_object_detection_pytorch.html#sphx-glr-how-to-deploy-models-deploy-object-detection-pytorch-py"><span class="std std-ref">Compile PyTorch Object Detection Models</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_object_detection_pytorch.py</span></code>)</p></td>
-<td><p>03:19.999</p></td>
+<td><p>03:22.367</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_ssd_gluoncv.html#sphx-glr-how-to-deploy-models-deploy-ssd-gluoncv-py"><span class="std std-ref">Deploy Single Shot Multibox Detector(SSD) model</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_ssd_gluoncv.py</span></code>)</p></td>
-<td><p>03:05.790</p></td>
+<td><p>03:05.709</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized_tflite.html#sphx-glr-how-to-deploy-models-deploy-prequantized-tflite-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite)</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized_tflite.py</span></code>)</p></td>
-<td><p>02:30.376</p></td>
+<td><p>02:29.844</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_quantized.html#sphx-glr-how-to-deploy-models-deploy-quantized-py"><span class="std std-ref">Deploy a Quantized Model on Cuda</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_quantized.py</span></code>)</p></td>
-<td><p>01:31.056</p></td>
+<td><p>01:55.880</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized.html#sphx-glr-how-to-deploy-models-deploy-prequantized-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized.py</span></code>)</p></td>
-<td><p>01:08.086</p></td>
+<td><p>01:07.421</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_android.html#sphx-glr-how-to-deploy-models-deploy-model-on-android-py"><span class="std std-ref">Deploy the Pretrained Model on Android</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_android.py</span></code>)</p></td>
-<td><p>00:37.059</p></td>
+<td><p>00:36.832</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_model_on_nano.html#sphx-glr-how-to-deploy-models-deploy-model-on-nano-py"><span class="std std-ref">Deploy the Pretrained Model on Jetson Nano</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_nano.py</span></code>)</p></td>
-<td><p>00:25.535</p></td>
+<td><p>00:26.055</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_rasp.html#sphx-glr-how-to-deploy-models-deploy-model-on-rasp-py"><span class="std std-ref">Deploy the Pretrained Model on Raspberry Pi</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_rasp.py</span></code>)</p></td>
-<td><p>00:24.905</p></td>
+<td><p>00:25.495</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_sparse.html#sphx-glr-how-to-deploy-models-deploy-sparse-py"><span class="std std-ref">Deploy a Hugging Face Pruned Model on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_sparse.py</span></code>)</p></td>
diff --git a/docs/how_to/extend_tvm/bring_your_own_datatypes.html b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
index c6f315a669..e6af5638e1 100644
--- a/docs/how_to/extend_tvm/bring_your_own_datatypes.html
+++ b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
@@ -621,7 +621,7 @@ In this alpha state of the Bring Your Own Datatypes framework, we have not imple
 <span class="n">module</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#dict" title="builtins.dict" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">params</span></a> <span class="o">=</span> <span class="n">get_mobilenet</span><span class="p">()</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip45149bf0-8821-4ad1-8130-cdb110608ac0 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip2692ce48-4cfd-490f-a4fe-279be8b87c67 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
 </pre></div>
 </div>
 <p>It’s easy to execute MobileNet with native TVM:</p>
diff --git a/docs/how_to/extend_tvm/sg_execution_times.html b/docs/how_to/extend_tvm/sg_execution_times.html
index 6480316282..b3ba7bbe01 100644
--- a/docs/how_to/extend_tvm/sg_execution_times.html
+++ b/docs/how_to/extend_tvm/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-extend-tvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:49.747</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
+<p><strong>00:47.355</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 84%" />
@@ -349,19 +349,19 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="bring_your_own_datatypes.html#sphx-glr-how-to-extend-tvm-bring-your-own-datatypes-py"><span class="std std-ref">Bring Your Own Datatypes to TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">bring_your_own_datatypes.py</span></code>)</p></td>
-<td><p>00:46.247</p></td>
+<td><p>00:43.952</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="use_pass_instrument.html#sphx-glr-how-to-extend-tvm-use-pass-instrument-py"><span class="std std-ref">How to Use TVM Pass Instrument</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_instrument.py</span></code>)</p></td>
-<td><p>00:02.438</p></td>
+<td><p>00:02.367</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="use_pass_infra.html#sphx-glr-how-to-extend-tvm-use-pass-infra-py"><span class="std std-ref">How to Use TVM Pass Infra</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_infra.py</span></code>)</p></td>
-<td><p>00:01.053</p></td>
+<td><p>00:01.028</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="low_level_custom_pass.html#sphx-glr-how-to-extend-tvm-low-level-custom-pass-py"><span class="std std-ref">Writing a Customized Pass</span></a> (<code class="docutils literal notranslate"><span class="pre">low_level_custom_pass.py</span></code>)</p></td>
-<td><p>00:00.009</p></td>
+<td><p>00:00.008</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/extend_tvm/use_pass_instrument.html b/docs/how_to/extend_tvm/use_pass_instrument.html
index bac21edf97..9a4bb6566d 100644
--- a/docs/how_to/extend_tvm/use_pass_instrument.html
+++ b/docs/how_to/extend_tvm/use_pass_instrument.html
@@ -525,10 +525,10 @@ profile the execution time of each passes.</p>
 </pre></div>
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 6756us [6756us] (46.50%; 46.50%)
-FoldScaleAxis: 7773us [6us] (53.50%; 53.50%)
-        FoldConstant: 7767us [1534us] (53.46%; 99.92%)
-                InferType: 6233us [6233us] (42.90%; 80.25%)
+InferType: 7305us [7305us] (47.99%; 47.99%)
+FoldScaleAxis: 7916us [7us] (52.01%; 52.01%)
+        FoldConstant: 7909us [1631us] (51.96%; 99.92%)
+                InferType: 6278us [6278us] (41.25%; 79.38%)
 </pre></div>
 </div>
 </div>
@@ -550,10 +550,10 @@ Refer to following sections and <a class="reference internal" href="../../refere
 </pre></div>
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 6221us [6221us] (44.87%; 44.87%)
-FoldScaleAxis: 7644us [5us] (55.13%; 55.13%)
-        FoldConstant: 7639us [1559us] (55.10%; 99.93%)
-                InferType: 6081us [6081us] (43.86%; 79.59%)
+InferType: 6272us [6272us] (44.65%; 44.65%)
+FoldScaleAxis: 7775us [4us] (55.35%; 55.35%)
+        FoldConstant: 7771us [1613us] (55.32%; 99.94%)
+                InferType: 6158us [6158us] (43.84%; 79.24%)
 </pre></div>
 </div>
 <p>Register empty list to clear existing instruments.</p>
diff --git a/docs/how_to/optimize_operators/opt_conv_cuda.html b/docs/how_to/optimize_operators/opt_conv_cuda.html
index 08c05b58b8..de5fa8eecb 100644
--- a/docs/how_to/optimize_operators/opt_conv_cuda.html
+++ b/docs/how_to/optimize_operators/opt_conv_cuda.html
@@ -577,7 +577,7 @@ latency of convolution.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Convolution: </span><span class="si">%f</span><span class="s2"> ms&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">b</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">*</span> <span cl [...]
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 43.150497 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 54.192222 ms
 </pre></div>
 </div>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-optimize-operators-opt-conv-cuda-py">
diff --git a/docs/how_to/optimize_operators/opt_conv_tensorcore.html b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
index cf8f4298ea..3af6e84603 100644
--- a/docs/how_to/optimize_operators/opt_conv_tensorcore.html
+++ b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
@@ -916,7 +916,7 @@ be able to run on our build server</p>
     <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;conv2d with tensor core: </span><span class="si">%f</span><span class="s2"> ms&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">* [...]
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 13.345908 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 13.365604 ms
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/optimize_operators/opt_gemm.html b/docs/how_to/optimize_operators/opt_gemm.html
index bb6f0ebdb6..7a624a9516 100644
--- a/docs/how_to/optimize_operators/opt_gemm.html
+++ b/docs/how_to/optimize_operators/opt_gemm.html
@@ -474,8 +474,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Baseline: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018938
-Baseline: 3.437834
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018726
+Baseline: 3.452490
 </pre></div>
 </div>
 <p>In TVM, we can always inspect lower level IR to debug or optimize our schedule.
@@ -535,7 +535,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt1: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.312362
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.303881
 </pre></div>
 </div>
 <p>Here is the generated IR after blocking.</p>
@@ -602,7 +602,7 @@ vastly.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt2: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.337694
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.345987
 </pre></div>
 </div>
 <p>Here is the generated IR after vectorization.</p>
@@ -663,7 +663,7 @@ the access pattern for A matrix is more cache friendly.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt3: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.118729
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.118088
 </pre></div>
 </div>
 <p>Here is the generated IR after loop permutation.</p>
@@ -746,7 +746,7 @@ flattening.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt4: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.109678
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.109866
 </pre></div>
 </div>
 <p>Here is the generated IR after array packing.</p>
@@ -832,7 +832,7 @@ write to C when all the block results are ready.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt5: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.111738
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.111457
 </pre></div>
 </div>
 <p>Here is the generated IR after blocking.</p>
@@ -922,7 +922,7 @@ write to C when all the block results are ready.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt6: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">opt6_time</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.147516
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.147367
 </pre></div>
 </div>
 <p>Here is the generated IR after parallelization.</p>
diff --git a/docs/how_to/optimize_operators/sg_execution_times.html b/docs/how_to/optimize_operators/sg_execution_times.html
index 91a1396d25..827ec49e34 100644
--- a/docs/how_to/optimize_operators/sg_execution_times.html
+++ b/docs/how_to/optimize_operators/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-optimize-operators-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:35.273</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
+<p><strong>00:35.178</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 83%" />
@@ -349,15 +349,15 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="opt_gemm.html#sphx-glr-how-to-optimize-operators-opt-gemm-py"><span class="std std-ref">How to optimize GEMM on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_gemm.py</span></code>)</p></td>
-<td><p>00:32.697</p></td>
+<td><p>00:32.664</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="opt_conv_tensorcore.html#sphx-glr-how-to-optimize-operators-opt-conv-tensorcore-py"><span class="std std-ref">How to optimize convolution using TensorCores</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_tensorcore.py</span></code>)</p></td>
-<td><p>00:01.477</p></td>
+<td><p>00:01.459</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="opt_conv_cuda.html#sphx-glr-how-to-optimize-operators-opt-conv-cuda-py"><span class="std std-ref">How to optimize convolution on GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_cuda.py</span></code>)</p></td>
-<td><p>00:01.099</p></td>
+<td><p>00:01.055</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
index 76516a2c14..4a61b65546 100644
--- a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
+++ b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-tune-with-autoscheduler-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>09:04.707</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
+<p><strong>09:11.143</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 85%" />
@@ -349,27 +349,27 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_layer_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py"><span class="std std-ref">Auto-scheduling a Convolution Layer for GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_layer_cuda.py</span></code>)</p></td>
-<td><p>05:33.921</p></td>
+<td><p>05:43.674</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tune_network_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-x86-py"><span class="std std-ref">Auto-scheduling a Neural Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_x86.py</span></code>)</p></td>
-<td><p>01:33.757</p></td>
+<td><p>01:32.818</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_network_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-cuda-py"><span class="std std-ref">Auto-scheduling a Neural Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_cuda.py</span></code>)</p></td>
-<td><p>01:04.112</p></td>
+<td><p>01:03.505</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tune_sparse_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-sparse-x86-py"><span class="std std-ref">Auto-scheduling Sparse Matrix Multiplication on CPU with Custom Sketch Rule</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_sparse_x86.py</span></code>)</p></td>
-<td><p>00:29.522</p></td>
+<td><p>00:27.992</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_network_arm.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-arm-py"><span class="std std-ref">Auto-scheduling a Neural Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_arm.py</span></code>)</p></td>
-<td><p>00:12.100</p></td>
+<td><p>00:11.975</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tune_network_mali.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-mali-py"><span class="std std-ref">Auto-scheduling a Neural Network for mali GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_mali.py</span></code>)</p></td>
-<td><p>00:11.296</p></td>
+<td><p>00:11.178</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
index c85ca2f9be..577e09a014 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
@@ -504,45 +504,153 @@ cooperative fetching, unrolling and operator fusion.</p>
              compute: Buffer(compute_2: Pointer(float32), float32, [25088], [])}
   buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute}
   preflattened_buffer_map = {data_1: data_3: Buffer(data_2, float32, [1, 512, 7, 7], []), kernel_1: kernel_3: Buffer(kernel_2, float32, [512, 512, 3, 3], []), bias_1: bias_3: Buffer(bias_2, float32, [1, 512, 1, 1], []), compute_1: compute_3: Buffer(compute_2, float32, [1, 512, 7, 7], [])} {
-  attr [IterVar(blockIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;blockIdx.x&quot;)] &quot;thread_extent&quot; = 32;
-  allocate(conv2d_nchw: Pointer(local float32), float32, [4]), storage_scope = local;
-  allocate(pad_temp.shared: Pointer(shared float32), float32, [252]), storage_scope = shared;
-  allocate(kernel.shared: Pointer(shared float32), float32, [192]), storage_scope = shared;
-  attr [IterVar(threadIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 196 {
-    for (ff.inner.init: int32, 0, 2) {
-      conv2d_nchw_1: Buffer(conv2d_nchw, float32, [4], [], scope=&quot;local&quot;, align=8)[ff.inner.init] = 0f32
-      conv2d_nchw_1[(ff.inner.init + 2)] = 0f32
-    }
-    for (rc.outer.outer: int32, 0, 128) {
+  attr [IterVar(blockIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;blockIdx.x&quot;)] &quot;thread_extent&quot; = 64;
+  allocate(conv2d_nchw: Pointer(local float32), float32, [7]), storage_scope = local;
+  allocate(pad_temp.shared: Pointer(shared float32), float32, [2016]), storage_scope = shared;
+  allocate(kernel.shared: Pointer(shared float32), float32, [768]), storage_scope = shared;
+  attr [IterVar(threadIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56 {
+    conv2d_nchw_1: Buffer(conv2d_nchw, float32, [7], [], scope=&quot;local&quot;, align=16)[0] = 0f32
+    conv2d_nchw_1[1] = 0f32
+    conv2d_nchw_1[2] = 0f32
+    conv2d_nchw_1[3] = 0f32
+    conv2d_nchw_1[4] = 0f32
+    conv2d_nchw_1[5] = 0f32
+    conv2d_nchw_1[6] = 0f32
+    for (rc.outer.outer: int32, 0, 16) {
       for (rx.outer.outer: int32, 0, 3) {
-        for (ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer: int32, 0, 2) {
-          attr [IterVar(threadIdx.x_1: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 196;
-          if @tir.likely((((ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer*7) + floordiv(threadIdx.x_1, 28)) &lt; 9), dtype=bool) {
-            pad_temp.shared_1: Buffer(pad_temp.shared, float32, [252], [], scope=&quot;shared&quot;)[((ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer*196) + threadIdx.x_1)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.ou [...]
+        let cse_var_2: int32 = (rc.outer.outer*1568)
+        let cse_var_1: int32 = (rc.outer.outer*288)
+         {
+          attr [IterVar(threadIdx.x_1: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1: Buffer(pad_temp.shared, float32, [2016], [], scope=&quot;shared&quot;)[threadIdx.x_1] = @tir.if_then_else((((7 &lt;= threadIdx.x_1) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((cse_var_2 + threadIdx.x_1) + rx.outer.outer) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 56)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 8), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 8), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 56), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 8), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1,  [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 112)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 7), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 7), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 112), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 7), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 168)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 168), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 224)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 224), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 280)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 4), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 4), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 280), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 4), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 336)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 3), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 3), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 336), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 3), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 392)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 392), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 448)] = @tir.if_then_else((((threadIdx.x_1 &lt; 49) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 448), 63)*49)) + ((floordiv(threadIdx.x_1, 7) + 1)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 504)] = @tir.if_then_else((((7 &lt;= threadIdx.x_1) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((cse_var_2 + threadIdx.x_1) + rx.outer.outer) + 384)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 560)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 8), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 8), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 560), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 8), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 616)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 7), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 7), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 616), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 7), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 672)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 672), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 728)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 728), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 784)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 4), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 4), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 784), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 4), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 840)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 3), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 3), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 840), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 3), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 896)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 896), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 952)] = @tir.if_then_else((((threadIdx.x_1 &lt; 49) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 952), 63)*49)) + ((floordiv(threadIdx.x_1, 7) + 1)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 1008)] = @tir.if_then_else((((7 &lt;= threadIdx.x_1) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((cse_var_2 + threadIdx.x_1) + rx.outer.outer) + 776)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 1064)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 8), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 8), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1064), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 8), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 1120)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 7), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 7), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1120), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 7), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 1176)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1176), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 1232)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1232), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 1288)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 4), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 4), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1288), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 4), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 1344)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 3), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 3), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1344), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 3), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 1400)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1400), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 1456)] = @tir.if_then_else((((threadIdx.x_1 &lt; 49) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1456), 63)*49)) + ((floordiv(threadIdx.x_1, 7) + 1)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 1512)] = @tir.if_then_else((((7 &lt;= threadIdx.x_1) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((cse_var_2 + threadIdx.x_1) + rx.outer.outer) + 1168)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 1568)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 8), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 8), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1568), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 8), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 1624)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 7), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 7), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1624), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 7), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 1680)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1680), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 1736)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1736), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 1792)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 4), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 4), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1792), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 4), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 1848)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 3), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 3), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1848), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 3), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 1904)] = @tir.if_then_else(((((1 &lt;= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) &amp;&amp; (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1904), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          pad_temp.shared_1[(threadIdx.x_1 + 1960)] = @tir.if_then_else((((threadIdx.x_1 &lt; 49) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1960), 63)*49)) + ((floordiv(threadIdx.x_1, 7) + 1)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_2: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          kernel.shared_1: Buffer(kernel.shared, float32, [768], [], scope=&quot;shared&quot;)[threadIdx.x_2] = kernel[((((blockIdx.x*36864) + cse_var_1) + (threadIdx.x_2*3)) + rx.outer.outer)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          kernel.shared_1[(threadIdx.x_2 + 56)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 56), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 56), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          kernel.shared_1[(threadIdx.x_2 + 112)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 112), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 16), 96), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          kernel.shared_1[(threadIdx.x_2 + 168)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 168), 96)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 24), 32)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          kernel.shared_1[(threadIdx.x_2 + 224)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 224), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          kernel.shared_1[(threadIdx.x_2 + 280)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 280), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 88), 96), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          kernel.shared_1[(threadIdx.x_2 + 336)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 336), 96)*4608)) + cse_var_1) + (floormod((floordiv(threadIdx.x_2, 3) + 16), 32)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          kernel.shared_1[(threadIdx.x_2 + 392)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 392), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 8), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          kernel.shared_1[(threadIdx.x_2 + 448)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 448), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 96), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          kernel.shared_1[(threadIdx.x_2 + 504)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 504), 96)*4608)) + cse_var_1) + ((floordiv(threadIdx.x_2, 3) + 8)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          kernel.shared_1[(threadIdx.x_2 + 560)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 560), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 80), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          kernel.shared_1[(threadIdx.x_2 + 616)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 616), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 40), 96), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          kernel.shared_1[(threadIdx.x_2 + 672)] = kernel[(((((blockIdx.x*36864) + cse_var_1) + (threadIdx.x_2*3)) + rx.outer.outer) + 32256)]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 56;
+          if @tir.likely((threadIdx.x_2 &lt; 40), dtype=bool) {
+            kernel.shared_1[(threadIdx.x_2 + 728)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 728), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 56), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
           }
-        }
-        attr [IterVar(threadIdx.x_2: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 196;
-        if @tir.likely((threadIdx.x_2 &lt; 192), dtype=bool) {
-          kernel.shared_1: Buffer(kernel.shared, float32, [192], [], scope=&quot;shared&quot;)[threadIdx.x_2] = kernel[(((((blockIdx.x*73728) + (floordiv(threadIdx.x_2, 12)*4608)) + (rc.outer.outer*36)) + (floormod(threadIdx.x_2, 12)*3)) + rx.outer.outer)]
-        }
-        for (rc.outer.inner: int32, 0, 2) {
-          for (ry.outer.inner: int32, 0, 3) {
-            for (rc.inner: int32, 0, 2) {
-              for (ff.inner: int32, 0, 2) {
-                let cse_var_1: int32 = (ff.inner + 2)
-                 {
-                  conv2d_nchw_1[ff.inner] = (conv2d_nchw_1[ff.inner] + (pad_temp.shared_1[((((rc.outer.inner*126) + (rc.inner*63)) + (ry.outer.inner*7)) + floormod(threadIdx.x, 49))]*kernel.shared_1[(((((floordiv(threadIdx.x, 49)*24) + (ff.inner*12)) + (rc.outer.inner*6)) + (rc.inner*3)) + ry.outer.inner)]))
-                  conv2d_nchw_1[cse_var_1] = (conv2d_nchw_1[cse_var_1] + (pad_temp.shared_1[((((rc.outer.inner*126) + (rc.inner*63)) + (ry.outer.inner*7)) + floormod(threadIdx.x, 49))]*kernel.shared_1[((((((floordiv(threadIdx.x, 49)*24) + (ff.inner*12)) + (rc.outer.inner*6)) + (rc.inner*3)) + ry.outer.inner) + 96)]))
-                }
-              }
-            }
+          for (rc.outer.inner: int32, 0, 32) {
+            conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7))]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3))]))
+            conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 1)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3))]))
+            conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 2)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3))]))
+            conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 3)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3))]))
+            conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 4)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3))]))
+            conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 5)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3))]))
+            conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 6)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3))]))
+            conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 7)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 1)]))
+            conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 8)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 1)]))
+            conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 9)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 1)]))
+            conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 10)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 1)]))
+            conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 1)]))
+            conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 12)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 1)]))
+            conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 13)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 1)]))
+            conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 2)]))
+            conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 15)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 2)]))
+            conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 16)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 2)]))
+            conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 17)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 2)]))
+            conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 18)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 2)]))
+            conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 2)]))
+            conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*63) + (floormod(threadIdx.x, 7)*7)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*3)) + 2)]))
           }
         }
       }
     }
-    for (i1.inner: int32, 0, 2) {
-      compute[((((blockIdx.x*784) + (floordiv(threadIdx.x, 49)*98)) + (i1.inner*49)) + floormod(threadIdx.x, 49))] = max((conv2d_nchw_1[i1.inner] + bias[(((blockIdx.x*16) + (floordiv(threadIdx.x, 49)*2)) + i1.inner)]), 0f32)
-      compute[(((((blockIdx.x*784) + (floordiv(threadIdx.x, 49)*98)) + (i1.inner*49)) + floormod(threadIdx.x, 49)) + 392)] = max((conv2d_nchw_1[(i1.inner + 2)] + bias[((((blockIdx.x*16) + (floordiv(threadIdx.x, 49)*2)) + i1.inner) + 8)]), 0f32)
+    for (i3.inner: int32, 0, 7) {
+      compute[(((blockIdx.x*392) + (threadIdx.x*7)) + i3.inner)] = max((conv2d_nchw_1[i3.inner] + bias[((blockIdx.x*8) + floordiv(threadIdx.x, 7))]), 0f32)
     }
   }
 }
@@ -579,7 +687,7 @@ cooperative fetching, unrolling and operator fusion.</p>
 <span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.349 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.368 ms
 </pre></div>
 </div>
 </div>
@@ -608,20 +716,20 @@ conv2d_nchw_nn_o_i, conv2d_nchw_nn_i = s[conv2d_nchw].split(conv2d_nchw_nn, fact
 conv2d_nchw_nn_o_o_i, conv2d_nchw_nn_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_i, factor=1)
 conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1)
 conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
-conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=2)
+conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
 conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
-conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=4)
-conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=2)
+conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=8)
+conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
 conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
 conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
 conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7)
 conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
 conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
-conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
-conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
+conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
+conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
 conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
-conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
-conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=2)
+conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=1)
+conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=32)
 conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
 conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=3)
 conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
@@ -630,14 +738,14 @@ s[conv2d_nchw].reorder(conv2d_nchw_nn_o_o_o_o, conv2d_nchw_ff_o_o_o_o, conv2d_nc
 compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
 compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
 compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
-compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
-compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=4)
-compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=2)
+compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=1)
+compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=8)
+compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
 compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
 compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7)
 compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
-compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
-compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
+compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
+compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
 compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
 s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
 s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -657,14 +765,14 @@ s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread
 kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
 kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
 s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=196)
+kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=56)
 s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis(&quot;threadIdx.x&quot;))
 pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
 pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
 s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=196)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=56)
 s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis(&quot;threadIdx.x&quot;))
-s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, &quot;auto_unroll_max_step&quot;, 0)
+s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, &quot;auto_unroll_max_step&quot;, 64)
 s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, &quot;unroll_explicit&quot;, True)
 
 CUDA source code:
@@ -682,41 +790,100 @@ CUDA source code:
   #define int64_t long long
   #define uint64_t unsigned long long
 #endif
-extern &quot;C&quot; __global__ void __launch_bounds__(196) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
-  float conv2d_nchw[4];
-  __shared__ float pad_temp_shared[252];
-  __shared__ float kernel_shared[192];
-  for (int ff_inner_init = 0; ff_inner_init &lt; 2; ++ff_inner_init) {
-    conv2d_nchw[ff_inner_init] = 0.000000e+00f;
-    conv2d_nchw[(ff_inner_init + 2)] = 0.000000e+00f;
-  }
-  for (int rc_outer_outer = 0; rc_outer_outer &lt; 128; ++rc_outer_outer) {
+extern &quot;C&quot; __global__ void __launch_bounds__(56) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+  float conv2d_nchw[7];
+  __shared__ float pad_temp_shared[2016];
+  __shared__ float kernel_shared[768];
+  conv2d_nchw[0] = 0.000000e+00f;
+  conv2d_nchw[1] = 0.000000e+00f;
+  conv2d_nchw[2] = 0.000000e+00f;
+  conv2d_nchw[3] = 0.000000e+00f;
+  conv2d_nchw[4] = 0.000000e+00f;
+  conv2d_nchw[5] = 0.000000e+00f;
+  conv2d_nchw[6] = 0.000000e+00f;
+  for (int rc_outer_outer = 0; rc_outer_outer &lt; 16; ++rc_outer_outer) {
     for (int rx_outer_outer = 0; rx_outer_outer &lt; 3; ++rx_outer_outer) {
       __syncthreads();
-      for (int ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = 0; ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer &lt; 2; ++ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer) {
-        if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) / 28)) &lt; 9) {
-          pad_temp_shared[((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 196) + ((int)threadIdx.x))] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 196) + ((((ax0_ax1_fused_ax2 [...]
-        }
-      }
-      if (((int)threadIdx.x) &lt; 192) {
-        kernel_shared[((int)threadIdx.x)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((int)threadIdx.x) / 12) * 4608)) + (rc_outer_outer * 36)) + ((((int)threadIdx.x) % 12) * 3)) + rx_outer_outer)];
+      pad_temp_shared[((int)threadIdx.x)] = ((((7 &lt;= ((int)threadIdx.x)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((rc_outer_outer * 1568) + ((int)threadIdx.x)) + rx_outer_outer) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 56)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 8) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 8) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 56) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 8) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 112)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 7) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 7) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 112) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 7) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 168)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 6) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 6) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 168) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 224)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 5) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 5) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 224) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 280)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 4) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 4) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 280) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 4) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 336)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 3) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 3) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 336) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 3) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 392)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 2) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 2) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 392) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 448)] = ((((((int)threadIdx.x) &lt; 49) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 448) / 63) * 49)) + ((int)threadIdx.x)) + rx_outer_outer) - 1)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 504)] = ((((7 &lt;= ((int)threadIdx.x)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((rc_outer_outer * 1568) + ((int)threadIdx.x)) + rx_outer_outer) + 384)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 560)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 8) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 8) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 560) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 8) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 616)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 7) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 7) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 616) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 7) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 672)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 6) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 6) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 672) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 728)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 5) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 5) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 728) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 784)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 4) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 4) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 784) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 4) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 840)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 3) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 3) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 840) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 3) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 896)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 2) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 2) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 896) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 952)] = ((((((int)threadIdx.x) &lt; 49) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 952) / 63) * 49)) + ((int)threadIdx.x)) + rx_outer_outer) - 1)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 1008)] = ((((7 &lt;= ((int)threadIdx.x)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((rc_outer_outer * 1568) + ((int)threadIdx.x)) + rx_outer_outer) + 776)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 1064)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 8) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 8) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1064) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 8) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 1120)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 7) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 7) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1120) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 7) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 1176)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 6) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 6) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1176) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 1232)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 5) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 5) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1232) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 1288)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 4) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 4) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1288) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 4) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 1344)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 3) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 3) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1344) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 3) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 1400)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 2) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 2) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1400) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 1456)] = ((((((int)threadIdx.x) &lt; 49) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1456) / 63) * 49)) + ((int)threadIdx.x)) + rx_outer_outer) - 1)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 1512)] = ((((7 &lt;= ((int)threadIdx.x)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((rc_outer_outer * 1568) + ((int)threadIdx.x)) + rx_outer_outer) + 1168)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 1568)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 8) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 8) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1568) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 8) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 1624)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 7) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 7) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1624) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 7) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 1680)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 6) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 6) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1680) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 1736)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 5) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 5) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1736) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 1792)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 4) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 4) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1792) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 4) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 1848)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 3) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 3) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1848) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 3) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 1904)] = (((((1 &lt;= (((((int)threadIdx.x) / 7) + 2) % 9)) &amp;&amp; ((((((int)threadIdx.x) / 7) + 2) % 9) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1904) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) + 1960)] = ((((((int)threadIdx.x) &lt; 49) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1960) / 63) * 49)) + ((int)threadIdx.x)) + rx_outer_outer) - 1)] : 0.000000e+00f);
+      kernel_shared[((int)threadIdx.x)] = kernel[((((((int)blockIdx.x) * 36864) + (rc_outer_outer * 288)) + (((int)threadIdx.x) * 3)) + rx_outer_outer)];
+      kernel_shared[(((int)threadIdx.x) + 56)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 56) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 56) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+      kernel_shared[(((int)threadIdx.x) + 112)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 112) / 96) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 16) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+      kernel_shared[(((int)threadIdx.x) + 168)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 168) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) / 3) + 24) &amp; 31) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+      kernel_shared[(((int)threadIdx.x) + 224)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 224) / 96) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 32) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+      kernel_shared[(((int)threadIdx.x) + 280)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 280) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 88) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+      kernel_shared[(((int)threadIdx.x) + 336)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 336) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) / 3) + 16) &amp; 31) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+      kernel_shared[(((int)threadIdx.x) + 392)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 392) / 96) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 8) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+      kernel_shared[(((int)threadIdx.x) + 448)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 448) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 64) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+      kernel_shared[(((int)threadIdx.x) + 504)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 504) / 96) * 4608)) + (rc_outer_outer * 288)) + (((int)threadIdx.x) * 3)) + rx_outer_outer) + 72)];
+      kernel_shared[(((int)threadIdx.x) + 560)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 560) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 80) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+      kernel_shared[(((int)threadIdx.x) + 616)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 616) / 96) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 40) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+      kernel_shared[(((int)threadIdx.x) + 672)] = kernel[(((((((int)blockIdx.x) * 36864) + (rc_outer_outer * 288)) + (((int)threadIdx.x) * 3)) + rx_outer_outer) + 32256)];
+      if (((int)threadIdx.x) &lt; 40) {
+        kernel_shared[(((int)threadIdx.x) + 728)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 728) / 96) * 4608)) + (rc_outer_outer * 288)) + (((((int)threadIdx.x) + 56) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
       }
       __syncthreads();
-      for (int rc_outer_inner = 0; rc_outer_inner &lt; 2; ++rc_outer_inner) {
-        for (int ry_outer_inner = 0; ry_outer_inner &lt; 3; ++ry_outer_inner) {
-          for (int rc_inner = 0; rc_inner &lt; 2; ++rc_inner) {
-            for (int ff_inner = 0; ff_inner &lt; 2; ++ff_inner) {
-              conv2d_nchw[ff_inner] = (conv2d_nchw[ff_inner] + (pad_temp_shared[((((rc_outer_inner * 126) + (rc_inner * 63)) + (ry_outer_inner * 7)) + (((int)threadIdx.x) % 49))] * kernel_shared[((((((((int)threadIdx.x) / 49) * 24) + (ff_inner * 12)) + (rc_outer_inner * 6)) + (rc_inner * 3)) + ry_outer_inner)]));
-              conv2d_nchw[(ff_inner + 2)] = (conv2d_nchw[(ff_inner + 2)] + (pad_temp_shared[((((rc_outer_inner * 126) + (rc_inner * 63)) + (ry_outer_inner * 7)) + (((int)threadIdx.x) % 49))] * kernel_shared[(((((((((int)threadIdx.x) / 49) * 24) + (ff_inner * 12)) + (rc_outer_inner * 6)) + (rc_inner * 3)) + ry_outer_inner) + 96)]));
-            }
-          }
-        }
+      for (int rc_outer_inner = 0; rc_outer_inner &lt; 32; ++rc_outer_inner) {
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7))] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3))]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 1)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3))]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3))]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3))]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 4)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3))]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 5)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3))]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 6)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3))]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 7)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 1)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 8)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 1)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 9)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 1)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 1)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 1)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 12)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 1)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 13)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 1)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 2)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 15)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 2)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 16)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 2)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 17)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 2)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 2)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 2)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 63) + ((((int)threadIdx.x) % 7) * 7)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 3)) + 2)]));
       }
     }
   }
-  for (int i1_inner = 0; i1_inner &lt; 2; ++i1_inner) {
-    compute[((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 49) * 98)) + (i1_inner * 49)) + (((int)threadIdx.x) % 49))] = max((conv2d_nchw[i1_inner] + bias[(((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 49) * 2)) + i1_inner)]), 0.000000e+00f);
-    compute[(((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 49) * 98)) + (i1_inner * 49)) + (((int)threadIdx.x) % 49)) + 392)] = max((conv2d_nchw[(i1_inner + 2)] + bias[((((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 49) * 2)) + i1_inner) + 8)]), 0.000000e+00f);
+  for (int i3_inner = 0; i3_inner &lt; 7; ++i3_inner) {
+    compute[(((((int)blockIdx.x) * 392) + (((int)threadIdx.x) * 7)) + i3_inner)] = max((conv2d_nchw[i3_inner] + bias[((((int)blockIdx.x) * 8) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
   }
 }
 </pre></div>
@@ -753,7 +920,7 @@ In the example below we resume the status and do more 5 trials.</p>
 Get devices for measurement successfully!
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 5 minutes  33.921 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 5 minutes  43.674 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/e3e540f3b477c0c52d8eb73e674e8ffd/tune_conv2d_layer_cuda.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_conv2d_layer_cuda.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
index 6f8c831baf..6bf4ed36de 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
@@ -915,7 +915,7 @@ so we can read the log file and load the best schedules.</p>
 Evaluate inference time cost...
 Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-   8.2136       8.2150       8.2171       8.2088       0.0035
+   8.1730       8.1716       8.1796       8.1679       0.0049
 </pre></div>
 </div>
 </div>
@@ -937,7 +937,7 @@ to learn how to use the RPC Tracker and RPC Server.
 To use the RPC Tracker in auto-scheduler, replace the runner in <code class="code docutils literal notranslate"><span class="pre">TuningOptions</span></code>
 with <a class="reference internal" href="../../reference/api/python/auto_scheduler.html#tvm.auto_scheduler.RPCRunner" title="tvm.auto_scheduler.RPCRunner"><code class="xref any py py-class docutils literal notranslate"><span class="pre">auto_scheduler.RPCRunner</span></code></a>.</p></li>
 </ol>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  4.112 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  3.505 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-network-cuda-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/eafe360d52540634c9eea0fa89e804bd/tune_network_cuda.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_network_cuda.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
index a7961a42c7..e04eea0c5d 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
@@ -934,7 +934,7 @@ so we can read the log file and load the best schedules.</p>
 Evaluate inference time cost...
 Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  760.2186     760.4327     761.2765     758.9464      0.9632
+  755.8821     755.1885     757.4651     754.9928      1.1222
 </pre></div>
 </div>
 </div>
@@ -956,7 +956,7 @@ to learn how to use the RPC Tracker and RPC Server.
 To use the RPC Tracker in auto-scheduler, replace the runner in <code class="code docutils literal notranslate"><span class="pre">TuningOptions</span></code>
 with <a class="reference internal" href="../../reference/api/python/auto_scheduler.html#tvm.auto_scheduler.RPCRunner" title="tvm.auto_scheduler.RPCRunner"><code class="xref any py py-class docutils literal notranslate"><span class="pre">auto_scheduler.RPCRunner</span></code></a>.</p></li>
 </ol>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  33.757 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  32.818 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-network-x86-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/e416b94ca1090b0897c0f6e0df95b911/tune_network_x86.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_network_x86.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
index b4408b759f..39889c2544 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
@@ -632,13 +632,13 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
              placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [65536], []),
              compute: Buffer(compute_2: Pointer(float32), float32, [65536], [])}
   buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute}
-  preflattened_buffer_map = {placeholder_8: placeholder_15: Buffer(placeholder_13, int32, [33], []), placeholder_7: placeholder_16: Buffer(placeholder_12, int32, [4916], []), placeholder_5: placeholder_17: Buffer(placeholder_10, float32, [128, 256], []), placeholder_9: placeholder_18: Buffer(placeholder_14, float32, [128, 512], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_6: placeholder_19: Buffer(placeholder_11, float32, [4916, 16, 1], [])} {
-  for (i0.outer.i1.outer.fused: int32, 0, 512) &quot;parallel&quot; {
-    allocate(compute_4: Pointer(global float32), float32, [128]), storage_scope = global {
-      for (i.inner.init: int32, 0, 8) {
+  preflattened_buffer_map = {compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_6: placeholder_15: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_5: placeholder_16: Buffer(placeholder_10, float32, [128, 256], []), placeholder_7: placeholder_17: Buffer(placeholder_12, int32, [4916], []), placeholder_8: placeholder_18: Buffer(placeholder_13, int32, [33], []), placeholder_9: placeholder_19: Buffer(placeholder_14, float32, [128, 512], [])} {
+  for (i0.outer.i1.outer.fused: int32, 0, 64) &quot;parallel&quot; {
+    allocate(compute_4: Pointer(global float32), float32, [1024]), storage_scope = global {
+      for (i.inner.init: int32, 0, 64) {
         let cse_var_1: int32 = (i.inner.init*16)
          {
-          compute_5: Buffer(compute_4, float32, [128], [])[cse_var_1] = 0f32
+          compute_5: Buffer(compute_4, float32, [1024], [])[cse_var_1] = 0f32
           compute_5[(cse_var_1 + 1)] = 0f32
           compute_5[(cse_var_1 + 2)] = 0f32
           compute_5[(cse_var_1 + 3)] = 0f32
@@ -657,78 +657,78 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
         }
       }
       for (elem_idx: int32, 0, let cse_var_2: int32 = floormod(i0.outer.i1.outer.fused, 32) in (placeholder_3[(cse_var_2 + 1)] - placeholder_3[cse_var_2])) {
-        for (i.inner: int32, 0, 8) {
+        for (i.inner: int32, 0, 64) {
           let cse_var_3: int32 = floormod(i0.outer.i1.outer.fused, 32)
            {
             if @tir.likely((elem_idx &lt; (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
               let cse_var_4: int32 = (i.inner*16)
-              compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[((placeholder_3[cse_var_3]*16) + (elem_idx*16))]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+              compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[((placeholder_3[cse_var_3]*16) + (elem_idx*16))]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
             }
             if @tir.likely((elem_idx &lt; (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
               let cse_var_5: int32 = ((i.inner*16) + 1)
-              compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 1)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+              compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 1)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
             }
             if @tir.likely((elem_idx &lt; (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
               let cse_var_6: int32 = ((i.inner*16) + 2)
-              compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 2)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+              compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 2)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
             }
             if @tir.likely((elem_idx &lt; (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
               let cse_var_7: int32 = ((i.inner*16) + 3)
-              compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 3)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+              compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 3)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
             }
             if @tir.likely((elem_idx &lt; (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
               let cse_var_8: int32 = ((i.inner*16) + 4)
-              compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 4)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+              compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 4)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
             }
             if @tir.likely((elem_idx &lt; (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
               let cse_var_9: int32 = ((i.inner*16) + 5)
-              compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 5)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+              compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 5)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
             }
             if @tir.likely((elem_idx &lt; (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
               let cse_var_10: int32 = ((i.inner*16) + 6)
-              compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 6)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+              compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 6)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
             }
             if @tir.likely((elem_idx &lt; (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
               let cse_var_11: int32 = ((i.inner*16) + 7)
-              compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 7)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+              compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 7)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
             }
             if @tir.likely((elem_idx &lt; (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
               let cse_var_12: int32 = ((i.inner*16) + 8)
-              compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 8)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+              compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 8)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
             }
             if @tir.likely((elem_idx &lt; (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
               let cse_var_13: int32 = ((i.inner*16) + 9)
-              compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 9)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+              compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 9)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
             }
             if @tir.likely((elem_idx &lt; (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
               let cse_var_14: int32 = ((i.inner*16) + 10)
-              compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 10)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+              compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 10)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
             }
             if @tir.likely((elem_idx &lt; (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
               let cse_var_15: int32 = ((i.inner*16) + 11)
-              compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 11)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+              compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 11)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
             }
             if @tir.likely((elem_idx &lt; (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
               let cse_var_16: int32 = ((i.inner*16) + 12)
-              compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 12)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+              compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 12)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
             }
             if @tir.likely((elem_idx &lt; (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
               let cse_var_17: int32 = ((i.inner*16) + 13)
-              compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 13)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+              compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 13)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
             }
             if @tir.likely((elem_idx &lt; (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
               let cse_var_18: int32 = ((i.inner*16) + 14)
-              compute_5[cse_var_18] = (compute_5[cse_var_18] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 14)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+              compute_5[cse_var_18] = (compute_5[cse_var_18] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 14)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
             }
             if @tir.likely((elem_idx &lt; (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
               let cse_var_19: int32 = ((i.inner*16) + 15)
-              compute_5[cse_var_19] = (compute_5[cse_var_19] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 15)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*2048) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+              compute_5[cse_var_19] = (compute_5[cse_var_19] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 15)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
             }
           }
         }
       }
-      for (i0.inner: int32, 0, 8) {
-        let cse_var_20: int32 = (((floordiv(i0.outer.i1.outer.fused, 32)*4096) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 32)*16))
+      for (i0.inner: int32, 0, 64) {
+        let cse_var_20: int32 = (((floordiv(i0.outer.i1.outer.fused, 32)*32768) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 32)*16))
         compute[ramp(cse_var_20, 1, 16)] = max((compute_5[ramp((i0.inner*16), 1, 16)] + placeholder_4[ramp(cse_var_20, 1, 16)]), broadcast(0f32, 16))
       }
     }
@@ -767,7 +767,7 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
 <span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.868 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.844 ms
 </pre></div>
 </div>
 <div class="admonition note">
diff --git a/docs/how_to/tune_with_autotvm/sg_execution_times.html b/docs/how_to/tune_with_autotvm/sg_execution_times.html
index dd98ca8fee..6a12e6e0e3 100644
--- a/docs/how_to/tune_with_autotvm/sg_execution_times.html
+++ b/docs/how_to/tune_with_autotvm/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-tune-with-autotvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:37.796</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
+<p><strong>00:42.071</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 84%" />
@@ -349,11 +349,11 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-conv2d-cuda-py"><span class="std std-ref">Tuning High Performance Convolution on NVIDIA GPUs</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_cuda.py</span></code>)</p></td>
-<td><p>00:37.760</p></td>
+<td><p>00:42.036</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tune_relay_x86.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-x86-py"><span class="std std-ref">Auto-tuning a Convolutional Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_x86.py</span></code>)</p></td>
-<td><p>00:00.021</p></td>
+<td><p>00:00.020</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_relay_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-cuda-py"><span class="std std-ref">Auto-tuning a Convolutional Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_cuda.py</span></code>)</p></td>
diff --git a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
index 0eb04579c7..90666fa30f 100644
--- a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
+++ b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
@@ -567,7 +567,9 @@ for this template</p>
 waiting for device...
 device available
 Get devices for measurement successfully!
-No: 1   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
+No: 1   GFLOPS: 1.43/1.43       result: MeasureResult(costs=(0.16210441574999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=9.74929690361023, timestamp=1667866553.1932828)  [(&#39;tile_f&#39;, [-1, 32, 4, 2]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 1, 1]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9682937
+No: 2   GFLOPS: 148.64/148.64   result: MeasureResult(costs=(0.0015574340776699027,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.8080589771270752, timestamp=1667866554.1134543)      [(&#39;tile_f&#39;, [-1, 1, 16, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 4, 16]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,1677314
+No: 3   GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -689,8 +691,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 8, 8, 2]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 1, 512]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 1)],None,8128642
-No: 2   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 2, 32, 1]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 64, 2]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,1027441
+No: 4   GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -812,8 +814,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 16, 4, 4]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 64, 4]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,10348919
-No: 3   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 2, 64]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 1, 64]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,5775206
+No: 5   GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -935,9 +937,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 16, 8, 2]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 1, 512]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 1)],None,7549163
-No: 4   GFLOPS: 248.27/248.27   result: MeasureResult(costs=(0.0009324433863636363,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.7833104133605957, timestamp=1667865851.5936587)      [(&#39;tile_f&#39;, [-1, 1, 64, 2]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 2, 4]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,653714
-No: 5   GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 32, 4, 4]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 2, 4]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2203420
+No: 6   GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1059,9 +1060,9 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 2, 8, 8]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 1, 512]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,6581895
-No: 6   GFLOPS: 108.35/248.27   result: MeasureResult(costs=(0.002136543255319149,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.1016724109649658, timestamp=1667865852.8906536)       [(&#39;tile_f&#39;, [-1, 2, 1, 2]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 2, 8]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,293536
-No: 7   GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 64, 8]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 8, 32]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,6348043
+No: 7   GFLOPS: 50.89/148.64    result: MeasureResult(costs=(0.004549476272727272,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.3591265678405762, timestamp=1667866558.40512) [(&#39;tile_f&#39;, [-1, 4, 32, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 2, 1]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2326762
+No: 8   GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1183,8 +1184,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 8, 16]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 32, 2]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2183021
-No: 8   GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 512, 1, 1]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 1, 512]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9292369
+No: 9   GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1306,8 +1307,26 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 8, 1, 2]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 8, 64]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,5592898
-No: 9   GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 128, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 1, 512]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2710231
+No: 10  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
+  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 142, in build
+    res = future.result()
+  File &quot;/usr/lib/python3.7/concurrent/futures/_base.py&quot;, line 435, in result
+    return self.__get_result()
+  File &quot;/usr/lib/python3.7/concurrent/futures/_base.py&quot;, line 384, in __get_result
+    raise self._exception
+  File &quot;/usr/lib/python3.7/concurrent/futures/thread.py&quot;, line 57, in run
+    result = self.fn(*self.args, **self.kwargs)
+  File &quot;/workspace/python/tvm/contrib/popen_pool.py&quot;, line 432, in &lt;lambda&gt;
+    worker = lambda *args: self._worker_run(*args)
+  File &quot;/workspace/python/tvm/contrib/popen_pool.py&quot;, line 401, in _worker_run
+    return proc.recv()
+  File &quot;/workspace/python/tvm/contrib/popen_pool.py&quot;, line 309, in recv
+    raise TimeoutError()
+TimeoutError
+
+        [(&#39;tile_f&#39;, [-1, 16, 2, 4]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 4, 2]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 1)],None,7400252
+No: 11  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1429,8 +1448,9 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 128, 1, 2]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 1, 8]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2612342
-No: 10  GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 2, 16]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 8, 4]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,10339952
+No: 12  GFLOPS: 86.81/148.64    result: MeasureResult(costs=(0.0026666176739130435,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.2049615383148193, timestamp=1667866569.3942392)      [(&#39;tile_f&#39;, [-1, 1, 2, 2]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 1, 1]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,5228584
+No: 13  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1552,8 +1572,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 2, 64]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 16, 2]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9344264
-No: 11  GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 64, 2, 4]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 32, 2]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,55114
+No: 14  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1675,8 +1695,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 2, 16, 4]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 128, 4]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,4160547
-No: 12  GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 8, 4]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 256, 1]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 1)],None,7193023
+No: 15  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1798,8 +1818,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 8, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 16, 32]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,5575709
-No: 13  GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 64, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 1, 512]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,3096765
+No: 16  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1921,8 +1941,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 512, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 1, 128]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,3076534
-No: 14  GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 128, 2, 2]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 2, 64]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,3648331
+No: 17  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -2044,161 +2064,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 32, 4]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 4, 64]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,10235852
-No: 15  GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
-  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 738, in __call__
-    yield remote, remote.load_module(os.path.split(build_result.filename)[1])
-  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 702, in run_through_rpc
-    costs = time_f(*args).results
-  File &quot;/workspace/python/tvm/runtime/module.py&quot;, line 357, in evaluator
-    blob = feval(*args)
-  File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
-  File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 262, in tvm._ffi._cy3.core.FuncCall
-  File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 251, in tvm._ffi._cy3.core.FuncCall3
-  File &quot;tvm/_ffi/_cython/./base.pxi&quot;, line 181, in tvm._ffi._cy3.core.CHECK_CALL
-tvm._ffi.base.TVMError: Traceback (most recent call last):
-  4: TVMFuncCall
-        at ../src/runtime/c_runtime_api.cc:477
-  3: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
-        at ../include/tvm/runtime/packed_func.h:1217
-  2: tvm::runtime::RPCWrappedFunc::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
-        at ../src/runtime/rpc/rpc_module.cc:129
-  1: tvm::runtime::RPCClientSession::CallFunc(void*, TVMValue const*, int const*, int, std::function&lt;void (tvm::runtime::TVMArgs)&gt; const&amp;)
-        at ../src/runtime/rpc/rpc_endpoint.cc:1012
-  0: tvm::runtime::RPCEndpoint::CallFunc(void*, TVMValue const*, int const*, int, std::function&lt;void (tvm::runtime::TVMArgs)&gt;)
-        at ../src/runtime/rpc/rpc_endpoint.cc:804
-  File &quot;../src/runtime/rpc/rpc_endpoint.cc&quot;, line 804
-TVMError:
----------------------------------------------------------------
-An error occurred during the execution of TVM.
-For more information, please see: https://tvm.apache.org/docs/errors.html
----------------------------------------------------------------
-  Check failed: (code == RPCCode::kReturn) is false: code=kShutdown
-
-During handling of the above exception, another exception occurred:
-
-Traceback (most recent call last):
-  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 702, in run_through_rpc
-    costs = time_f(*args).results
-  File &quot;/usr/lib/python3.7/contextlib.py&quot;, line 130, in __exit__
-    self.gen.throw(type, value, traceback)
-  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 742, in __call__
-    remote.remove(build_result.filename)
-  File &quot;/workspace/python/tvm/rpc/client.py&quot;, line 144, in remove
-    self._remote_funcs[&quot;remove&quot;] = self.get_function(&quot;tvm.rpc.server.remove&quot;)
-  File &quot;/workspace/python/tvm/rpc/client.py&quot;, line 72, in get_function
-    return self._sess.get_function(name)
-  File &quot;/workspace/python/tvm/runtime/module.py&quot;, line 171, in get_function
-    self.handle, c_str(name), ctypes.c_int(query_imports), ctypes.byref(ret_handle)
-  File &quot;/workspace/python/tvm/_ffi/base.py&quot;, line 348, in check_call
-    raise get_last_ffi_error()
-tvm._ffi.base.TVMError: Traceback (most recent call last):
-  52: 0xffffffffffffffff
-  51: _start
-  50: __libc_start_main
-  49: _Py_UnixMain
-  48: 0x0000000000650da0
-  47: 0x0000000000650afa
-  46: _PyFunction_FastCallDict
-  45: _PyEval_EvalCodeWithName
-  44: _PyEval_EvalFrameDefault
-  43: _PyFunction_FastCallKeywords
-  42: _PyEval_EvalCodeWithName
-  41: _PyEval_EvalFrameDefault
-  40: _PyMethodDef_RawFastCallKeywords
-  39: 0x0000000000546369
-  38: _PyEval_EvalCodeWithName
-  37: _PyEval_EvalFrameDefault
-  36: _PyFunction_FastCallKeywords
-  35: _PyEval_EvalCodeWithName
-  34: _PyEval_EvalFrameDefault
-  33: _PyFunction_FastCallDict
-  32: _PyEval_EvalCodeWithName
-  31: _PyEval_EvalFrameDefault
-  30: _PyObject_FastCallDict
-  29: 0x00000000004c06e1
-  28: _PyFunction_FastCallDict
-  27: _PyEval_EvalFrameDefault
-  26: _PyMethodDescr_FastCallKeywords
-  25: 0x00000000005dcb58
-  24: 0x00000000005dc83f
-  23: 0x00000000004ba127
-  22: _PyEval_EvalFrameDefault
-  21: _PyFunction_FastCallKeywords
-  20: _PyEval_EvalFrameDefault
-  19: _PyFunction_FastCallKeywords
-  18: _PyEval_EvalFrameDefault
-  17: _PyFunction_FastCallKeywords
-  16: _PyEval_EvalCodeWithName
-  15: _PyEval_EvalFrameDefault
-  14: 0x0000000000537c30
-  13: _PyObject_FastCallKeywords
-  12: 0x00007f8879914fa2
-  11: _ctypes_callproc
-  10: ffi_call
-  9: ffi_call_unix64
-  8: TVMModGetFunction
-        at ../src/runtime/c_runtime_api.cc:408
-  7: tvm::runtime::ModuleNode::GetFunction(std::__cxx11::basic_string&lt;char, std::char_traits&lt;char&gt;, std::allocator&lt;char&gt; &gt; const&amp;, bool)
-        at ../src/runtime/module.cc:66
-  6: tvm::runtime::RPCModuleNode::GetFunction(std::__cxx11::basic_string&lt;char, std::char_traits&lt;char&gt;, std::allocator&lt;char&gt; &gt; const&amp;, tvm::runtime::ObjectPtr&lt;tvm::runtime::Object&gt; const&amp;)
-        at ../src/runtime/rpc/rpc_module.cc:185
-  5: tvm::runtime::RPCClientSession::GetFunction(std::__cxx11::basic_string&lt;char, std::char_traits&lt;char&gt;, std::allocator&lt;char&gt; &gt; const&amp;)
-        at ../src/runtime/rpc/rpc_endpoint.cc:1007
-  4: tvm::runtime::TVMRetValue tvm::runtime::RPCEndpoint::SysCallRemote&lt;std::__cxx11::basic_string&lt;char, std::char_traits&lt;char&gt;, std::allocator&lt;char&gt; &gt; const&amp;&gt;(tvm::runtime::RPCCode, std::__cxx11::basic_string&lt;char, std::char_traits&lt;char&gt;, std::allocator&lt;char&gt; &gt; const&amp;)
-        at ../src/runtime/rpc/rpc_endpoint.h:223
-  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()&lt;int, std::__cxx11::basic_string&lt;char, std::char_traits&lt;char&gt;, std::allocator&lt;char&gt; &gt; const&amp;&gt;(int&amp;&amp;, std::__cxx11::basic_string&lt;char, std::char_traits&lt;char&gt;, std::allocator&lt;char&gt; &gt; const&amp;) const
-        at ../include/tvm/runtime/packed_func.h:1618
-  2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
-        at ../include/tvm/runtime/packed_func.h:1217
-  1: Call
-        at ../include/tvm/runtime/packed_func.h:1213
-  0: operator()
-        at ../src/runtime/rpc/rpc_endpoint.cc:684
-  File &quot;../src/runtime/rpc/rpc_endpoint.cc&quot;, line 684
-TVMError:
----------------------------------------------------------------
-An error occurred during the execution of TVM.
-For more information, please see: https://tvm.apache.org/docs/errors.html
----------------------------------------------------------------
-  Check failed: (code == RPCCode::kReturn) is false: code=1
-
-Traceback (most recent call last):
-  52: 0xffffffffffffffff
-  51: _start
-  50: __libc_start_main
-  49: _Py_UnixMain
-  48: 0x0000000000650da0
-  47: 0x0000000000650afa
-  46: _PyFunction_FastCallDict
-  45: _PyEval_EvalCodeWithName
-  44: _PyEval_EvalFrameDefault
-  43: _PyFunction_FastCallKeywords
-  42: _PyEval_EvalCodeWithName
-  41: _PyEval_EvalFrameDefault
-  40: _PyMethodDef_RawFastCallKeywords
-  39: 0x0000000000546369
-  38: _PyEval_EvalCodeWithName
-  37: _PyEval_EvalFrameDefault
-  36: _PyFunction_FastCallKeywords
-  35: _PyEval_EvalCodeWithName
-  34: _PyEval_EvalFrameDefault
-  33: _PyFunction_FastCallDict
-  32: _PyEval_EvalCodeWithName
-  31: _PyEval_EvalFrameDefault
-  30: _PyObject_FastCallDict
-  29: 0x00000000004c06e1
-  28: _PyFunction_FastCallDict
-  27: _PyEval_EvalFrameDefault
-  26: _PyMethodDescr_FastCallKeywords
-  25: 0x00000000005dcb58
-  24: 0x00000000005dc83f
-  23: 0x00000000004ba127
-  22: _PyEval_EvalFrameDefault
-  21: _PyFunction_FastCallKeywords
-  20: _PyEval_EvalFrameDefault
-  19: _PyFunction_FastCall      [(&#39;tile_f&#39;, [-1, 2, 1, 8]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 1, 8]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 1)],None,7068077
-No: 16  GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 64, 2, 1]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 8, 64]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,4430156
+No: 18  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -2320,9 +2187,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 2, 64, 4]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 32, 1]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9120014
-No: 17  GFLOPS: 73.22/248.27    result: MeasureResult(costs=(0.0031619162285714286,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.0219228267669678, timestamp=1667865870.0502481)      [(&#39;tile_f&#39;, [-1, 4, 16, 4]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 2, 4]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,4137008
-No: 18  GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 16, 1, 2]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 128, 1]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 1)],None,7962299
+No: 19  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -2444,9 +2310,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 128, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 8, 8]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,4947191
-No: 19  GFLOPS: 182.70/248.27   result: MeasureResult(costs=(0.001267116582278481,), error_no=MeasureErrorNo.NO_ERROR, all_cost=7.761154413223267, timestamp=1667865870.6874251)        [(&#39;tile_f&#39;, [-1, 1, 2, 8]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 64, 1]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9122443
-No: 20  GFLOPS: 0.00/248.27     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 32, 8, 2]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 8, 1]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2142884
+No: 20  GFLOPS: 0.00/148.64     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -2568,7 +2433,7 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 8, 1, 64]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 2, 256]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,190063
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 32, 8]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 4, 16]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 1)],None,8647701
 </pre></div>
 </div>
 <p>Finally we can inspect the best config from log file, check correctness,
@@ -2607,9 +2472,9 @@ and measure running time.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Finish loading 20 records
 
 Best config:
-[(&#39;tile_f&#39;, [-1, 1, 64, 2]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 2, 4]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,653714
+[(&#39;tile_f&#39;, [-1, 1, 16, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 4, 16]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,1677314
 Finish loading 20 records
-Time cost of this operator: 0.001150
+Time cost of this operator: 0.001912
 </pre></div>
 </div>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autotvm-tune-conv2d-cuda-py">
diff --git a/docs/how_to/work_with_microtvm/micro_autotune.html b/docs/how_to/work_with_microtvm/micro_autotune.html
index 2865f724db..d060733a5e 100644
--- a/docs/how_to/work_with_microtvm/micro_autotune.html
+++ b/docs/how_to/work_with_microtvm/micro_autotune.html
@@ -595,10 +595,10 @@ the tuned operator.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>########## Build without Autotuning ##########
 Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  Measurements(us)
 ---------                                     ---                                           --------  -------  -----              ------  -------  ----------------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  310.4     98.711   (1, 2, 10, 10, 3)  2       1        [310.4]
-tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.087     0.982    (1, 6, 10, 10)     1       1        [3.087]
-tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.966     0.307    (1, 1, 10, 10, 3)  1       1        [0.966]
-Total_time                                    -                                             314.453   -        -                  -       -        -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  311.3     98.729   (1, 2, 10, 10, 3)  2       1        [311.3]
+tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.017     0.957    (1, 6, 10, 10)     1       1        [3.017]
+tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.991     0.314    (1, 1, 10, 10, 3)  1       1        [0.991]
+Total_time                                    -                                             315.308   -        -                  -       -        -
 </pre></div>
 </div>
 </div>
@@ -649,10 +649,10 @@ Total_time                                    -
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>########## Build with Autotuning ##########
 Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  Measurements(us)
 ---------                                     ---                                           --------  -------  -----              ------  -------  ----------------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  104.9     97.537   (1, 6, 10, 10, 1)  2       1        [104.9]
-tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.795     1.669    (1, 6, 10, 10)     1       1        [1.795]
-tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.854     0.794    (1, 3, 10, 10, 1)  1       1        [0.854]
-Total_time                                    -                                             107.549   -        -                  -       -        -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  104.8     97.581   (1, 6, 10, 10, 1)  2       1        [104.8]
+tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.759     1.638    (1, 6, 10, 10)     1       1        [1.759]
+tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.839     0.782    (1, 3, 10, 10, 1)  1       1        [0.839]
+Total_time                                    -                                             107.398   -        -                  -       -        -
 </pre></div>
 </div>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-autotune-py">
diff --git a/docs/how_to/work_with_microtvm/micro_train.html b/docs/how_to/work_with_microtvm/micro_train.html
index d75464cfc8..e510fab696 100644
--- a/docs/how_to/work_with_microtvm/micro_train.html
+++ b/docs/how_to/work_with_microtvm/micro_train.html
@@ -529,7 +529,7 @@ take about <strong>2 minutes</strong> to download the Stanford Cars, while COCO
 <a href="https://docs.python.org/3/library/shutil.html#shutil.move" title="shutil.move" class="sphx-glr-backref-module-shutil sphx-glr-backref-type-py-function"><span class="n">shutil</span><span class="o">.</span><span class="n">move</span></a><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><a href="https://docs.python.org/3/library/stdtypes.html#str" title="builtins.str" class="sphx-glr-backref-module-builtins sphx-glr-backref-typ [...]
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>&#39;/tmp/tmp1s4ra1nk/images/random&#39;
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>&#39;/tmp/tmpc_z4g3ef/images/random&#39;
 </pre></div>
 </div>
 </div>
@@ -589,8 +589,8 @@ objects to other stuff? We can display some examples from our datasets using <co
     <span class="n">plt</span><span class="o">.</span><span class="n">axis</span><span class="p">(</span><span class="s2">&quot;off&quot;</span><span class="p">)</span>
 </pre></div>
 </div>
-<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmp1s4ra1nk/images/target contains 8144 images
-/tmp/tmp1s4ra1nk/images/random contains 5000 images
+<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmpc_z4g3ef/images/target contains 8144 images
+/tmp/tmpc_z4g3ef/images/random contains 5000 images
 </pre></div>
 </div>
 </div>
@@ -702,13 +702,13 @@ the time on our validation set).</p>
 </pre></div>
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Epoch 1/3
-328/328 - 48s - loss: 0.2227 - accuracy: 0.9201 - val_loss: 0.1678 - val_accuracy: 0.9411 - 48s/epoch - 147ms/step
+328/328 - 47s - loss: 0.2142 - accuracy: 0.9254 - val_loss: 0.1351 - val_accuracy: 0.9539 - 47s/epoch - 142ms/step
 Epoch 2/3
-328/328 - 43s - loss: 0.1024 - accuracy: 0.9609 - val_loss: 0.1075 - val_accuracy: 0.9573 - 43s/epoch - 132ms/step
+328/328 - 43s - loss: 0.0911 - accuracy: 0.9666 - val_loss: 0.1195 - val_accuracy: 0.9622 - 43s/epoch - 132ms/step
 Epoch 3/3
-328/328 - 43s - loss: 0.0729 - accuracy: 0.9735 - val_loss: 0.1108 - val_accuracy: 0.9600 - 43s/epoch - 132ms/step
+328/328 - 43s - loss: 0.0675 - accuracy: 0.9757 - val_loss: 0.1146 - val_accuracy: 0.9607 - 43s/epoch - 131ms/step
 
-&lt;keras.callbacks.History object at 0x7fd5ccb95e10&gt;
+&lt;keras.callbacks.History object at 0x7fb56c504290&gt;
 </pre></div>
 </div>
 </div>
@@ -970,7 +970,7 @@ as intended.</p>
 <p>From here, we could modify the model to read live images from the camera - we have another
 Arduino tutorial for how to do that <a class="reference external" href="https://github.com/guberti/tvm-arduino-demos/tree/master/examples/person_detection">on GitHub</a>. Alternatively, we could also
 <a class="reference external" href="https://tvm.apache.org/docs/how_to/work_with_microtvm/micro_autotune.html">use TVM’s autotuning capabilities</a> to dramatically improve the model’s performance.</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 4 minutes  56.709 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 4 minutes  54.885 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-train-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/b52cec46baf4f78d6bcd94cbe269c8a6/micro_train.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">micro_train.py</span></code></a></p>
diff --git a/docs/how_to/work_with_microtvm/sg_execution_times.html b/docs/how_to/work_with_microtvm/sg_execution_times.html
index 1f73047277..e8b1eaabe7 100644
--- a/docs/how_to/work_with_microtvm/sg_execution_times.html
+++ b/docs/how_to/work_with_microtvm/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-work-with-microtvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>05:58.685</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
+<p><strong>05:56.380</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 83%" />
@@ -349,19 +349,19 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="micro_train.html#sphx-glr-how-to-work-with-microtvm-micro-train-py"><span class="std std-ref">Training Vision Models for microTVM on Arduino</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_train.py</span></code>)</p></td>
-<td><p>04:56.709</p></td>
+<td><p>04:54.885</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="micro_autotune.html#sphx-glr-how-to-work-with-microtvm-micro-autotune-py"><span class="std std-ref">Autotuning with microTVM</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_autotune.py</span></code>)</p></td>
-<td><p>00:49.857</p></td>
+<td><p>00:49.642</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="micro_aot.html#sphx-glr-how-to-work-with-microtvm-micro-aot-py"><span class="std std-ref">microTVM Host-Driven AoT</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_aot.py</span></code>)</p></td>
-<td><p>00:08.347</p></td>
+<td><p>00:08.090</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="micro_tflite.html#sphx-glr-how-to-work-with-microtvm-micro-tflite-py"><span class="std std-ref">microTVM with TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_tflite.py</span></code>)</p></td>
-<td><p>00:03.771</p></td>
+<td><p>00:03.762</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="micro_ethosu.html#sphx-glr-how-to-work-with-microtvm-micro-ethosu-py"><span class="std std-ref">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_ethosu.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_relay/sg_execution_times.html b/docs/how_to/work_with_relay/sg_execution_times.html
index 1155fef2a0..e116d9fa4d 100644
--- a/docs/how_to/work_with_relay/sg_execution_times.html
+++ b/docs/how_to/work_with_relay/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-work-with-relay-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:44.507</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
+<p><strong>00:43.969</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 84%" />
@@ -349,15 +349,15 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="using_pipeline_executor.html#sphx-glr-how-to-work-with-relay-using-pipeline-executor-py"><span class="std std-ref">Using Pipeline Executor in Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_pipeline_executor.py</span></code>)</p></td>
-<td><p>00:32.626</p></td>
+<td><p>00:32.087</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="using_external_lib.html#sphx-glr-how-to-work-with-relay-using-external-lib-py"><span class="std std-ref">Using External Libraries in Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_external_lib.py</span></code>)</p></td>
-<td><p>00:10.275</p></td>
+<td><p>00:10.306</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="build_gcn.html#sphx-glr-how-to-work-with-relay-build-gcn-py"><span class="std std-ref">Building a Graph Convolutional Network</span></a> (<code class="docutils literal notranslate"><span class="pre">build_gcn.py</span></code>)</p></td>
-<td><p>00:01.598</p></td>
+<td><p>00:01.569</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="using_relay_viz.html#sphx-glr-how-to-work-with-relay-using-relay-viz-py"><span class="std std-ref">Use Relay Visualizer to Visualize Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_relay_viz.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_schedules/intrin_math.html b/docs/how_to/work_with_schedules/intrin_math.html
index d7c2a03165..867d18c465 100644
--- a/docs/how_to/work_with_schedules/intrin_math.html
+++ b/docs/how_to/work_with_schedules/intrin_math.html
@@ -535,7 +535,7 @@ The following example customizes CUDA lowering rule for <code class="code docuti
 <a href="../../reference/api/python/ir.html#tvm.ir.register_intrin_lowering" title="tvm.ir.register_intrin_lowering" class="sphx-glr-backref-module-tvm-ir sphx-glr-backref-type-py-function"><span class="n">register_intrin_lowering</span></a><span class="p">(</span><span class="s2">&quot;tir.exp&quot;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&quot;cuda&quot;</span><span class="p">,</span> <span class="n">f</span><span class="o">= [...]
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>&lt;function my_cuda_math_rule at 0x7fd5c5158a70&gt;
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>&lt;function my_cuda_math_rule at 0x7fb51ba909e0&gt;
 </pre></div>
 </div>
 <p>Register the rule to TVM with override option to override existing rule.
diff --git a/docs/how_to/work_with_schedules/sg_execution_times.html b/docs/how_to/work_with_schedules/sg_execution_times.html
index e6ed26f71e..33403e894c 100644
--- a/docs/how_to/work_with_schedules/sg_execution_times.html
+++ b/docs/how_to/work_with_schedules/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-work-with-schedules-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:07.545</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
+<p><strong>00:06.612</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 83%" />
@@ -349,27 +349,27 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="intrin_math.html#sphx-glr-how-to-work-with-schedules-intrin-math-py"><span class="std std-ref">Intrinsics and Math Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">intrin_math.py</span></code>)</p></td>
-<td><p>00:05.160</p></td>
+<td><p>00:04.187</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tensorize.html#sphx-glr-how-to-work-with-schedules-tensorize-py"><span class="std std-ref">Use Tensorize to Leverage Hardware Intrinsics</span></a> (<code class="docutils literal notranslate"><span class="pre">tensorize.py</span></code>)</p></td>
-<td><p>00:01.050</p></td>
+<td><p>00:01.127</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="reduction.html#sphx-glr-how-to-work-with-schedules-reduction-py"><span class="std std-ref">Reduction</span></a> (<code class="docutils literal notranslate"><span class="pre">reduction.py</span></code>)</p></td>
-<td><p>00:00.573</p></td>
+<td><p>00:00.552</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="scan.html#sphx-glr-how-to-work-with-schedules-scan-py"><span class="std std-ref">Scan and Recurrent Kernel</span></a> (<code class="docutils literal notranslate"><span class="pre">scan.py</span></code>)</p></td>
-<td><p>00:00.548</p></td>
+<td><p>00:00.532</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="extern_op.html#sphx-glr-how-to-work-with-schedules-extern-op-py"><span class="std std-ref">External Tensor Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">extern_op.py</span></code>)</p></td>
-<td><p>00:00.115</p></td>
+<td><p>00:00.117</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="schedule_primitives.html#sphx-glr-how-to-work-with-schedules-schedule-primitives-py"><span class="std std-ref">Schedule Primitives in TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">schedule_primitives.py</span></code>)</p></td>
-<td><p>00:00.051</p></td>
+<td><p>00:00.050</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="tedd.html#sphx-glr-how-to-work-with-schedules-tedd-py"><span class="std std-ref">Use Tensor Expression Debug Display (TEDD) for Visualization</span></a> (<code class="docutils literal notranslate"><span class="pre">tedd.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_schedules/tensorize.html b/docs/how_to/work_with_schedules/tensorize.html
index cd19991013..1c719acfc8 100644
--- a/docs/how_to/work_with_schedules/tensorize.html
+++ b/docs/how_to/work_with_schedules/tensorize.html
@@ -590,7 +590,7 @@ The importing needs to happen before the tensorized GEMV being executed.</p>
              C: Buffer(C_2: Pointer(float32), float32, [524288], [])}
   buffer_map = {A_1: A, B_1: B, C_1: C}
   preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [1024, 64], []), B_1: B_3: Buffer(B_2, float32, [512, 64], []), C_1: C_3: Buffer(C_2, float32, [1024, 512], [])} {
-  attr [IterVar(i: int32, (nullptr), &quot;DataPar&quot;, &quot;&quot;)] &quot;pragma_import_llvm&quot; = &quot;; ModuleID = &#39;/tmp/tmp5h3amj8c/input0.cc&#39;\nsource_filename = \&quot;/tmp/tmp5h3amj8c/input0.cc\&quot;\ntarget datalayout = \&quot;e-m:e-i64:64-f80:128-n8:16:32:64-S128\&quot;\ntarget triple = \&quot;x86_64-pc-linux-gnu\&quot;\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = allo [...]
+  attr [IterVar(i: int32, (nullptr), &quot;DataPar&quot;, &quot;&quot;)] &quot;pragma_import_llvm&quot; = &quot;; ModuleID = &#39;/tmp/tmphe7nb_dg/input0.cc&#39;\nsource_filename = \&quot;/tmp/tmphe7nb_dg/input0.cc\&quot;\ntarget datalayout = \&quot;e-m:e-i64:64-f80:128-n8:16:32:64-S128\&quot;\ntarget triple = \&quot;x86_64-pc-linux-gnu\&quot;\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = allo [...]
   for (i, 0, 1024) {
     for (j.outer: int32, 0, 32) {
       @tir.call_extern(&quot;gemv_update&quot;, @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_2, ((i*512) + (j.outer*16)), 16, 2, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_2, (i*64), 64, 1, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_2, (j.outer*1024), 1024, 1, dtype=handle), 16, 64, 64, dtype=int32)
diff --git a/docs/objects.inv b/docs/objects.inv
index 6a9599879b..71514a65d4 100644
Binary files a/docs/objects.inv and b/docs/objects.inv differ
diff --git a/docs/reference/api/doxygen/array_8h__dep__incl.svg b/docs/reference/api/doxygen/array_8h__dep__incl.svg
index 9dc87a72a8..b1dea95b39 100644
--- a/docs/reference/api/doxygen/array_8h__dep__incl.svg
+++ b/docs/reference/api/doxygen/array_8h__dep__incl.svg
@@ -269,39 +269,39 @@
 <path fill="none" stroke="#191970" d="M1239.5578,-805.0933C1113.1871,-798.2509 853.0508,-781.3786 820,-757 790.0366,-734.8987 787,-717.2327 787,-680 787,-680 787,-680 787,-484.5 787,-446.4351 785.8338,-433.1805 764,-402 753.3335,-386.7674 736.6651,-374.429 722.0514,-365.652"/>
 <polygon fill="#191970" stroke="#191970" points="1239.7176,-808.6067 1249.8904,-805.6465 1240.0919,-801.6168 1239.7176,-808.6067"/>
 </g>
-<!-- Node156 -->
+<!-- Node158 -->
 <g id="node38" class="node">
-<title>Node156</title>
+<title>Node158</title>
 <g id="a_node38"><a xlink:href="ir_2function_8h.html" target="_top" xlink:title="Function nodes. ">
 <polygon fill="#ffffff" stroke="#000000" points="2736,-670.5 2736,-689.5 2872,-689.5 2872,-670.5 2736,-670.5"/>
 <text text-anchor="middle" x="2804" y="-677.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/ir/function.h</text>
 </a>
 </g>
 </g>
-<!-- Node20&#45;&gt;Node156 -->
+<!-- Node20&#45;&gt;Node158 -->
 <g id="edge85" class="edge">
-<title>Node20&#45;&gt;Node156</title>
+<title>Node20&#45;&gt;Node158</title>
 <path fill="none" stroke="#191970" d="M1376.3966,-806.6566C1568.7228,-801.1655 2105.1598,-783.6265 2180,-757 2202.1968,-749.1029 2200.8834,-734.1189 2223,-726 2315.4331,-692.0683 2603.1308,-683.1427 2735.7667,-680.8132"/>
 <polygon fill="#191970" stroke="#191970" points="1376.0321,-803.1655 1366.1353,-806.9475 1376.2305,-810.1626 1376.0321,-803.1655"/>
 </g>
-<!-- Node163 -->
+<!-- Node165 -->
 <g id="node43" class="node">
-<title>Node163</title>
+<title>Node165</title>
 <g id="a_node43"><a xlink:href="ir_2type_8h.html" target="_top" xlink:title="IR/AST nodes for the unified type system in TVM. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="1640,-732 1640,-751 1758,-751 1758,-732 1640,-732"/>
 <text text-anchor="middle" x="1699" y="-739" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/ir/type.h</text>
 </a>
 </g>
 </g>
-<!-- Node20&#45;&gt;Node163 -->
+<!-- Node20&#45;&gt;Node165 -->
 <g id="edge100" class="edge">
-<title>Node20&#45;&gt;Node163</title>
+<title>Node20&#45;&gt;Node165</title>
 <path fill="none" stroke="#191970" d="M1376.2208,-798.4341C1440.7279,-788.6839 1540.1593,-773.0543 1626,-757 1635.6789,-755.1898 1646.0447,-753.0883 1655.8367,-751.0281"/>
 <polygon fill="#191970" stroke="#191970" points="1375.542,-794.9968 1366.1751,-799.9476 1376.5849,-801.9186 1375.542,-794.9968"/>
 </g>
-<!-- Node155 -->
+<!-- Node157 -->
 <g id="node44" class="node">
-<title>Node155</title>
+<title>Node157</title>
 <g id="a_node44"><a xlink:href="schedule__rule_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/schedule_rule.h">
 <polygon fill="#ffffff" stroke="#000000" points="218,-335.5 218,-365.5 370,-365.5 370,-335.5 218,-335.5"/>
 <text text-anchor="start" x="226" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
@@ -309,15 +309,15 @@
 </a>
 </g>
 </g>
-<!-- Node20&#45;&gt;Node155 -->
+<!-- Node20&#45;&gt;Node157 -->
 <g id="edge113" class="edge">
-<title>Node20&#45;&gt;Node155</title>
+<title>Node20&#45;&gt;Node157</title>
 <path fill="none" stroke="#191970" d="M1239.7711,-806.1775C1026.2553,-798.0125 385,-766.3063 385,-680 385,-680 385,-680 385,-484.5 385,-433.8818 340.011,-388.1979 313.2988,-365.5343"/>
 <polygon fill="#191970" stroke="#191970" points="1239.7878,-809.6805 1249.9126,-806.5598 1240.0516,-802.6855 1239.7878,-809.6805"/>
 </g>
-<!-- Node202 -->
+<!-- Node204 -->
 <g id="node45" class="node">
-<title>Node202</title>
+<title>Node204</title>
 <g id="a_node45"><a xlink:href="structural__equal_8h.html" target="_top" xlink:title="Structural equality comparison. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="1814.5,-726.5 1814.5,-756.5 1965.5,-756.5 1965.5,-726.5 1814.5,-726.5"/>
 <text text-anchor="start" x="1822.5" y="-744.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/node/structural</text>
@@ -325,15 +325,15 @@
 </a>
 </g>
 </g>
-<!-- Node20&#45;&gt;Node202 -->
+<!-- Node20&#45;&gt;Node204 -->
 <g id="edge119" class="edge">
-<title>Node20&#45;&gt;Node202</title>
+<title>Node20&#45;&gt;Node204</title>
 <path fill="none" stroke="#191970" d="M1376.1555,-802.8039C1471.2847,-794.5263 1649.1729,-777.8584 1800,-757 1804.7117,-756.3484 1809.555,-755.6367 1814.4344,-754.8878"/>
 <polygon fill="#191970" stroke="#191970" points="1375.7526,-799.3256 1366.0917,-803.6751 1376.3563,-806.2995 1375.7526,-799.3256"/>
 </g>
-<!-- Node214 -->
+<!-- Node216 -->
 <g id="node46" class="node">
-<title>Node214</title>
+<title>Node216</title>
 <g id="a_node46"><a xlink:href="papi_8h.html" target="_top" xlink:title="include/tvm/runtime\l/contrib/papi.h">
 <polygon fill="#ffffff" stroke="#000000" points="1984,-726.5 1984,-756.5 2100,-756.5 2100,-726.5 1984,-726.5"/>
 <text text-anchor="start" x="1992" y="-744.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -341,15 +341,15 @@
 </a>
 </g>
 </g>
-<!-- Node20&#45;&gt;Node214 -->
+<!-- Node20&#45;&gt;Node216 -->
 <g id="edge120" class="edge">
-<title>Node20&#45;&gt;Node214</title>
+<title>Node20&#45;&gt;Node216</title>
 <path fill="none" stroke="#191970" d="M1376.5776,-806.6021C1497.8965,-802.49 1758.0363,-790.2795 1975,-757 1977.9503,-756.5475 1980.9633,-756.0355 1983.9987,-755.48"/>
 <polygon fill="#191970" stroke="#191970" points="1376.2068,-803.1124 1366.3281,-806.9413 1376.4384,-810.1085 1376.2068,-803.1124"/>
 </g>
-<!-- Node215 -->
+<!-- Node217 -->
 <g id="node47" class="node">
-<title>Node215</title>
+<title>Node217</title>
 <g id="a_node47"><a xlink:href="packed__func_8h.html" target="_top" xlink:title="Type&#45;erased function used across TVM API. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="639,-402.5 639,-432.5 755,-432.5 755,-402.5 639,-402.5"/>
 <text text-anchor="start" x="647" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -357,45 +357,45 @@
 </a>
 </g>
 </g>
-<!-- Node20&#45;&gt;Node215 -->
+<!-- Node20&#45;&gt;Node217 -->
 <g id="edge121" class="edge">
-<title>Node20&#45;&gt;Node215</title>
+<title>Node20&#45;&gt;Node217</title>
 <path fill="none" stroke="#191970" d="M1239.3326,-805.2111C1103.8933,-798.2447 812.0193,-780.6779 773,-757 738.1161,-735.8316 723,-720.8043 723,-680 723,-680 723,-680 723,-551.5 723,-507.6591 709.422,-457.2029 701.8904,-432.5877"/>
 <polygon fill="#191970" stroke="#191970" points="1239.4271,-808.7203 1249.5923,-805.7339 1239.7834,-801.7294 1239.4271,-808.7203"/>
 </g>
-<!-- Node192 -->
+<!-- Node194 -->
 <g id="node48" class="node">
-<title>Node192</title>
+<title>Node194</title>
 <g id="a_node48"><a xlink:href="buffer_8h.html" target="_top" xlink:title="Symbolic n&#45;dimensional array, to represent a memory buffer. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="2443,-609 2443,-628 2571,-628 2571,-609 2443,-609"/>
 <text text-anchor="middle" x="2507" y="-616" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/buffer.h</text>
 </a>
 </g>
 </g>
-<!-- Node20&#45;&gt;Node192 -->
+<!-- Node20&#45;&gt;Node194 -->
 <g id="edge137" class="edge">
-<title>Node20&#45;&gt;Node192</title>
+<title>Node20&#45;&gt;Node194</title>
 <path fill="none" stroke="#191970" d="M1376.1645,-805.9882C1559.6875,-798.9747 2055.158,-778.2951 2126,-757 2152.5021,-749.0335 2154.7245,-737.2685 2180,-726 2281.5211,-680.7393 2408.1404,-644.4995 2469.7948,-628.065"/>
 <polygon fill="#191970" stroke="#191970" points="1375.9549,-802.4935 1366.0953,-806.3712 1376.2211,-809.4885 1375.9549,-802.4935"/>
 </g>
-<!-- Node193 -->
+<!-- Node195 -->
 <g id="node49" class="node">
-<title>Node193</title>
+<title>Node195</title>
 <g id="a_node49"><a xlink:href="tir_2expr_8h.html" target="_top" xlink:title="TIR expressions. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="2446.5,-542 2446.5,-561 2567.5,-561 2567.5,-542 2446.5,-542"/>
 <text text-anchor="middle" x="2507" y="-549" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/expr.h</text>
 </a>
 </g>
 </g>
-<!-- Node20&#45;&gt;Node193 -->
+<!-- Node20&#45;&gt;Node195 -->
 <g id="edge143" class="edge">
-<title>Node20&#45;&gt;Node193</title>
+<title>Node20&#45;&gt;Node195</title>
 <path fill="none" stroke="#191970" d="M1376.3938,-805.4566C1486.5769,-799.7834 1698.0034,-785.4961 1767,-757 1787.4653,-748.5477 1787.4885,-738.1489 1806,-726 1850.2169,-696.981 1861.1939,-687.7928 1911,-670 1962.2886,-651.6776 2318.6331,-585.8101 2454.5206,-561.0229"/>
 <polygon fill="#191970" stroke="#191970" points="1376.1885,-801.9625 1366.378,-805.963 1376.542,-808.9535 1376.1885,-801.9625"/>
 </g>
-<!-- Node198 -->
+<!-- Node200 -->
 <g id="node50" class="node">
-<title>Node198</title>
+<title>Node200</title>
 <g id="a_node50"><a xlink:href="index__map_8h.html" target="_top" xlink:title="Defines a remapping of buffer indices. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="2232,-726.5 2232,-756.5 2350,-756.5 2350,-726.5 2232,-726.5"/>
 <text text-anchor="start" x="2240" y="-744.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/index</text>
@@ -403,9 +403,9 @@
 </a>
 </g>
 </g>
-<!-- Node20&#45;&gt;Node198 -->
+<!-- Node20&#45;&gt;Node200 -->
 <g id="edge144" class="edge">
-<title>Node20&#45;&gt;Node198</title>
+<title>Node20&#45;&gt;Node200</title>
 <path fill="none" stroke="#191970" d="M1376.2127,-805.9924C1558.7098,-799.0671 2055.3112,-778.7458 2218,-757 2222.4947,-756.3992 2227.1249,-755.673 2231.7695,-754.8668"/>
 <polygon fill="#191970" stroke="#191970" points="1376.0639,-802.4954 1366.2033,-806.3707 1376.3284,-809.4904 1376.0639,-802.4954"/>
 </g>
@@ -1097,21 +1097,21 @@
 <path fill="none" stroke="#191970" d="M778.2064,-334.456C838.3648,-323.2599 917.9193,-308.4539 975.7633,-297.6885"/>
 <polygon fill="#191970" stroke="#191970" points="777.5375,-331.0203 768.3467,-336.291 778.8183,-337.9022 777.5375,-331.0203"/>
 </g>
-<!-- Node156&#45;&gt;Node22 -->
+<!-- Node158&#45;&gt;Node22 -->
 <g id="edge86" class="edge">
-<title>Node156&#45;&gt;Node22</title>
+<title>Node158&#45;&gt;Node22</title>
 <path fill="none" stroke="#191970" d="M2725.7538,-675.259C2532.5964,-663.5553 2035.7581,-633.4513 1855.5511,-622.5324"/>
 <polygon fill="#191970" stroke="#191970" points="2725.6335,-678.758 2735.8269,-675.8693 2726.0569,-671.7708 2725.6335,-678.758"/>
 </g>
-<!-- Node156&#45;&gt;Node92 -->
+<!-- Node158&#45;&gt;Node92 -->
 <g id="edge87" class="edge">
-<title>Node156&#45;&gt;Node92</title>
+<title>Node158&#45;&gt;Node92</title>
 <path fill="none" stroke="#191970" d="M2804,-660.2188C2804,-634.9855 2804,-589.9988 2804,-551.5 2804,-551.5 2804,-551.5 2804,-350.5 2804,-241.5654 2657.4396,-180.8997 2591.0295,-159.0022"/>
 <polygon fill="#191970" stroke="#191970" points="2800.5001,-660.3281 2804,-670.3281 2807.5001,-660.3282 2800.5001,-660.3281"/>
 </g>
-<!-- Node157 -->
+<!-- Node159 -->
 <g id="node39" class="node">
-<title>Node157</title>
+<title>Node159</title>
 <g id="a_node39"><a xlink:href="script_2ir__builder_2base_8h.html" target="_top" xlink:title="include/tvm/script\l/ir_builder/base.h">
 <polygon fill="#ffffff" stroke="#ff0000" points="2832,-603.5 2832,-633.5 2936,-633.5 2936,-603.5 2832,-603.5"/>
 <text text-anchor="start" x="2840" y="-621.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/script</text>
@@ -1119,15 +1119,15 @@
 </a>
 </g>
 </g>
-<!-- Node156&#45;&gt;Node157 -->
+<!-- Node158&#45;&gt;Node159 -->
 <g id="edge88" class="edge">
-<title>Node156&#45;&gt;Node157</title>
+<title>Node158&#45;&gt;Node159</title>
 <path fill="none" stroke="#191970" d="M2824.763,-664.0385C2837.0048,-654.6276 2852.4018,-642.7911 2864.4298,-633.5446"/>
 <polygon fill="#191970" stroke="#191970" points="2822.2949,-661.521 2816.5,-670.3906 2826.5613,-667.0707 2822.2949,-661.521"/>
 </g>
-<!-- Node158 -->
+<!-- Node160 -->
 <g id="node40" class="node">
-<title>Node158</title>
+<title>Node160</title>
 <g id="a_node40"><a xlink:href="ir__builder_2ir_2frame_8h.html" target="_top" xlink:title="include/tvm/script\l/ir_builder/ir/frame.h">
 <polygon fill="#ffffff" stroke="#ff0000" points="2858,-536.5 2858,-566.5 2974,-566.5 2974,-536.5 2858,-536.5"/>
 <text text-anchor="start" x="2866" y="-554.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/script</text>
@@ -1135,15 +1135,15 @@
 </a>
 </g>
 </g>
-<!-- Node156&#45;&gt;Node158 -->
+<!-- Node158&#45;&gt;Node160 -->
 <g id="edge91" class="edge">
-<title>Node156&#45;&gt;Node158</title>
+<title>Node158&#45;&gt;Node160</title>
 <path fill="none" stroke="#191970" d="M2862.4542,-668.1751C2896.5694,-659.9561 2934.8492,-647.9889 2945,-634 2959.9796,-613.3564 2942.8092,-584.2777 2929.2345,-566.7255"/>
 <polygon fill="#191970" stroke="#191970" points="2861.4467,-664.8161 2852.5042,-670.4978 2863.038,-671.6329 2861.4467,-664.8161"/>
 </g>
-<!-- Node159 -->
+<!-- Node161 -->
 <g id="node41" class="node">
-<title>Node159</title>
+<title>Node161</title>
 <g id="a_node41"><a xlink:href="ir_2ir_8h.html" target="_top" xlink:title="include/tvm/script\l/ir_builder/ir/ir.h">
 <polygon fill="#ffffff" stroke="#000000" points="2907,-469.5 2907,-499.5 3011,-499.5 3011,-469.5 2907,-469.5"/>
 <text text-anchor="start" x="2915" y="-487.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/script</text>
@@ -1151,204 +1151,204 @@
 </a>
 </g>
 </g>
-<!-- Node156&#45;&gt;Node159 -->
+<!-- Node158&#45;&gt;Node161 -->
 <g id="edge92" class="edge">
-<title>Node156&#45;&gt;Node159</title>
+<title>Node158&#45;&gt;Node161</title>
 <path fill="none" stroke="#191970" d="M2864.5975,-668.3231C2902.5946,-659.8803 2946.9516,-647.6025 2961,-634 2993.0698,-602.9481 2992.6566,-579.5826 2983,-536 2980.1406,-523.095 2973.699,-509.5896 2968.2087,-499.6481"/>
 <polygon fill="#191970" stroke="#191970" points="2863.621,-664.9534 2854.5896,-670.4928 2865.1042,-671.7945 2863.621,-664.9534"/>
 </g>
-<!-- Node161 -->
+<!-- Node163 -->
 <g id="node42" class="node">
-<title>Node161</title>
+<title>Node163</title>
 <g id="a_node42"><a xlink:href="tir_2function_8h.html" target="_top" xlink:title="TIR Function. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="2525,-475 2525,-494 2665,-494 2665,-475 2525,-475"/>
 <text text-anchor="middle" x="2595" y="-482" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/function.h</text>
 </a>
 </g>
 </g>
-<!-- Node156&#45;&gt;Node161 -->
+<!-- Node158&#45;&gt;Node163 -->
 <g id="edge93" class="edge">
-<title>Node156&#45;&gt;Node161</title>
+<title>Node158&#45;&gt;Node163</title>
 <path fill="none" stroke="#191970" d="M2786.3137,-663.4561C2744.1893,-624.0527 2638.9444,-525.6059 2605.4269,-494.2534"/>
 <polygon fill="#191970" stroke="#191970" points="2783.9417,-666.0299 2793.6357,-670.3051 2788.7236,-660.9178 2783.9417,-666.0299"/>
 </g>
-<!-- Node157&#45;&gt;Node158 -->
+<!-- Node159&#45;&gt;Node160 -->
 <g id="edge89" class="edge">
-<title>Node157&#45;&gt;Node158</title>
+<title>Node159&#45;&gt;Node160</title>
 <path fill="none" stroke="#191970" d="M2895.6092,-594.1932C2900.0075,-584.9844 2904.8855,-574.771 2908.7674,-566.6432"/>
 <polygon fill="#191970" stroke="#191970" points="2892.4129,-592.7646 2891.2613,-603.2967 2898.7294,-595.7815 2892.4129,-592.7646"/>
 </g>
-<!-- Node158&#45;&gt;Node159 -->
+<!-- Node160&#45;&gt;Node161 -->
 <g id="edge90" class="edge">
-<title>Node158&#45;&gt;Node159</title>
+<title>Node160&#45;&gt;Node161</title>
 <path fill="none" stroke="#191970" d="M2931.2275,-527.7735C2937.2334,-518.4154 2943.9551,-507.9421 2949.2812,-499.6432"/>
 <polygon fill="#191970" stroke="#191970" points="2928.2131,-525.9903 2925.7574,-536.2967 2934.1042,-529.7712 2928.2131,-525.9903"/>
 </g>
-<!-- Node161&#45;&gt;Node23 -->
+<!-- Node163&#45;&gt;Node23 -->
 <g id="edge94" class="edge">
-<title>Node161&#45;&gt;Node23</title>
+<title>Node163&#45;&gt;Node23</title>
 <path fill="none" stroke="#191970" d="M2624.2361,-470.1711C2653.9853,-453.7573 2692.8433,-426.0285 2671,-402 2614.3897,-339.7265 2374.5925,-390.2396 2294,-366 2243.2054,-350.7226 2189.8536,-318.1 2160.8866,-298.7686"/>
 <polygon fill="#191970" stroke="#191970" points="2622.5585,-467.099 2615.3733,-474.8851 2625.8457,-473.2792 2622.5585,-467.099"/>
 </g>
-<!-- Node161&#45;&gt;Node96 -->
+<!-- Node163&#45;&gt;Node96 -->
 <g id="edge96" class="edge">
-<title>Node161&#45;&gt;Node96</title>
+<title>Node163&#45;&gt;Node96</title>
 <path fill="none" stroke="#191970" d="M2563.5647,-470.9118C2533.4385,-457.8895 2488.7358,-438.5664 2462.1665,-427.0817"/>
 <polygon fill="#191970" stroke="#191970" points="2562.2244,-474.1454 2572.7923,-474.9005 2565.0019,-467.72 2562.2244,-474.1454"/>
 </g>
-<!-- Node161&#45;&gt;Node146 -->
+<!-- Node163&#45;&gt;Node146 -->
 <g id="edge95" class="edge">
-<title>Node161&#45;&gt;Node146</title>
+<title>Node163&#45;&gt;Node146</title>
 <path fill="none" stroke="#191970" d="M2514.8883,-479.7558C2376.9568,-471.4263 2085.7502,-453.1343 1840,-433 1811.9817,-430.7045 1781.2706,-427.8518 1754.1925,-425.2219"/>
 <polygon fill="#191970" stroke="#191970" points="2514.76,-483.2543 2524.9524,-480.3623 2515.1811,-476.267 2514.76,-483.2543"/>
 </g>
-<!-- Node161&#45;&gt;Node151 -->
+<!-- Node163&#45;&gt;Node151 -->
 <g id="edge97" class="edge">
-<title>Node161&#45;&gt;Node151</title>
+<title>Node163&#45;&gt;Node151</title>
 <path fill="none" stroke="#191970" d="M2595,-464.7758C2595,-454.4641 2595,-442.0437 2595,-432.5218"/>
 <polygon fill="#191970" stroke="#191970" points="2591.5001,-464.9005 2595,-474.9005 2598.5001,-464.9006 2591.5001,-464.9005"/>
 </g>
-<!-- Node163&#45;&gt;Node21 -->
+<!-- Node165&#45;&gt;Node21 -->
 <g id="edge101" class="edge">
-<title>Node163&#45;&gt;Node21</title>
+<title>Node165&#45;&gt;Node21</title>
 <path fill="none" stroke="#191970" d="M1751.7866,-729.7802C1804.35,-718.11 1884.0553,-700.4137 1932.9219,-689.5643"/>
 <polygon fill="#191970" stroke="#191970" points="1750.981,-726.3738 1741.9773,-731.9581 1752.4982,-733.2074 1750.981,-726.3738"/>
 </g>
-<!-- Node163&#45;&gt;Node22 -->
+<!-- Node165&#45;&gt;Node22 -->
 <g id="edge102" class="edge">
-<title>Node163&#45;&gt;Node22</title>
+<title>Node165&#45;&gt;Node22</title>
 <path fill="none" stroke="#191970" d="M1712.1755,-723.4934C1731.5239,-697.0507 1766.9506,-648.6342 1781.9707,-628.1068"/>
 <polygon fill="#191970" stroke="#191970" points="1709.1427,-721.7113 1706.0621,-731.8484 1714.7919,-725.8449 1709.1427,-721.7113"/>
 </g>
-<!-- Node163&#45;&gt;Node112 -->
+<!-- Node165&#45;&gt;Node112 -->
 <g id="edge103" class="edge">
-<title>Node163&#45;&gt;Node112</title>
+<title>Node165&#45;&gt;Node112</title>
 <path fill="none" stroke="#191970" d="M1695.0799,-721.7396C1688.5052,-684.4379 1677.8446,-601.5151 1698,-536 1702.1323,-522.5678 1711.0933,-509.4972 1718.9133,-499.8848"/>
 <polygon fill="#191970" stroke="#191970" points="1691.6783,-722.5916 1696.9505,-731.7815 1698.56,-721.3097 1691.6783,-722.5916"/>
 </g>
-<!-- Node155&#45;&gt;Node53 -->
+<!-- Node157&#45;&gt;Node53 -->
 <g id="edge114" class="edge">
-<title>Node155&#45;&gt;Node53</title>
+<title>Node157&#45;&gt;Node53</title>
 <path fill="none" stroke="#191970" d="M264.3225,-329.3469C250.3357,-319.3776 233.9692,-307.7121 221.2456,-298.6432"/>
 <polygon fill="#191970" stroke="#191970" points="262.4953,-332.3426 272.6699,-335.2967 266.5582,-326.6423 262.4953,-332.3426"/>
 </g>
-<!-- Node215&#45;&gt;Node23 -->
+<!-- Node217&#45;&gt;Node23 -->
 <g id="edge122" class="edge">
-<title>Node215&#45;&gt;Node23</title>
+<title>Node217&#45;&gt;Node23</title>
 <path fill="none" stroke="#191970" d="M765.3181,-413.8835C875.4998,-407.5245 1099.1003,-392.4428 1287,-366 1358.2947,-355.9668 1374.5624,-343.9599 1446,-335 1724.7168,-300.0424 1799.2952,-341.2507 2077,-299 2079.8519,-298.5661 2082.7641,-298.0605 2085.6948,-297.5021"/>
 <polygon fill="#191970" stroke="#191970" points="764.7986,-410.4073 755.0144,-414.4717 765.1977,-417.3959 764.7986,-410.4073"/>
 </g>
-<!-- Node215&#45;&gt;Node27 -->
+<!-- Node217&#45;&gt;Node27 -->
 <g id="edge135" class="edge">
-<title>Node215&#45;&gt;Node27</title>
+<title>Node217&#45;&gt;Node27</title>
 <path fill="none" stroke="#191970" d="M765.2722,-413.919C900.6106,-406.6116 1212.456,-388.7341 1474,-366 1503.6842,-363.4198 1536.7419,-359.9071 1563.6896,-356.8781"/>
 <polygon fill="#191970" stroke="#191970" points="764.8555,-410.4363 755.058,-414.4685 765.2316,-417.4262 764.8555,-410.4363"/>
 </g>
-<!-- Node215&#45;&gt;Node45 -->
+<!-- Node217&#45;&gt;Node45 -->
 <g id="edge123" class="edge">
-<title>Node215&#45;&gt;Node45</title>
+<title>Node217&#45;&gt;Node45</title>
 <path fill="none" stroke="#191970" d="M765.4493,-414.4087C864.466,-409.0875 1053.104,-395.7575 1211,-366 1301.4104,-348.961 1404.3462,-317.2752 1461.5899,-298.52"/>
 <polygon fill="#191970" stroke="#191970" points="765.0812,-410.9232 755.279,-414.944 765.4492,-417.9135 765.0812,-410.9232"/>
 </g>
-<!-- Node215&#45;&gt;Node46 -->
+<!-- Node217&#45;&gt;Node46 -->
 <g id="edge127" class="edge">
-<title>Node215&#45;&gt;Node46</title>
+<title>Node217&#45;&gt;Node46</title>
 <path fill="none" stroke="#191970" d="M646.9122,-398.2246C630.0334,-390.0609 611.958,-379.3134 598,-366 567.3033,-336.7209 550,-325.9212 550,-283.5 550,-283.5 550,-283.5 550,-216.5 550,-158.3676 613.7289,-117.985 655.8,-97.6597"/>
 <polygon fill="#191970" stroke="#191970" points="645.5874,-401.4681 656.1325,-402.494 648.5288,-395.1161 645.5874,-401.4681"/>
 </g>
-<!-- Node215&#45;&gt;Node47 -->
+<!-- Node217&#45;&gt;Node47 -->
 <g id="edge133" class="edge">
-<title>Node215&#45;&gt;Node47</title>
+<title>Node217&#45;&gt;Node47</title>
 <path fill="none" stroke="#191970" d="M629.271,-400.9882C606.4955,-393.0841 582.0577,-381.7938 563,-366 529.8093,-338.4936 512,-326.6071 512,-283.5 512,-283.5 512,-283.5 512,-149.5 512,-93.579 557.1197,-92.2807 607,-67 654.04,-43.1589 712.1096,-30.2417 757.7401,-23.3065"/>
 <polygon fill="#191970" stroke="#191970" points="628.3164,-404.3588 638.9096,-404.1716 630.5118,-397.7119 628.3164,-404.3588"/>
 </g>
-<!-- Node215&#45;&gt;Node48 -->
+<!-- Node217&#45;&gt;Node48 -->
 <g id="edge134" class="edge">
-<title>Node215&#45;&gt;Node48</title>
+<title>Node217&#45;&gt;Node48</title>
 <path fill="none" stroke="#191970" d="M628.5264,-411.5097C578.3015,-405.124 509.4775,-392.133 454,-366 433.969,-356.5643 434.19,-346.0461 415,-335 376.9461,-313.0956 347.7525,-335.2659 323,-299 315.2329,-287.6202 314.5862,-278.9103 323,-268 361.4458,-218.1465 399.843,-250.5633 460,-232 528.6247,-210.8237 607.0935,-181.9115 652.9319,-164.5366"/>
 <polygon fill="#191970" stroke="#191970" points="628.2825,-415.0057 638.6317,-412.7376 629.1269,-408.0569 628.2825,-415.0057"/>
 </g>
-<!-- Node215&#45;&gt;Node49 -->
+<!-- Node217&#45;&gt;Node49 -->
 <g id="edge125" class="edge">
-<title>Node215&#45;&gt;Node49</title>
+<title>Node217&#45;&gt;Node49</title>
 <path fill="none" stroke="#191970" d="M765.4035,-414.4394C879.141,-408.6348 1101.8771,-394.0241 1175,-366 1196.9995,-357.5688 1197.5319,-346.6671 1218,-335 1242.9534,-320.7762 1272.5869,-307.856 1295.9148,-298.5145"/>
 <polygon fill="#191970" stroke="#191970" points="764.8899,-410.9607 755.0778,-414.9575 765.2407,-417.9519 764.8899,-410.9607"/>
 </g>
-<!-- Node215&#45;&gt;Node50 -->
+<!-- Node217&#45;&gt;Node50 -->
 <g id="edge131" class="edge">
-<title>Node215&#45;&gt;Node50</title>
+<title>Node217&#45;&gt;Node50</title>
 <path fill="none" stroke="#191970" d="M641.7544,-397.7188C627.9982,-390.0189 614.927,-379.669 607,-366 600.0881,-354.0814 600.09,-346.9197 607,-335 637.0529,-283.1585 668.4874,-290.5625 724,-268 763.6933,-251.8671 809.9091,-239.5739 847.9088,-231.0511"/>
 <polygon fill="#191970" stroke="#191970" points="640.3535,-400.9343 650.8458,-402.4044 643.5603,-394.712 640.3535,-400.9343"/>
 </g>
-<!-- Node215&#45;&gt;Node52 -->
+<!-- Node217&#45;&gt;Node52 -->
 <g id="edge128" class="edge">
-<title>Node215&#45;&gt;Node52</title>
+<title>Node217&#45;&gt;Node52</title>
 <path fill="none" stroke="#191970" d="M733.7768,-397.2074C747.999,-388.5178 763.9567,-377.6806 777,-366 800.092,-345.3205 821.7302,-316.2531 833.8779,-298.7128"/>
 <polygon fill="#191970" stroke="#191970" points="731.984,-394.2013 725.1872,-402.3287 735.5688,-400.2138 731.984,-394.2013"/>
 </g>
-<!-- Node215&#45;&gt;Node53 -->
+<!-- Node217&#45;&gt;Node53 -->
 <g id="edge132" class="edge">
-<title>Node215&#45;&gt;Node53</title>
+<title>Node217&#45;&gt;Node53</title>
 <path fill="none" stroke="#191970" d="M628.7685,-413.0577C571.6052,-407.4073 488.8609,-394.6539 422,-366 400.3451,-356.7196 400.1898,-345.2983 379,-335 345.8441,-318.8861 306.7214,-306.911 273.521,-298.5841"/>
 <polygon fill="#191970" stroke="#191970" points="628.4952,-416.5474 638.7794,-414.0006 629.1517,-409.5782 628.4952,-416.5474"/>
 </g>
-<!-- Node215&#45;&gt;Node57 -->
+<!-- Node217&#45;&gt;Node57 -->
 <g id="edge136" class="edge">
-<title>Node215&#45;&gt;Node57</title>
+<title>Node217&#45;&gt;Node57</title>
 <path fill="none" stroke="#191970" d="M765.3605,-414.6456C888.7284,-408.9847 1156.6274,-394.3601 1381,-366 1458.931,-356.1497 1476.9767,-344.0908 1555,-335 1840.6244,-301.7209 1916.6063,-341.5326 2201,-299 2204.0077,-298.5502 2207.0818,-298.0228 2210.1748,-297.4393"/>
 <polygon fill="#191970" stroke="#191970" points="765.1339,-411.1522 755.3029,-415.1019 765.4512,-418.145 765.1339,-411.1522"/>
 </g>
-<!-- Node215&#45;&gt;Node147 -->
+<!-- Node217&#45;&gt;Node147 -->
 <g id="edge124" class="edge">
-<title>Node215&#45;&gt;Node147</title>
+<title>Node217&#45;&gt;Node147</title>
 <path fill="none" stroke="#191970" d="M746.6574,-398.7561C817.6926,-371.9428 946.9181,-323.1647 1012.1134,-298.5558"/>
 <polygon fill="#191970" stroke="#191970" points="745.1526,-395.583 737.0329,-402.389 747.6247,-402.132 745.1526,-395.583"/>
 </g>
-<!-- Node215&#45;&gt;Node149 -->
+<!-- Node217&#45;&gt;Node149 -->
 <g id="edge126" class="edge">
-<title>Node215&#45;&gt;Node149</title>
+<title>Node217&#45;&gt;Node149</title>
 <path fill="none" stroke="#191970" d="M629.0186,-408.3651C589.0754,-400.9834 538.6175,-388.0627 498,-366 465.9633,-348.5983 436.3485,-317.3383 420.363,-298.6975"/>
 <polygon fill="#191970" stroke="#191970" points="628.477,-411.8236 638.9347,-410.1247 629.7001,-404.9313 628.477,-411.8236"/>
 </g>
-<!-- Node215&#45;&gt;Node150 -->
+<!-- Node217&#45;&gt;Node150 -->
 <g id="edge129" class="edge">
-<title>Node215&#45;&gt;Node150</title>
+<title>Node217&#45;&gt;Node150</title>
 <path fill="none" stroke="#191970" d="M695.1207,-392.3179C694.4509,-383.3414 693.7175,-373.5143 693.1301,-365.6432"/>
 <polygon fill="#191970" stroke="#191970" points="691.6308,-392.5849 695.8654,-402.2967 698.6114,-392.0639 691.6308,-392.5849"/>
 </g>
-<!-- Node215&#45;&gt;Node155 -->
+<!-- Node217&#45;&gt;Node157 -->
 <g id="edge130" class="edge">
-<title>Node215&#45;&gt;Node155</title>
+<title>Node217&#45;&gt;Node157</title>
 <path fill="none" stroke="#191970" d="M628.8319,-406.4344C565.4097,-396.1035 468.2822,-380.1912 384,-366 379.4659,-365.2366 374.8041,-364.4472 370.1017,-363.6475"/>
 <polygon fill="#191970" stroke="#191970" points="628.2818,-409.8908 638.7143,-408.0434 629.4067,-402.9818 628.2818,-409.8908"/>
 </g>
-<!-- Node192&#45;&gt;Node161 -->
+<!-- Node194&#45;&gt;Node163 -->
 <g id="edge142" class="edge">
-<title>Node192&#45;&gt;Node161</title>
+<title>Node194&#45;&gt;Node163</title>
 <path fill="none" stroke="#191970" d="M2535.7315,-603.9305C2550.2382,-595.1429 2566.8654,-582.6162 2577,-567 2591.9238,-544.0042 2594.6494,-510.5831 2595.0345,-494.2376"/>
 <polygon fill="#191970" stroke="#191970" points="2533.981,-600.8996 2527.0502,-608.913 2537.4655,-606.9708 2533.981,-600.8996"/>
 </g>
-<!-- Node192&#45;&gt;Node193 -->
+<!-- Node194&#45;&gt;Node195 -->
 <g id="edge138" class="edge">
-<title>Node192&#45;&gt;Node193</title>
+<title>Node194&#45;&gt;Node195</title>
 <path fill="none" stroke="#191970" d="M2507,-598.6079C2507,-586.214 2507,-570.8263 2507,-561.0817"/>
 <polygon fill="#191970" stroke="#191970" points="2503.5001,-598.9005 2507,-608.9005 2510.5001,-598.9006 2503.5001,-598.9005"/>
 </g>
-<!-- Node193&#45;&gt;Node57 -->
+<!-- Node195&#45;&gt;Node57 -->
 <g id="edge139" class="edge">
-<title>Node193&#45;&gt;Node57</title>
+<title>Node195&#45;&gt;Node57</title>
 <path fill="none" stroke="#191970" d="M2566.5137,-539.8181C2608.3171,-530.4173 2659.2932,-516.1445 2674,-500 2703.6291,-467.4743 2714.5965,-437.0489 2688,-402 2643.0677,-342.7881 2421.6601,-305.1472 2317.8738,-290.4791"/>
 <polygon fill="#191970" stroke="#191970" points="2565.6757,-536.4186 2556.66,-541.9836 2567.1782,-543.2555 2565.6757,-536.4186"/>
 </g>
-<!-- Node193&#45;&gt;Node96 -->
+<!-- Node195&#45;&gt;Node96 -->
 <g id="edge140" class="edge">
-<title>Node193&#45;&gt;Node96</title>
+<title>Node195&#45;&gt;Node96</title>
 <path fill="none" stroke="#191970" d="M2497.6382,-532.7765C2483.1425,-503.7849 2455.79,-449.08 2444.7748,-427.0496"/>
 <polygon fill="#191970" stroke="#191970" points="2494.5788,-534.484 2502.1815,-541.8631 2500.8398,-531.3535 2494.5788,-534.484"/>
 </g>
-<!-- Node193&#45;&gt;Node161 -->
+<!-- Node195&#45;&gt;Node163 -->
 <g id="edge141" class="edge">
-<title>Node193&#45;&gt;Node161</title>
+<title>Node195&#45;&gt;Node163</title>
 <path fill="none" stroke="#191970" d="M2527.8101,-535.6559C2544.775,-522.7395 2568.1796,-504.9201 2582.4151,-494.0817"/>
 <polygon fill="#191970" stroke="#191970" points="2525.4445,-533.058 2519.6083,-541.9005 2529.6849,-538.6275 2525.4445,-533.058"/>
 </g>
diff --git a/docs/reference/api/doxygen/block__scope_8h.html b/docs/reference/api/doxygen/block__scope_8h.html
index 672fb77271..9f6d379200 100644
--- a/docs/reference/api/doxygen/block__scope_8h.html
+++ b/docs/reference/api/doxygen/block__scope_8h.html
@@ -84,7 +84,7 @@ Include dependency graph for block_scope.h:</div>
 </div><div class="textblock"><div class="dynheader">
 This graph shows which files directly or indirectly include this file:</div>
 <div class="dyncontent">
-<div class="center"><iframe scrolling="no" frameborder="0" src="block__scope_8h__dep__incl.svg" width="1186" height="767"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
+<div class="center"><iframe scrolling="no" frameborder="0" src="block__scope_8h__dep__incl.svg" width="1546" height="767"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
 </div>
 </div>
 </div>
diff --git a/docs/reference/api/doxygen/block__scope_8h__dep__incl.svg b/docs/reference/api/doxygen/block__scope_8h__dep__incl.svg
index c846678451..fcc8c3d861 100644
--- a/docs/reference/api/doxygen/block__scope_8h__dep__incl.svg
+++ b/docs/reference/api/doxygen/block__scope_8h__dep__incl.svg
@@ -4,313 +4,345 @@
 <!-- Generated by graphviz version 2.40.1 (20161225.0304)
  -->
 <!-- Title: include/tvm/tir/schedule/block_scope.h Pages: 1 -->
-<svg width="889pt" height="575pt"
- viewBox="0.00 0.00 888.86 575.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<svg width="1159pt" height="575pt"
+ viewBox="0.00 0.00 1158.53 575.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
 <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 571)">
 <title>include/tvm/tir/schedule/block_scope.h</title>
-<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-571 884.8596,-571 884.8596,4 -4,4"/>
+<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-571 1154.53,-571 1154.53,4 -4,4"/>
 <!-- Node54 -->
 <g id="node1" class="node">
 <title>Node54</title>
-<polygon fill="#bfbfbf" stroke="#000000" points="420.3823,-536.5 420.3823,-566.5 554.3823,-566.5 554.3823,-536.5 420.3823,-536.5"/>
-<text text-anchor="start" x="428.3823" y="-554.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/schedule</text>
-<text text-anchor="middle" x="487.3823" y="-543.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/block_scope.h</text>
+<polygon fill="#bfbfbf" stroke="#000000" points="510,-536.5 510,-566.5 644,-566.5 644,-536.5 510,-536.5"/>
+<text text-anchor="start" x="518" y="-554.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/schedule</text>
+<text text-anchor="middle" x="577" y="-543.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/block_scope.h</text>
 </g>
 <!-- Node55 -->
 <g id="node2" class="node">
 <title>Node55</title>
 <g id="a_node2"><a xlink:href="state_8h.html" target="_top" xlink:title="This file defines ScheduleState, the core data structure of TensorIR scheduling. ">
-<polygon fill="#ffffff" stroke="#000000" points="420.3823,-469.5 420.3823,-499.5 554.3823,-499.5 554.3823,-469.5 420.3823,-469.5"/>
-<text text-anchor="start" x="428.3823" y="-487.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/schedule</text>
-<text text-anchor="middle" x="487.3823" y="-476.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/state.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="510,-469.5 510,-499.5 644,-499.5 644,-469.5 510,-469.5"/>
+<text text-anchor="start" x="518" y="-487.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/schedule</text>
+<text text-anchor="middle" x="577" y="-476.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/state.h</text>
 </a>
 </g>
 </g>
 <!-- Node54&#45;&gt;Node55 -->
 <g id="edge1" class="edge">
 <title>Node54&#45;&gt;Node55</title>
-<path fill="none" stroke="#191970" d="M487.3823,-526.0249C487.3823,-517.128 487.3823,-507.4287 487.3823,-499.6432"/>
-<polygon fill="#191970" stroke="#191970" points="483.8824,-526.2966 487.3823,-536.2967 490.8824,-526.2967 483.8824,-526.2966"/>
+<path fill="none" stroke="#191970" d="M577,-526.0249C577,-517.128 577,-507.4287 577,-499.6432"/>
+<polygon fill="#191970" stroke="#191970" points="573.5001,-526.2966 577,-536.2967 580.5001,-526.2967 573.5001,-526.2966"/>
 </g>
 <!-- Node56 -->
 <g id="node3" class="node">
 <title>Node56</title>
 <g id="a_node3"><a xlink:href="tir_2schedule_2schedule_8h.html" target="_top" xlink:title="include/tvm/tir/schedule\l/schedule.h">
-<polygon fill="#ffffff" stroke="#000000" points="420.3823,-402.5 420.3823,-432.5 554.3823,-432.5 554.3823,-402.5 420.3823,-402.5"/>
-<text text-anchor="start" x="428.3823" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/schedule</text>
-<text text-anchor="middle" x="487.3823" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/schedule.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="510,-402.5 510,-432.5 644,-432.5 644,-402.5 510,-402.5"/>
+<text text-anchor="start" x="518" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/schedule</text>
+<text text-anchor="middle" x="577" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/schedule.h</text>
 </a>
 </g>
 </g>
 <!-- Node55&#45;&gt;Node56 -->
 <g id="edge2" class="edge">
 <title>Node55&#45;&gt;Node56</title>
-<path fill="none" stroke="#191970" d="M487.3823,-459.0249C487.3823,-450.128 487.3823,-440.4287 487.3823,-432.6432"/>
-<polygon fill="#191970" stroke="#191970" points="483.8824,-459.2966 487.3823,-469.2967 490.8824,-459.2967 483.8824,-459.2966"/>
+<path fill="none" stroke="#191970" d="M577,-459.0249C577,-450.128 577,-440.4287 577,-432.6432"/>
+<polygon fill="#191970" stroke="#191970" points="573.5001,-459.2966 577,-469.2967 580.5001,-459.2967 573.5001,-459.2966"/>
 </g>
 <!-- Node57 -->
 <g id="node4" class="node">
 <title>Node57</title>
 <g id="a_node4"><a xlink:href="meta__schedule_2cost__model_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/cost_model.h">
-<polygon fill="#ffffff" stroke="#000000" points="298.3823,-268.5 298.3823,-298.5 450.3823,-298.5 450.3823,-268.5 298.3823,-268.5"/>
-<text text-anchor="start" x="306.3823" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="374.3823" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/cost_model.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="870,-268.5 870,-298.5 1022,-298.5 1022,-268.5 870,-268.5"/>
+<text text-anchor="start" x="878" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="946" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/cost_model.h</text>
 </a>
 </g>
 </g>
 <!-- Node56&#45;&gt;Node57 -->
 <g id="edge3" class="edge">
 <title>Node56&#45;&gt;Node57</title>
-<path fill="none" stroke="#191970" d="M508.0713,-394.492C521.1057,-377.094 533.2474,-353.1801 520.3823,-335 510.4069,-320.9035 473.6601,-307.9828 439.5313,-298.6092"/>
-<polygon fill="#191970" stroke="#191970" points="505.2157,-392.459 501.7065,-402.4558 510.6839,-396.8292 505.2157,-392.459"/>
+<path fill="none" stroke="#191970" d="M654.4944,-416.0863C801.2322,-412.6026 1107.9141,-401.1549 1140,-366 1179.8467,-322.342 1092.2093,-300.9345 1022.0785,-291.0787"/>
+<polygon fill="#191970" stroke="#191970" points="654.1283,-412.5938 644.2117,-416.3236 654.2898,-419.5919 654.1283,-412.5938"/>
 </g>
 <!-- Node58 -->
 <g id="node5" class="node">
 <title>Node58</title>
 <g id="a_node5"><a xlink:href="search__strategy_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/search_strategy.h">
-<polygon fill="#ffffff" stroke="#000000" points="487.3823,-201.5 487.3823,-231.5 639.3823,-231.5 639.3823,-201.5 487.3823,-201.5"/>
-<text text-anchor="start" x="495.3823" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="563.3823" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/search_strategy.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="558,-201.5 558,-231.5 710,-231.5 710,-201.5 558,-201.5"/>
+<text text-anchor="start" x="566" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="634" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/search_strategy.h</text>
 </a>
 </g>
 </g>
 <!-- Node56&#45;&gt;Node58 -->
-<g id="edge25" class="edge">
+<g id="edge27" class="edge">
 <title>Node56&#45;&gt;Node58</title>
-<path fill="none" stroke="#191970" d="M529.4354,-397.6125C564.6415,-378.1838 611.8568,-344.8405 629.3823,-299 634.3024,-286.1307 635.4258,-280.3815 629.3823,-268 621.7957,-252.4574 607.001,-240.2364 593.3388,-231.6075"/>
-<polygon fill="#191970" stroke="#191970" points="527.4845,-394.6865 520.3146,-402.4866 530.7838,-400.8602 527.4845,-394.6865"/>
+<path fill="none" stroke="#191970" d="M584.0563,-392.6171C596.0386,-350.364 620.0091,-265.8364 629.7438,-231.5088"/>
+<polygon fill="#191970" stroke="#191970" points="580.6454,-391.8168 581.2843,-402.3923 587.3798,-393.7266 580.6454,-391.8168"/>
 </g>
 <!-- Node62 -->
 <g id="node9" class="node">
 <title>Node62</title>
 <g id="a_node9"><a xlink:href="database_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/database.h">
-<polygon fill="#ffffff" stroke="#000000" points="468.3823,-268.5 468.3823,-298.5 620.3823,-298.5 620.3823,-268.5 468.3823,-268.5"/>
-<text text-anchor="start" x="476.3823" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="544.3823" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/database.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="416,-268.5 416,-298.5 568,-298.5 568,-268.5 416,-268.5"/>
+<text text-anchor="start" x="424" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="492" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/database.h</text>
 </a>
 </g>
 </g>
 <!-- Node56&#45;&gt;Node62 -->
 <g id="edge11" class="edge">
 <title>Node56&#45;&gt;Node62</title>
-<path fill="none" stroke="#191970" d="M525.7345,-396.6919C536.714,-388.6618 547.3367,-378.3855 553.3823,-366 563.9366,-344.3772 556.8908,-315.8573 550.7162,-298.6405"/>
-<polygon fill="#191970" stroke="#191970" points="523.6459,-393.8795 517.3388,-402.3925 527.5781,-399.6707 523.6459,-393.8795"/>
+<path fill="none" stroke="#191970" d="M565.9609,-393.2189C557.9237,-376.3851 546.3261,-353.7192 534,-335 525.4977,-322.0879 514.3888,-308.5828 505.7316,-298.6433"/>
+<polygon fill="#191970" stroke="#191970" points="562.8441,-394.8161 570.2603,-402.3826 569.1813,-391.8428 562.8441,-394.8161"/>
 </g>
 <!-- Node63 -->
 <g id="node10" class="node">
 <title>Node63</title>
 <g id="a_node10"><a xlink:href="measure__candidate_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/measure_candidate.h">
-<polygon fill="#ffffff" stroke="#000000" points="676.3823,-335.5 676.3823,-365.5 828.3823,-365.5 828.3823,-335.5 676.3823,-335.5"/>
-<text text-anchor="start" x="684.3823" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="752.3823" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/measure_candidate.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="624,-335.5 624,-365.5 776,-365.5 776,-335.5 624,-335.5"/>
+<text text-anchor="start" x="632" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="700" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/measure_candidate.h</text>
 </a>
 </g>
 </g>
 <!-- Node56&#45;&gt;Node63 -->
 <g id="edge13" class="edge">
 <title>Node56&#45;&gt;Node63</title>
-<path fill="none" stroke="#191970" d="M556.7767,-399.955C598.9821,-389.2842 652.3327,-375.7955 692.6517,-365.6017"/>
-<polygon fill="#191970" stroke="#191970" points="555.6904,-396.6194 546.8534,-402.4639 557.4063,-403.4059 555.6904,-396.6194"/>
+<path fill="none" stroke="#191970" d="M613.4683,-397.6352C632.2177,-387.422 654.7099,-375.1702 672.063,-365.7177"/>
+<polygon fill="#191970" stroke="#191970" points="611.711,-394.6067 604.6036,-402.4639 615.0595,-400.7539 611.711,-394.6067"/>
 </g>
 <!-- Node65 -->
 <g id="node12" class="node">
 <title>Node65</title>
 <g id="a_node12"><a xlink:href="mutator_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/mutator.h">
-<polygon fill="#ffffff" stroke="#000000" points="19.3823,-335.5 19.3823,-365.5 171.3823,-365.5 171.3823,-335.5 19.3823,-335.5"/>
-<text text-anchor="start" x="27.3823" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="95.3823" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/mutator.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="340,-335.5 340,-365.5 492,-365.5 492,-335.5 340,-335.5"/>
+<text text-anchor="start" x="348" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="416" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/mutator.h</text>
 </a>
 </g>
 </g>
 <!-- Node56&#45;&gt;Node65 -->
 <g id="edge18" class="edge">
 <title>Node56&#45;&gt;Node65</title>
-<path fill="none" stroke="#191970" d="M410.1483,-404.8548C348.0347,-394.6102 258.5436,-379.6816 180.3823,-366 177.5749,-365.5086 174.716,-365.0047 171.8288,-364.493"/>
-<polygon fill="#191970" stroke="#191970" points="409.8247,-408.3486 420.2608,-406.5211 410.9629,-401.4417 409.8247,-408.3486"/>
+<path fill="none" stroke="#191970" d="M531.5587,-398.5897C506.4767,-388.1518 475.7512,-375.3654 452.2891,-365.6017"/>
+<polygon fill="#191970" stroke="#191970" points="530.2912,-401.8531 540.8685,-402.4639 532.9807,-395.3904 530.2912,-401.8531"/>
 </g>
 <!-- Node66 -->
 <g id="node13" class="node">
 <title>Node66</title>
 <g id="a_node13"><a xlink:href="space__generator_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/space_generator.h">
-<polygon fill="#ffffff" stroke="#000000" points="128.3823,-268.5 128.3823,-298.5 280.3823,-298.5 280.3823,-268.5 128.3823,-268.5"/>
-<text text-anchor="start" x="136.3823" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="204.3823" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/space_generator.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="246,-268.5 246,-298.5 398,-298.5 398,-268.5 246,-268.5"/>
+<text text-anchor="start" x="254" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="322" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/space_generator.h</text>
 </a>
 </g>
 </g>
 <!-- Node56&#45;&gt;Node66 -->
-<g id="edge26" class="edge">
+<g id="edge28" class="edge">
 <title>Node56&#45;&gt;Node66</title>
-<path fill="none" stroke="#191970" d="M410.1474,-414.7087C281.9287,-409.2727 36.5577,-395.1537 10.3823,-366 -29.0456,-322.0862 58.1664,-300.7614 128.1248,-290.9917"/>
-<polygon fill="#191970" stroke="#191970" points="410.1629,-418.2123 420.2999,-415.1313 410.4542,-411.2183 410.1629,-418.2123"/>
+<path fill="none" stroke="#191970" d="M558.4137,-394.6926C540.5001,-373.2191 514.2227,-343.1664 501,-335 481.8786,-323.1905 430.75,-309.0224 388.0092,-298.5386"/>
+<polygon fill="#191970" stroke="#191970" points="555.7528,-396.9669 564.8251,-402.439 561.1453,-392.5037 555.7528,-396.9669"/>
 </g>
 <!-- Node67 -->
 <g id="node14" class="node">
 <title>Node67</title>
 <g id="a_node14"><a xlink:href="postproc_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/postproc.h">
-<polygon fill="#ffffff" stroke="#000000" points="189.3823,-335.5 189.3823,-365.5 341.3823,-365.5 341.3823,-335.5 189.3823,-335.5"/>
-<text text-anchor="start" x="197.3823" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="265.3823" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/postproc.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="0,-335.5 0,-365.5 152,-365.5 152,-335.5 0,-335.5"/>
+<text text-anchor="start" x="8" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="76" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/postproc.h</text>
 </a>
 </g>
 </g>
 <!-- Node56&#45;&gt;Node67 -->
 <g id="edge21" class="edge">
 <title>Node56&#45;&gt;Node67</title>
-<path fill="none" stroke="#191970" d="M427.9009,-399.5484C392.7613,-388.9432 348.7698,-375.6665 315.4207,-365.6017"/>
-<polygon fill="#191970" stroke="#191970" points="426.9763,-402.9252 437.5611,-402.4639 428.9989,-396.2238 426.9763,-402.9252"/>
+<path fill="none" stroke="#191970" d="M499.6462,-409.263C415.9745,-400.0409 278.7417,-384.0083 161,-366 158.1827,-365.5691 155.3154,-365.1168 152.421,-364.6486"/>
+<polygon fill="#191970" stroke="#191970" points="499.5087,-412.7688 509.8309,-410.381 500.2726,-405.8106 499.5087,-412.7688"/>
 </g>
 <!-- Node68 -->
 <g id="node15" class="node">
 <title>Node68</title>
-<g id="a_node15"><a xlink:href="schedule__rule_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/schedule_rule.h">
-<polygon fill="#ffffff" stroke="#000000" points="359.3823,-335.5 359.3823,-365.5 511.3823,-365.5 511.3823,-335.5 359.3823,-335.5"/>
-<text text-anchor="start" x="367.3823" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="435.3823" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/schedule_rule.h</text>
+<g id="a_node15"><a xlink:href="thread__bind_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/schedule/cuda/thread_bind.h">
+<polygon fill="#ffffff" stroke="#000000" points="794.5,-335.5 794.5,-365.5 953.5,-365.5 953.5,-335.5 794.5,-335.5"/>
+<text text-anchor="start" x="802.5" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="874" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/schedule/cuda/thread_bind.h</text>
 </a>
 </g>
 </g>
 <!-- Node56&#45;&gt;Node68 -->
 <g id="edge23" class="edge">
 <title>Node56&#45;&gt;Node68</title>
-<path fill="none" stroke="#191970" d="M469.4158,-394.3509C462.0405,-384.8482 453.7089,-374.1132 447.1352,-365.6432"/>
-<polygon fill="#191970" stroke="#191970" points="466.6864,-396.5427 475.5827,-402.2967 472.2163,-392.2508 466.6864,-396.5427"/>
+<path fill="none" stroke="#191970" d="M653.5971,-400.2205C701.1701,-389.4886 761.7205,-375.8291 807.3134,-365.5438"/>
+<polygon fill="#191970" stroke="#191970" points="652.6372,-396.849 643.6526,-402.4639 654.1777,-403.6774 652.6372,-396.849"/>
+</g>
+<!-- Node69 -->
+<g id="node16" class="node">
+<title>Node69</title>
+<g id="a_node16"><a xlink:href="winograd_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/schedule/generic/winograd.h">
+<polygon fill="#ffffff" stroke="#000000" points="971.5,-335.5 971.5,-365.5 1130.5,-365.5 1130.5,-335.5 971.5,-335.5"/>
+<text text-anchor="start" x="979.5" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="1051" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/schedule/generic/winograd.h</text>
+</a>
+</g>
+</g>
+<!-- Node56&#45;&gt;Node69 -->
+<g id="edge24" class="edge">
+<title>Node56&#45;&gt;Node69</title>
+<path fill="none" stroke="#191970" d="M654.2914,-408.1504C732.3159,-398.4859 856.2169,-382.5203 963,-366 965.7595,-365.5731 968.566,-365.1296 971.3993,-364.6739"/>
+<polygon fill="#191970" stroke="#191970" points="653.5606,-404.714 644.0651,-409.4135 654.4188,-411.6612 653.5606,-404.714"/>
+</g>
+<!-- Node70 -->
+<g id="node17" class="node">
+<title>Node70</title>
+<g id="a_node17"><a xlink:href="schedule__rule_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/schedule_rule.h">
+<polygon fill="#ffffff" stroke="#000000" points="170,-335.5 170,-365.5 322,-365.5 322,-335.5 170,-335.5"/>
+<text text-anchor="start" x="178" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="246" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/schedule_rule.h</text>
+</a>
+</g>
+</g>
+<!-- Node56&#45;&gt;Node70 -->
+<g id="edge25" class="edge">
+<title>Node56&#45;&gt;Node70</title>
+<path fill="none" stroke="#191970" d="M499.725,-401.8582C445.5863,-390.8996 373.6492,-376.3383 320.2015,-365.5196"/>
+<polygon fill="#191970" stroke="#191970" points="499.1226,-405.3072 509.6182,-403.8608 500.5114,-398.4464 499.1226,-405.3072"/>
 </g>
 <!-- Node57&#45;&gt;Node58 -->
 <g id="edge4" class="edge">
 <title>Node57&#45;&gt;Node58</title>
-<path fill="none" stroke="#191970" d="M426.5625,-265.0023C456.2157,-254.4903 492.8755,-241.4945 520.782,-231.6017"/>
-<polygon fill="#191970" stroke="#191970" points="425.0534,-261.8237 416.7975,-268.4639 427.3923,-268.4214 425.0534,-261.8237"/>
+<path fill="none" stroke="#191970" d="M866.1609,-266.3551C816.0912,-255.6029 752.1506,-241.8721 704.0546,-231.5438"/>
+<polygon fill="#191970" stroke="#191970" points="865.4691,-269.7862 875.9811,-268.4639 866.9389,-262.9423 865.4691,-269.7862"/>
 </g>
 <!-- Node60 -->
 <g id="node7" class="node">
 <title>Node60</title>
 <g id="a_node7"><a xlink:href="task__scheduler_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/task_scheduler.h">
-<polygon fill="#ffffff" stroke="#000000" points="402.3823,-.5 402.3823,-30.5 554.3823,-30.5 554.3823,-.5 402.3823,-.5"/>
-<text text-anchor="start" x="410.3823" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="478.3823" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/task_scheduler.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="662,-.5 662,-30.5 814,-30.5 814,-.5 662,-.5"/>
+<text text-anchor="start" x="670" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="738" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/task_scheduler.h</text>
 </a>
 </g>
 </g>
 <!-- Node57&#45;&gt;Node60 -->
 <g id="edge10" class="edge">
 <title>Node57&#45;&gt;Node60</title>
-<path fill="none" stroke="#191970" d="M373.8678,-258.2953C374.0728,-227.8082 377.1085,-175.4249 393.3823,-134 409.6755,-92.5255 443.7413,-52.0687 463.4413,-30.8334"/>
-<polygon fill="#191970" stroke="#191970" points="370.3683,-258.4874 373.901,-268.4759 377.3682,-258.4645 370.3683,-258.4874"/>
+<path fill="none" stroke="#191970" d="M936.9253,-259.0313C919.9913,-215.9747 880.0533,-125.6782 823,-67 808.3257,-51.9078 788.3334,-39.5041 771.4244,-30.654"/>
+<polygon fill="#191970" stroke="#191970" points="933.688,-260.3641 940.5515,-268.4352 940.2193,-257.8455 933.688,-260.3641"/>
 </g>
 <!-- Node59 -->
 <g id="node6" class="node">
 <title>Node59</title>
 <g id="a_node6"><a xlink:href="measure__callback_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/measure_callback.h">
-<polygon fill="#ffffff" stroke="#000000" points="506.3823,-67.5 506.3823,-97.5 658.3823,-97.5 658.3823,-67.5 506.3823,-67.5"/>
-<text text-anchor="start" x="514.3823" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="582.3823" y="-74.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/measure_callback.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="662,-67.5 662,-97.5 814,-97.5 814,-67.5 662,-67.5"/>
+<text text-anchor="start" x="670" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="738" y="-74.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/measure_callback.h</text>
 </a>
 </g>
 </g>
 <!-- Node58&#45;&gt;Node59 -->
 <g id="edge5" class="edge">
 <title>Node58&#45;&gt;Node59</title>
-<path fill="none" stroke="#191970" d="M566.9495,-191.3415C570.8528,-163.8131 576.9841,-120.5714 580.2183,-97.7614"/>
-<polygon fill="#191970" stroke="#191970" points="563.4635,-190.9966 565.5249,-201.389 570.3942,-191.9794 563.4635,-190.9966"/>
+<path fill="none" stroke="#191970" d="M681.4668,-196.6042C695.4092,-188.6503 709.4604,-178.1885 719,-165 733.5578,-144.8738 737.1751,-115.3664 737.9524,-97.6351"/>
+<polygon fill="#191970" stroke="#191970" points="679.5347,-193.6684 672.3689,-201.4723 682.8373,-199.8404 679.5347,-193.6684"/>
 </g>
 <!-- Node61 -->
 <g id="node8" class="node">
 <title>Node61</title>
 <g id="a_node8"><a xlink:href="tune__context_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/tune_context.h">
-<polygon fill="#ffffff" stroke="#000000" points="402.3823,-134.5 402.3823,-164.5 554.3823,-164.5 554.3823,-134.5 402.3823,-134.5"/>
-<text text-anchor="start" x="410.3823" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="478.3823" y="-141.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/tune_context.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="558,-134.5 558,-164.5 710,-164.5 710,-134.5 558,-134.5"/>
+<text text-anchor="start" x="566" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="634" y="-141.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/tune_context.h</text>
 </a>
 </g>
 </g>
 <!-- Node58&#45;&gt;Node61 -->
 <g id="edge7" class="edge">
 <title>Node58&#45;&gt;Node61</title>
-<path fill="none" stroke="#191970" d="M536.1877,-195.0643C523.6196,-185.1577 508.9905,-173.6265 497.5937,-164.6432"/>
-<polygon fill="#191970" stroke="#191970" points="534.0742,-197.8549 544.0944,-201.2967 538.4075,-192.3574 534.0742,-197.8549"/>
+<path fill="none" stroke="#191970" d="M634,-191.0249C634,-182.128 634,-172.4287 634,-164.6432"/>
+<polygon fill="#191970" stroke="#191970" points="630.5001,-191.2966 634,-201.2967 637.5001,-191.2967 630.5001,-191.2966"/>
 </g>
 <!-- Node59&#45;&gt;Node60 -->
 <g id="edge6" class="edge">
 <title>Node59&#45;&gt;Node60</title>
-<path fill="none" stroke="#191970" d="M550.2433,-61.7951C534.66,-51.7558 516.2762,-39.9124 502.0038,-30.7177"/>
-<polygon fill="#191970" stroke="#191970" points="548.7406,-64.9904 559.0426,-67.4639 552.5316,-59.1058 548.7406,-64.9904"/>
+<path fill="none" stroke="#191970" d="M738,-57.0249C738,-48.128 738,-38.4287 738,-30.6432"/>
+<polygon fill="#191970" stroke="#191970" points="734.5001,-57.2966 738,-67.2967 741.5001,-57.2967 734.5001,-57.2966"/>
 </g>
 <!-- Node61&#45;&gt;Node59 -->
 <g id="edge8" class="edge">
 <title>Node61&#45;&gt;Node59</title>
-<path fill="none" stroke="#191970" d="M510.5212,-128.7951C526.1045,-118.7558 544.4884,-106.9124 558.7607,-97.7177"/>
-<polygon fill="#191970" stroke="#191970" points="508.2329,-126.1058 501.7219,-134.4639 512.024,-131.9904 508.2329,-126.1058"/>
+<path fill="none" stroke="#191970" d="M666.139,-128.7951C681.7223,-118.7558 700.1061,-106.9124 714.3784,-97.7177"/>
+<polygon fill="#191970" stroke="#191970" points="663.8506,-126.1058 657.3396,-134.4639 667.6417,-131.9904 663.8506,-126.1058"/>
 </g>
 <!-- Node61&#45;&gt;Node60 -->
 <g id="edge9" class="edge">
 <title>Node61&#45;&gt;Node60</title>
-<path fill="none" stroke="#191970" d="M478.3823,-124.3415C478.3823,-96.8131 478.3823,-53.5714 478.3823,-30.7614"/>
-<polygon fill="#191970" stroke="#191970" points="474.8824,-124.3889 478.3823,-134.389 481.8824,-124.389 474.8824,-124.3889"/>
+<path fill="none" stroke="#191970" d="M634.7413,-124.286C636.3916,-106.7675 640.9443,-83.667 653,-67 664.6292,-50.9226 682.963,-38.8972 699.6311,-30.5277"/>
+<polygon fill="#191970" stroke="#191970" points="631.2425,-124.1481 634.0476,-134.3649 638.226,-124.6288 631.2425,-124.1481"/>
 </g>
 <!-- Node62&#45;&gt;Node58 -->
 <g id="edge12" class="edge">
 <title>Node62&#45;&gt;Node58</title>
-<path fill="none" stroke="#191970" d="M551.4405,-258.6103C554.0084,-249.5553 556.8316,-239.5998 559.0879,-231.6432"/>
-<polygon fill="#191970" stroke="#191970" points="548.0547,-257.7211 548.6937,-268.2967 554.7892,-259.6309 548.0547,-257.7211"/>
+<path fill="none" stroke="#191970" d="M532.9253,-264.1902C554.8011,-253.8685 581.3496,-241.3421 601.7475,-231.7177"/>
+<polygon fill="#191970" stroke="#191970" points="531.4179,-261.0313 523.8676,-268.4639 534.405,-267.362 531.4179,-261.0313"/>
 </g>
 <!-- Node63&#45;&gt;Node57 -->
 <g id="edge14" class="edge">
 <title>Node63&#45;&gt;Node57</title>
-<path fill="none" stroke="#191970" d="M666.2453,-335.5087C607.8562,-325.3101 528.9477,-311.4563 459.3823,-299 456.5768,-298.4977 453.7196,-297.9845 450.8337,-297.4649"/>
-<polygon fill="#191970" stroke="#191970" points="665.7141,-338.9688 676.1671,-337.241 666.9181,-332.0731 665.7141,-338.9688"/>
+<path fill="none" stroke="#191970" d="M764.9155,-332.8198C804.016,-322.1704 853.2817,-308.7525 890.552,-298.6017"/>
+<polygon fill="#191970" stroke="#191970" points="763.936,-329.459 755.2072,-335.4639 765.7755,-336.213 763.936,-329.459"/>
 </g>
 <!-- Node63&#45;&gt;Node58 -->
 <g id="edge17" class="edge">
 <title>Node63&#45;&gt;Node58</title>
-<path fill="none" stroke="#191970" d="M730.7417,-327.6115C713.3313,-309.9825 687.679,-285.711 662.3823,-268 642.0676,-253.7771 617.3176,-240.9021 597.6463,-231.5828"/>
-<polygon fill="#191970" stroke="#191970" points="728.6454,-330.4754 738.1303,-335.1965 733.6597,-325.591 728.6454,-330.4754"/>
+<path fill="none" stroke="#191970" d="M675.0608,-328.2584C666.9452,-319.8272 658.596,-309.6694 653,-299 641.5991,-277.263 636.9873,-248.7779 635.1586,-231.5991"/>
+<polygon fill="#191970" stroke="#191970" points="672.7917,-330.9369 682.3626,-335.4811 677.7145,-325.9602 672.7917,-330.9369"/>
 </g>
 <!-- Node63&#45;&gt;Node59 -->
 <g id="edge16" class="edge">
 <title>Node63&#45;&gt;Node59</title>
-<path fill="none" stroke="#191970" d="M835.5873,-331.2937C851.2949,-323.8097 865.7872,-313.4197 875.3823,-299 883.0149,-287.5296 882.0263,-280.07 875.3823,-268 823.704,-174.1177 702.2372,-121.1087 633.0159,-97.5219"/>
-<polygon fill="#191970" stroke="#191970" points="833.9956,-328.1694 826.2264,-335.373 836.792,-334.5866 833.9956,-328.1694"/>
+<path fill="none" stroke="#191970" d="M783.2051,-331.2937C798.9126,-323.8097 813.4049,-313.4197 823,-299 868.4104,-230.7565 788.3548,-134.6079 753.2604,-97.7214"/>
+<polygon fill="#191970" stroke="#191970" points="781.6133,-328.1694 773.8441,-335.373 784.4098,-334.5866 781.6133,-328.1694"/>
 </g>
 <!-- Node64 -->
 <g id="node11" class="node">
 <title>Node64</title>
 <g id="a_node11"><a xlink:href="feature__extractor_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/feature_extractor.h">
-<polygon fill="#ffffff" stroke="#000000" points="714.3823,-268.5 714.3823,-298.5 866.3823,-298.5 866.3823,-268.5 714.3823,-268.5"/>
-<text text-anchor="start" x="722.3823" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="790.3823" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/feature_extractor.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="662,-268.5 662,-298.5 814,-298.5 814,-268.5 662,-268.5"/>
+<text text-anchor="start" x="670" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="738" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/feature_extractor.h</text>
 </a>
 </g>
 </g>
 <!-- Node63&#45;&gt;Node64 -->
 <g id="edge15" class="edge">
 <title>Node63&#45;&gt;Node64</title>
-<path fill="none" stroke="#191970" d="M766.0034,-326.4837C771.269,-317.1996 777.1353,-306.8565 781.7936,-298.6432"/>
-<polygon fill="#191970" stroke="#191970" points="762.8941,-324.8716 761.005,-335.2967 768.9829,-328.325 762.8941,-324.8716"/>
+<path fill="none" stroke="#191970" d="M713.6212,-326.4837C718.8868,-317.1996 724.753,-306.8565 729.4113,-298.6432"/>
+<polygon fill="#191970" stroke="#191970" points="710.5118,-324.8716 708.6228,-335.2967 716.6007,-328.325 710.5118,-324.8716"/>
 </g>
 <!-- Node65&#45;&gt;Node66 -->
 <g id="edge19" class="edge">
 <title>Node65&#45;&gt;Node66</title>
-<path fill="none" stroke="#191970" d="M128.6092,-330.0761C145.0384,-319.9774 164.5267,-307.9983 179.625,-298.7177"/>
-<polygon fill="#191970" stroke="#191970" points="126.5304,-327.2455 119.844,-335.4639 130.1961,-333.209 126.5304,-327.2455"/>
+<path fill="none" stroke="#191970" d="M386.3225,-329.3469C372.3357,-319.3776 355.9692,-307.7121 343.2456,-298.6432"/>
+<polygon fill="#191970" stroke="#191970" points="384.4953,-332.3426 394.6699,-335.2967 388.5582,-326.6423 384.4953,-332.3426"/>
 </g>
 <!-- Node66&#45;&gt;Node61 -->
 <g id="edge20" class="edge">
 <title>Node66&#45;&gt;Node61</title>
-<path fill="none" stroke="#191970" d="M244.4052,-263.9267C299.5711,-236.9478 397.7913,-188.9131 447.5965,-164.5558"/>
-<polygon fill="#191970" stroke="#191970" points="242.7265,-260.8515 235.2809,-268.389 245.8019,-267.1398 242.7265,-260.8515"/>
+<path fill="none" stroke="#191970" d="M366.4111,-264.426C429.0016,-237.5442 541.8807,-189.0641 598.9447,-164.5558"/>
+<polygon fill="#191970" stroke="#191970" points="364.9911,-261.2267 357.1839,-268.389 367.7535,-267.6586 364.9911,-261.2267"/>
 </g>
 <!-- Node67&#45;&gt;Node66 -->
 <g id="edge22" class="edge">
 <title>Node67&#45;&gt;Node66</title>
-<path fill="none" stroke="#191970" d="M244.568,-327.6385C235.8521,-318.0653 225.9587,-307.1987 218.1693,-298.6432"/>
-<polygon fill="#191970" stroke="#191970" points="242.2201,-330.2585 251.5404,-335.2967 247.3962,-325.5459 242.2201,-330.2585"/>
+<path fill="none" stroke="#191970" d="M140.9155,-332.8198C180.016,-322.1704 229.2817,-308.7525 266.552,-298.6017"/>
+<polygon fill="#191970" stroke="#191970" points="139.936,-329.459 131.2072,-335.4639 141.7755,-336.213 139.936,-329.459"/>
 </g>
-<!-- Node68&#45;&gt;Node66 -->
-<g id="edge24" class="edge">
-<title>Node68&#45;&gt;Node66</title>
-<path fill="none" stroke="#191970" d="M373.4895,-332.5484C336.9254,-321.9432 291.1504,-308.6665 256.4493,-298.6017"/>
-<polygon fill="#191970" stroke="#191970" points="372.9622,-336.0397 383.5413,-335.4639 374.9122,-329.3167 372.9622,-336.0397"/>
+<!-- Node70&#45;&gt;Node66 -->
+<g id="edge26" class="edge">
+<title>Node70&#45;&gt;Node66</title>
+<path fill="none" stroke="#191970" d="M270.9592,-328.4965C282.0497,-318.7193 294.8267,-307.4554 304.8227,-298.6432"/>
+<polygon fill="#191970" stroke="#191970" points="268.4323,-326.0582 263.2456,-335.2967 273.0614,-331.3091 268.4323,-326.0582"/>
 </g>
 </g>
 </svg>
diff --git a/docs/reference/api/doxygen/c__runtime__api_8h__dep__incl.svg b/docs/reference/api/doxygen/c__runtime__api_8h__dep__incl.svg
index cc6afb7aa2..a55a480ee0 100644
--- a/docs/reference/api/doxygen/c__runtime__api_8h__dep__incl.svg
+++ b/docs/reference/api/doxygen/c__runtime__api_8h__dep__incl.svg
@@ -47,48 +47,48 @@
 <path fill="none" stroke="#191970" d="M2091.9783,-817.8079C1773.5981,-809.4502 446,-769.56 446,-685.5 446,-685.5 446,-685.5 446,-618.5 446,-518.5332 787.7969,-492.9774 937.333,-486.5803"/>
 <polygon fill="#191970" stroke="#191970" points="2091.9021,-821.307 2101.99,-818.0691 2092.0847,-814.3094 2091.9021,-821.307"/>
 </g>
-<!-- Node177 -->
+<!-- Node179 -->
 <g id="node20" class="node">
-<title>Node177</title>
+<title>Node179</title>
 <g id="a_node20"><a xlink:href="tir_2expr_8h.html" target="_top" xlink:title="TIR expressions. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="909.5,-140 909.5,-159 1030.5,-159 1030.5,-140 909.5,-140"/>
 <text text-anchor="middle" x="970" y="-147" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/expr.h</text>
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node177 -->
+<!-- Node4&#45;&gt;Node179 -->
 <g id="edge122" class="edge">
-<title>Node4&#45;&gt;Node177</title>
+<title>Node4&#45;&gt;Node179</title>
 <path fill="none" stroke="#191970" d="M2091.634,-818.4163C1836.3978,-814.1752 939.3832,-797.4683 655,-768 458.3997,-747.6279 218,-883.153 218,-685.5 218,-685.5 218,-685.5 218,-484.5 218,-437.9897 231.747,-423.5184 265,-391 282.5893,-373.7993 641.5364,-208.3962 665,-201 747.4588,-175.0073 846.8344,-161.3391 909.2343,-154.7742"/>
 <polygon fill="#191970" stroke="#191970" points="2091.8181,-821.9198 2101.8746,-818.5855 2091.9338,-814.9207 2091.8181,-821.9198"/>
 </g>
-<!-- Node191 -->
+<!-- Node193 -->
 <g id="node29" class="node">
-<title>Node191</title>
+<title>Node193</title>
 <g id="a_node29"><a xlink:href="reflection_8h.html" target="_top" xlink:title="Reflection and serialization of compiler IR/AST nodes. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="1205.5,-542 1205.5,-561 1364.5,-561 1364.5,-542 1205.5,-542"/>
 <text text-anchor="middle" x="1285" y="-549" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/node/reflection.h</text>
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node191 -->
+<!-- Node4&#45;&gt;Node193 -->
 <g id="edge57" class="edge">
-<title>Node4&#45;&gt;Node191</title>
+<title>Node4&#45;&gt;Node193</title>
 <path fill="none" stroke="#191970" d="M2091.5384,-817.5661C1909.5971,-812.082 1423.3151,-795.0524 1356,-768 1270.2499,-733.539 1235.9665,-717.7989 1197,-634 1191.1907,-621.5068 1190.1463,-614.9521 1197,-603 1208.9758,-582.1156 1233.0632,-568.8863 1253.0138,-561.0725"/>
 <polygon fill="#191970" stroke="#191970" points="2091.6847,-821.072 2101.7849,-817.8725 2091.894,-814.0751 2091.6847,-821.072"/>
 </g>
-<!-- Node197 -->
+<!-- Node199 -->
 <g id="node30" class="node">
-<title>Node197</title>
+<title>Node199</title>
 <g id="a_node30"><a xlink:href="serialization_8h.html" target="_top" xlink:title="include/tvm/node/serialization.h">
 <polygon fill="#ffffff" stroke="#000000" points="1920.5,-676 1920.5,-695 2093.5,-695 2093.5,-676 1920.5,-676"/>
 <text text-anchor="middle" x="2007" y="-683" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/node/serialization.h</text>
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node197 -->
+<!-- Node4&#45;&gt;Node199 -->
 <g id="edge60" class="edge">
-<title>Node4&#45;&gt;Node197</title>
+<title>Node4&#45;&gt;Node199</title>
 <path fill="none" stroke="#191970" d="M2135.1335,-797.7215C2101.3294,-768.1153 2042.3003,-716.4166 2017.9659,-695.1042"/>
 <polygon fill="#191970" stroke="#191970" points="2132.9176,-800.4334 2142.7464,-804.389 2137.5296,-795.1675 2132.9176,-800.4334"/>
 </g>
@@ -108,9 +108,9 @@
 <path fill="none" stroke="#191970" d="M2227.9806,-806.2747C2280.884,-795.9493 2356.1322,-781.1903 2422,-768 2426.6214,-767.0746 2431.3997,-766.1124 2436.2056,-765.141"/>
 <polygon fill="#191970" stroke="#191970" points="2227.1835,-802.8641 2218.0388,-808.2142 2228.5239,-809.7346 2227.1835,-802.8641"/>
 </g>
-<!-- Node198 -->
+<!-- Node200 -->
 <g id="node32" class="node">
-<title>Node198</title>
+<title>Node200</title>
 <g id="a_node32"><a xlink:href="builtin__fp16_8h.html" target="_top" xlink:title="Functions for conversion between fp32 and fp16. ">
 <polygon fill="#ffffff" stroke="#000000" points="2578,-737.5 2578,-767.5 2694,-767.5 2694,-737.5 2578,-737.5"/>
 <text text-anchor="start" x="2586" y="-755.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -118,15 +118,15 @@
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node198 -->
+<!-- Node4&#45;&gt;Node200 -->
 <g id="edge62" class="edge">
-<title>Node4&#45;&gt;Node198</title>
+<title>Node4&#45;&gt;Node200</title>
 <path fill="none" stroke="#191970" d="M2228.1377,-812.974C2309.3967,-804.7133 2449.1665,-788.9908 2568,-768 2571.2194,-767.4313 2574.5161,-766.8102 2577.8381,-766.154"/>
 <polygon fill="#191970" stroke="#191970" points="2227.6369,-809.5067 2218.0388,-813.9928 2228.3396,-816.4714 2227.6369,-809.5067"/>
 </g>
-<!-- Node199 -->
+<!-- Node201 -->
 <g id="node33" class="node">
-<title>Node199</title>
+<title>Node201</title>
 <g id="a_node33"><a xlink:href="c__backend__api_8h.html" target="_top" xlink:title="TVM runtime backend API. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="2712,-737.5 2712,-767.5 2828,-767.5 2828,-737.5 2712,-737.5"/>
 <text text-anchor="start" x="2720" y="-755.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -134,15 +134,15 @@
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node199 -->
+<!-- Node4&#45;&gt;Node201 -->
 <g id="edge63" class="edge">
-<title>Node4&#45;&gt;Node199</title>
+<title>Node4&#45;&gt;Node201</title>
 <path fill="none" stroke="#191970" d="M2228.4773,-815.9294C2331.818,-809.8823 2533.5629,-795.5092 2703,-768 2705.9462,-767.5217 2708.956,-766.9889 2711.9887,-766.417"/>
 <polygon fill="#191970" stroke="#191970" points="2227.9654,-812.453 2218.1836,-816.5231 2228.3685,-819.4414 2227.9654,-812.453"/>
 </g>
-<!-- Node203 -->
+<!-- Node205 -->
 <g id="node34" class="node">
-<title>Node203</title>
+<title>Node205</title>
 <g id="a_node34"><a xlink:href="graph__executor_8h.html" target="_top" xlink:title="Tiny AoT executor. ">
 <polygon fill="#ffffff" stroke="#000000" points="2839.5,-603.5 2839.5,-633.5 2958.5,-633.5 2958.5,-603.5 2839.5,-603.5"/>
 <text text-anchor="start" x="2847.5" y="-621.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -150,15 +150,15 @@
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node203 -->
+<!-- Node4&#45;&gt;Node205 -->
 <g id="edge64" class="edge">
-<title>Node4&#45;&gt;Node203</title>
+<title>Node4&#45;&gt;Node205</title>
 <path fill="none" stroke="#191970" d="M2228.4661,-818.3835C2390.7678,-815.1625 2788.9219,-803.6773 2837,-768 2874.5145,-740.1615 2856.6792,-712.9729 2875,-670 2880.2612,-657.6594 2886.7058,-643.8961 2891.5916,-633.7044"/>
 <polygon fill="#191970" stroke="#191970" points="2228.0532,-814.8908 2218.1229,-818.5839 2228.1888,-821.8895 2228.0532,-814.8908"/>
 </g>
-<!-- Node202 -->
+<!-- Node204 -->
 <g id="node35" class="node">
-<title>Node202</title>
+<title>Node204</title>
 <g id="a_node35"><a xlink:href="crt_2packed__func_8h.html" target="_top" xlink:title="Type&#45;erased function used across TVM API. ">
 <polygon fill="#ffffff" stroke="#000000" points="2884,-670.5 2884,-700.5 3000,-700.5 3000,-670.5 2884,-670.5"/>
 <text text-anchor="start" x="2892" y="-688.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -166,15 +166,15 @@
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node202 -->
+<!-- Node4&#45;&gt;Node204 -->
 <g id="edge65" class="edge">
-<title>Node4&#45;&gt;Node202</title>
+<title>Node4&#45;&gt;Node204</title>
 <path fill="none" stroke="#191970" d="M2228.4466,-817.878C2425.5927,-812.8753 2982.3723,-796.2919 3009,-768 3029.9921,-745.6959 2995.5885,-717.6286 2968.8039,-700.6504"/>
 <polygon fill="#191970" stroke="#191970" points="2228.1235,-814.3849 2218.2147,-818.1354 2228.2996,-821.3827 2228.1235,-814.3849"/>
 </g>
-<!-- Node204 -->
+<!-- Node206 -->
 <g id="node36" class="node">
-<title>Node204</title>
+<title>Node206</title>
 <g id="a_node36"><a xlink:href="page__allocator_8h.html" target="_top" xlink:title="An implementation of a dynamic memory allocator for microcontrollers. ">
 <polygon fill="#ffffff" stroke="#000000" points="3056,-737.5 3056,-767.5 3172,-767.5 3172,-737.5 3056,-737.5"/>
 <text text-anchor="start" x="3064" y="-755.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -182,15 +182,15 @@
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node204 -->
+<!-- Node4&#45;&gt;Node206 -->
 <g id="edge67" class="edge">
-<title>Node4&#45;&gt;Node204</title>
+<title>Node4&#45;&gt;Node206</title>
 <path fill="none" stroke="#191970" d="M2228.2783,-816.8739C2406.8958,-809.7861 2884.9873,-789.3556 3042,-768 3046.5611,-767.3796 3051.2632,-766.6272 3055.9765,-765.7922"/>
 <polygon fill="#191970" stroke="#191970" points="2228.073,-813.3792 2218.2191,-817.2716 2228.3495,-820.3738 2228.073,-813.3792"/>
 </g>
-<!-- Node205 -->
+<!-- Node207 -->
 <g id="node37" class="node">
-<title>Node205</title>
+<title>Node207</title>
 <g id="a_node37"><a xlink:href="platform_8h.html" target="_top" xlink:title="The virtual memory manager for micro&#45;controllers. ">
 <polygon fill="#ffffff" stroke="#000000" points="2884,-737.5 2884,-767.5 3000,-767.5 3000,-737.5 2884,-737.5"/>
 <text text-anchor="start" x="2892" y="-755.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -198,15 +198,15 @@
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node205 -->
+<!-- Node4&#45;&gt;Node207 -->
 <g id="edge68" class="edge">
-<title>Node4&#45;&gt;Node205</title>
+<title>Node4&#45;&gt;Node207</title>
 <path fill="none" stroke="#191970" d="M2228.6275,-817.7049C2355.7417,-813.6751 2636.1755,-801.4818 2870,-768 2874.5566,-767.3475 2879.2555,-766.572 2883.9666,-765.7211"/>
 <polygon fill="#191970" stroke="#191970" points="2228.1617,-814.2176 2218.2749,-818.0257 2228.3786,-821.2142 2228.1617,-814.2176"/>
 </g>
-<!-- Node206 -->
+<!-- Node208 -->
 <g id="node38" class="node">
-<title>Node206</title>
+<title>Node208</title>
 <g id="a_node38"><a xlink:href="data__type_8h.html" target="_top" xlink:title="include/tvm/runtime\l/data_type.h">
 <polygon fill="#ffffff" stroke="#ff0000" points="996,-737.5 996,-767.5 1112,-767.5 1112,-737.5 996,-737.5"/>
 <text text-anchor="start" x="1004" y="-755.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -214,15 +214,15 @@
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node206 -->
+<!-- Node4&#45;&gt;Node208 -->
 <g id="edge70" class="edge">
-<title>Node4&#45;&gt;Node206</title>
+<title>Node4&#45;&gt;Node208</title>
 <path fill="none" stroke="#191970" d="M2091.646,-817.5453C1892.2648,-811.6136 1313.6329,-792.7031 1126,-768 1121.4363,-767.3992 1116.7323,-766.6608 1112.0177,-765.8354"/>
 <polygon fill="#191970" stroke="#191970" points="2091.5987,-821.0453 2101.6979,-817.8429 2091.8059,-814.0484 2091.5987,-821.0453"/>
 </g>
-<!-- Node209 -->
+<!-- Node211 -->
 <g id="node39" class="node">
-<title>Node209</title>
+<title>Node211</title>
 <g id="a_node39"><a xlink:href="ndarray_8h.html" target="_top" xlink:title="A device&#45;independent managed NDArray abstraction. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="1330,-670.5 1330,-700.5 1446,-700.5 1446,-670.5 1330,-670.5"/>
 <text text-anchor="start" x="1338" y="-688.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -230,15 +230,15 @@
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node209 -->
+<!-- Node4&#45;&gt;Node211 -->
 <g id="edge101" class="edge">
-<title>Node4&#45;&gt;Node209</title>
+<title>Node4&#45;&gt;Node211</title>
 <path fill="none" stroke="#191970" d="M2091.9188,-817.425C1927.0082,-811.9838 1515.5884,-795.7733 1460,-768 1430.2395,-753.1309 1407.5216,-720.0384 1396.1248,-700.5856"/>
 <polygon fill="#191970" stroke="#191970" points="2091.8384,-820.9242 2101.9474,-817.7527 2092.067,-813.9279 2091.8384,-820.9242"/>
 </g>
-<!-- Node211 -->
+<!-- Node213 -->
 <g id="node40" class="node">
-<title>Node211</title>
+<title>Node213</title>
 <g id="a_node40"><a xlink:href="device__api_8h.html" target="_top" xlink:title="Abstract device memory management API. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="1858,-536.5 1858,-566.5 1974,-566.5 1974,-536.5 1858,-536.5"/>
 <text text-anchor="start" x="1866" y="-554.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -246,15 +246,15 @@
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node211 -->
+<!-- Node4&#45;&gt;Node213 -->
 <g id="edge95" class="edge">
-<title>Node4&#45;&gt;Node211</title>
+<title>Node4&#45;&gt;Node213</title>
 <path fill="none" stroke="#191970" d="M2161.3915,-794.0966C2162.3833,-747.5781 2156.9456,-649.252 2098,-603 2078.6889,-587.8474 2019.8637,-572.8291 1974.2614,-562.9559"/>
 <polygon fill="#191970" stroke="#191970" points="2157.8888,-794.1209 2161.0532,-804.2321 2164.8849,-794.3545 2157.8888,-794.1209"/>
 </g>
-<!-- Node212 -->
+<!-- Node214 -->
 <g id="node41" class="node">
-<title>Node212</title>
+<title>Node214</title>
 <g id="a_node41"><a xlink:href="conv2d_8h.html" target="_top" xlink:title="include/tvm/runtime\l/hexagon/ops/conv2d.h">
 <polygon fill="#ffffff" stroke="#000000" points="2171.5,-469.5 2171.5,-499.5 2300.5,-499.5 2300.5,-469.5 2171.5,-469.5"/>
 <text text-anchor="start" x="2179.5" y="-487.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -262,15 +262,15 @@
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node212 -->
+<!-- Node4&#45;&gt;Node214 -->
 <g id="edge96" class="edge">
-<title>Node4&#45;&gt;Node212</title>
+<title>Node4&#45;&gt;Node214</title>
 <path fill="none" stroke="#191970" d="M2187.2194,-797.543C2196.1906,-789.1203 2205.5185,-778.895 2212,-768 2231.5237,-735.1816 2236,-723.6867 2236,-685.5 2236,-685.5 2236,-685.5 2236,-618.5 2236,-575.4618 2236,-524.6482 2236,-499.7729"/>
 <polygon fill="#191970" stroke="#191970" points="2184.6203,-795.1723 2179.5081,-804.4522 2189.2915,-800.3858 2184.6203,-795.1723"/>
 </g>
-<!-- Node213 -->
+<!-- Node215 -->
 <g id="node42" class="node">
-<title>Node213</title>
+<title>Node215</title>
 <g id="a_node42"><a xlink:href="profiling_8h.html" target="_top" xlink:title="Runtime profiling including timers. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="1815,-469.5 1815,-499.5 1931,-499.5 1931,-469.5 1815,-469.5"/>
 <text text-anchor="start" x="1823" y="-487.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -278,15 +278,15 @@
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node213 -->
+<!-- Node4&#45;&gt;Node215 -->
 <g id="edge118" class="edge">
-<title>Node4&#45;&gt;Node213</title>
+<title>Node4&#45;&gt;Node215</title>
 <path fill="none" stroke="#191970" d="M2168.7197,-794.8459C2171.2775,-786.4406 2173.7283,-776.9138 2175,-768 2185.8044,-692.2681 2181.8586,-655.2673 2126,-603 2069.2454,-549.8943 1984.4077,-516.8406 1928.8847,-499.544"/>
 <polygon fill="#191970" stroke="#191970" points="2165.3685,-793.8339 2165.6187,-804.4257 2172.0283,-795.9897 2165.3685,-793.8339"/>
 </g>
-<!-- Node215 -->
+<!-- Node217 -->
 <g id="node43" class="node">
-<title>Node215</title>
+<title>Node217</title>
 <g id="a_node43"><a xlink:href="packed__func_8h.html" target="_top" xlink:title="Type&#45;erased function used across TVM API. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="1648,-603.5 1648,-633.5 1764,-633.5 1764,-603.5 1648,-603.5"/>
 <text text-anchor="start" x="1656" y="-621.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -294,15 +294,15 @@
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node215 -->
+<!-- Node4&#45;&gt;Node217 -->
 <g id="edge117" class="edge">
-<title>Node4&#45;&gt;Node215</title>
+<title>Node4&#45;&gt;Node217</title>
 <path fill="none" stroke="#191970" d="M2091.6252,-817.0784C1966.226,-811.9294 1709.792,-797.8403 1683,-768 1648.9649,-730.0924 1679.7227,-663.7837 1696.6173,-633.8889"/>
 <polygon fill="#191970" stroke="#191970" points="2091.7495,-820.5862 2101.8823,-817.4916 2092.0313,-813.5918 2091.7495,-820.5862"/>
 </g>
-<!-- Node216 -->
+<!-- Node218 -->
 <g id="node44" class="node">
-<title>Node216</title>
+<title>Node218</title>
 <g id="a_node44"><a xlink:href="runtime_2module_8h.html" target="_top" xlink:title="Runtime container of the functions generated by TVM, This is used to support dynamically link...">
 <polygon fill="#ffffff" stroke="#ff0000" points="1686,-536.5 1686,-566.5 1802,-566.5 1802,-536.5 1686,-536.5"/>
 <text text-anchor="start" x="1694" y="-554.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -310,15 +310,15 @@
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node216 -->
+<!-- Node4&#45;&gt;Node218 -->
 <g id="edge100" class="edge">
-<title>Node4&#45;&gt;Node216</title>
+<title>Node4&#45;&gt;Node218</title>
 <path fill="none" stroke="#191970" d="M2156.8771,-794.2636C2151.478,-761.2336 2137.3469,-703.6078 2102,-670 2058.2902,-628.4407 1892.0524,-585.3203 1802.3018,-564.4139"/>
 <polygon fill="#191970" stroke="#191970" points="2153.4592,-795.0752 2158.3914,-804.452 2160.3831,-794.046 2153.4592,-795.0752"/>
 </g>
-<!-- Node221 -->
+<!-- Node223 -->
 <g id="node45" class="node">
-<title>Node221</title>
+<title>Node223</title>
 <g id="a_node45"><a xlink:href="serializer_8h.html" target="_top" xlink:title="Serializer extension to support TVM data types Include this file to enable serialization of DLDataTyp...">
 <polygon fill="#ffffff" stroke="#000000" points="1206,-603.5 1206,-633.5 1322,-633.5 1322,-603.5 1206,-603.5"/>
 <text text-anchor="start" x="1214" y="-621.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -326,15 +326,15 @@
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node221 -->
+<!-- Node4&#45;&gt;Node223 -->
 <g id="edge119" class="edge">
-<title>Node4&#45;&gt;Node221</title>
+<title>Node4&#45;&gt;Node223</title>
 <path fill="none" stroke="#191970" d="M2091.7895,-816.7774C1920.6899,-809.6611 1482.4854,-789.5581 1420,-768 1369.7759,-750.6722 1358.7486,-738.3866 1321,-701 1300.2198,-680.419 1282.1657,-651.3208 1272.2091,-633.7477"/>
 <polygon fill="#191970" stroke="#191970" points="2091.7993,-820.2808 2101.9355,-817.1972 2092.0888,-813.2867 2091.7993,-820.2808"/>
 </g>
-<!-- Node222 -->
+<!-- Node224 -->
 <g id="node46" class="node">
-<title>Node222</title>
+<title>Node224</title>
 <g id="a_node46"><a xlink:href="memory__manager_8h.html" target="_top" xlink:title="Abstract device memory management API. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="1416.5,-603.5 1416.5,-633.5 1553.5,-633.5 1553.5,-603.5 1416.5,-603.5"/>
 <text text-anchor="start" x="1424.5" y="-621.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -342,15 +342,15 @@
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node222 -->
+<!-- Node4&#45;&gt;Node224 -->
 <g id="edge120" class="edge">
-<title>Node4&#45;&gt;Node222</title>
+<title>Node4&#45;&gt;Node224</title>
 <path fill="none" stroke="#191970" d="M2091.7713,-815.436C1975.3472,-807.9956 1743.3042,-790.8271 1665,-768 1601.0757,-749.3649 1579.13,-747.0118 1531,-701 1510.8487,-681.7356 1497.1758,-651.676 1490.2635,-633.6537"/>
 <polygon fill="#191970" stroke="#191970" points="2091.7881,-818.944 2101.9893,-816.083 2092.2305,-811.958 2091.7881,-818.944"/>
 </g>
-<!-- Node169 -->
+<!-- Node171 -->
 <g id="node47" class="node">
-<title>Node169</title>
+<title>Node171</title>
 <g id="a_node47"><a xlink:href="metadata_8h.html" target="_top" xlink:title="Defines types which can be used in Metadata. ">
 <polygon fill="#ffffff" stroke="#000000" points="2264,-670.5 2264,-700.5 2380,-700.5 2380,-670.5 2264,-670.5"/>
 <text text-anchor="start" x="2272" y="-688.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -358,15 +358,15 @@
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node169 -->
+<!-- Node4&#45;&gt;Node171 -->
 <g id="edge97" class="edge">
-<title>Node4&#45;&gt;Node169</title>
+<title>Node4&#45;&gt;Node171</title>
 <path fill="none" stroke="#191970" d="M2228.4661,-813.4225C2288.4744,-806.5095 2369.1458,-792.8151 2389,-768 2407.9045,-744.3719 2375.0908,-717.1593 2349.0644,-700.6518"/>
 <polygon fill="#191970" stroke="#191970" points="2227.8269,-809.9719 2218.2747,-814.555 2228.6001,-816.9291 2227.8269,-809.9719"/>
 </g>
-<!-- Node224 -->
+<!-- Node226 -->
 <g id="node48" class="node">
-<title>Node224</title>
+<title>Node226</title>
 <g id="a_node48"><a xlink:href="metadata__types_8h.html" target="_top" xlink:title="Defines types which can be used in metadata here which are also shared between C and C++ code bases...">
 <polygon fill="#ffffff" stroke="#ff0000" points="2264,-737.5 2264,-767.5 2380,-767.5 2380,-737.5 2264,-737.5"/>
 <text text-anchor="start" x="2272" y="-755.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -374,15 +374,15 @@
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node224 -->
+<!-- Node4&#45;&gt;Node226 -->
 <g id="edge98" class="edge">
-<title>Node4&#45;&gt;Node224</title>
+<title>Node4&#45;&gt;Node226</title>
 <path fill="none" stroke="#191970" d="M2205.7235,-800.5897C2230.9614,-790.1518 2261.8776,-777.3654 2285.4855,-767.6017"/>
 <polygon fill="#191970" stroke="#191970" points="2204.2592,-797.4077 2196.356,-804.4639 2206.9345,-803.8763 2204.2592,-797.4077"/>
 </g>
-<!-- Node226 -->
+<!-- Node228 -->
 <g id="node49" class="node">
-<title>Node226</title>
+<title>Node228</title>
 <g id="a_node49"><a xlink:href="object_8h.html" target="_top" xlink:title="A managed object in the TVM runtime. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="1692,-737.5 1692,-767.5 1808,-767.5 1808,-737.5 1692,-737.5"/>
 <text text-anchor="start" x="1700" y="-755.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -390,15 +390,15 @@
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node226 -->
+<!-- Node4&#45;&gt;Node228 -->
 <g id="edge102" class="edge">
-<title>Node4&#45;&gt;Node226</title>
+<title>Node4&#45;&gt;Node228</title>
 <path fill="none" stroke="#191970" d="M2091.7972,-808.3547C2012.4631,-795.3903 1882.849,-774.2095 1808.1065,-761.9954"/>
 <polygon fill="#191970" stroke="#191970" points="2091.2588,-811.813 2101.6924,-809.9717 2092.3878,-804.9047 2091.2588,-811.813"/>
 </g>
-<!-- Node240 -->
+<!-- Node242 -->
 <g id="node50" class="node">
-<title>Node240</title>
+<title>Node242</title>
 <g id="a_node50"><a xlink:href="parallel__for_8h.html" target="_top" xlink:title="An implementation to run loop in parallel. ">
 <polygon fill="#ffffff" stroke="#000000" points="3190,-737.5 3190,-767.5 3304,-767.5 3304,-737.5 3190,-737.5"/>
 <text text-anchor="start" x="3198" y="-755.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/support</text>
@@ -406,9 +406,9 @@
 </a>
 </g>
 </g>
-<!-- Node4&#45;&gt;Node240 -->
+<!-- Node4&#45;&gt;Node242 -->
 <g id="edge121" class="edge">
-<title>Node4&#45;&gt;Node240</title>
+<title>Node4&#45;&gt;Node242</title>
 <path fill="none" stroke="#191970" d="M2228.5027,-817.6566C2426.3533,-812.0765 2996.2714,-794.1328 3181,-768 3183.9135,-767.5878 3186.8874,-767.1084 3189.8821,-766.5786"/>
 <polygon fill="#191970" stroke="#191970" points="2228.1421,-814.1652 2218.2442,-817.9442 2228.3384,-821.1625 2228.1421,-814.1652"/>
 </g>
@@ -519,24 +519,24 @@
 <path fill="none" stroke="#191970" d="M1084.969,-482.3672C1232.9254,-477.6539 1540.3281,-464.195 1577,-433 1639.1926,-380.0958 1603.1345,-261.3209 1590.7047,-226.3595"/>
 <polygon fill="#191970" stroke="#191970" points="1084.4832,-478.8806 1074.5976,-482.6916 1084.7021,-485.8772 1084.4832,-478.8806"/>
 </g>
-<!-- Node141 -->
+<!-- Node143 -->
 <g id="node10" class="node">
-<title>Node141</title>
+<title>Node143</title>
 <g id="a_node10"><a xlink:href="ir_2expr_8h.html" target="_top" xlink:title="Base expr nodes in TVM. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="1267.5,-268.5 1267.5,-287.5 1384.5,-287.5 1384.5,-268.5 1267.5,-268.5"/>
 <text text-anchor="middle" x="1326" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/ir/expr.h</text>
 </a>
 </g>
 </g>
-<!-- Node13&#45;&gt;Node141 -->
+<!-- Node13&#45;&gt;Node143 -->
 <g id="edge11" class="edge">
-<title>Node13&#45;&gt;Node141</title>
+<title>Node13&#45;&gt;Node143</title>
 <path fill="none" stroke="#191970" d="M1066.3794,-473.0989C1073.9909,-471.6986 1081.6745,-470.303 1089,-469 1136.2628,-460.5935 1268.2908,-467.178 1302,-433 1341.9636,-392.4805 1332.6706,-314.8113 1327.9113,-287.6579"/>
 <polygon fill="#191970" stroke="#191970" points="1065.6475,-469.6748 1056.4508,-474.9351 1066.9205,-476.5581 1065.6475,-469.6748"/>
 </g>
-<!-- Node149 -->
+<!-- Node151 -->
 <g id="node11" class="node">
-<title>Node149</title>
+<title>Node151</title>
 <g id="a_node11"><a xlink:href="script_2ir__builder_2base_8h.html" target="_top" xlink:title="include/tvm/script\l/ir_builder/base.h">
 <polygon fill="#ffffff" stroke="#ff0000" points="1343,-201.5 1343,-231.5 1447,-231.5 1447,-201.5 1343,-201.5"/>
 <text text-anchor="start" x="1351" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/script</text>
@@ -544,15 +544,15 @@
 </a>
 </g>
 </g>
-<!-- Node13&#45;&gt;Node149 -->
+<!-- Node13&#45;&gt;Node151 -->
 <g id="edge41" class="edge">
-<title>Node13&#45;&gt;Node149</title>
+<title>Node13&#45;&gt;Node151</title>
 <path fill="none" stroke="#191970" d="M1065.4681,-473.1334C1073.3732,-471.699 1081.3774,-470.2854 1089,-469 1194.9249,-451.1375 1242.4479,-497.9616 1328,-433 1393.0215,-383.6278 1396.6551,-272.055 1395.6982,-231.5515"/>
 <polygon fill="#191970" stroke="#191970" points="1064.7489,-469.7068 1055.5442,-474.9533 1066.0116,-476.592 1064.7489,-469.7068"/>
 </g>
-<!-- Node150 -->
+<!-- Node152 -->
 <g id="node12" class="node">
-<title>Node150</title>
+<title>Node152</title>
 <g id="a_node12"><a xlink:href="ir__builder_2ir_2frame_8h.html" target="_top" xlink:title="include/tvm/script\l/ir_builder/ir/frame.h">
 <polygon fill="#ffffff" stroke="#ff0000" points="1321,-134.5 1321,-164.5 1437,-164.5 1437,-134.5 1321,-134.5"/>
 <text text-anchor="start" x="1329" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/script</text>
@@ -560,15 +560,15 @@
 </a>
 </g>
 </g>
-<!-- Node13&#45;&gt;Node150 -->
+<!-- Node13&#45;&gt;Node152 -->
 <g id="edge42" class="edge">
-<title>Node13&#45;&gt;Node150</title>
+<title>Node13&#45;&gt;Node152</title>
 <path fill="none" stroke="#191970" d="M1085.0846,-481.5435C1175.8215,-476.7022 1317.5684,-464.2534 1360,-433 1404.9242,-399.9107 1482.2905,-250.2128 1456,-201 1447.3611,-184.8289 1431.1468,-172.8476 1415.8172,-164.5313"/>
 <polygon fill="#191970" stroke="#191970" points="1084.6369,-478.0618 1074.8296,-482.07 1084.9959,-485.0526 1084.6369,-478.0618"/>
 </g>
-<!-- Node151 -->
+<!-- Node153 -->
 <g id="node13" class="node">
-<title>Node151</title>
+<title>Node153</title>
 <g id="a_node13"><a xlink:href="ir_2ir_8h.html" target="_top" xlink:title="include/tvm/script\l/ir_builder/ir/ir.h">
 <polygon fill="#ffffff" stroke="#000000" points="1225,-67.5 1225,-97.5 1329,-97.5 1329,-67.5 1225,-67.5"/>
 <text text-anchor="start" x="1233" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/script</text>
@@ -576,15 +576,15 @@
 </a>
 </g>
 </g>
-<!-- Node13&#45;&gt;Node151 -->
+<!-- Node13&#45;&gt;Node153 -->
 <g id="edge43" class="edge">
-<title>Node13&#45;&gt;Node151</title>
+<title>Node13&#45;&gt;Node153</title>
 <path fill="none" stroke="#191970" d="M1084.895,-478.0434C1147.4941,-471.1346 1227.8748,-457.6662 1249,-433 1296.7731,-377.2191 1252.2568,-341.2174 1258,-268 1262.962,-204.7425 1271.4634,-129.6114 1275.1758,-97.8685"/>
 <polygon fill="#191970" stroke="#191970" points="1084.1924,-474.5983 1074.619,-479.137 1084.9332,-481.559 1084.1924,-474.5983"/>
 </g>
-<!-- Node170 -->
+<!-- Node172 -->
 <g id="node14" class="node">
-<title>Node170</title>
+<title>Node172</title>
 <g id="a_node14"><a xlink:href="doc_8h.html" target="_top" xlink:title="include/tvm/script\l/printer/doc.h">
 <polygon fill="#ffffff" stroke="#ff0000" points="674,-201.5 674,-231.5 778,-231.5 778,-201.5 674,-201.5"/>
 <text text-anchor="start" x="682" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/script</text>
@@ -592,15 +592,15 @@
 </a>
 </g>
 </g>
-<!-- Node13&#45;&gt;Node170 -->
+<!-- Node13&#45;&gt;Node172 -->
 <g id="edge44" class="edge">
-<title>Node13&#45;&gt;Node170</title>
+<title>Node13&#45;&gt;Node172</title>
 <path fill="none" stroke="#191970" d="M927.2875,-473.8704C874.6741,-465.2491 811.6315,-451.4924 792,-433 733.0782,-377.4969 726.2472,-271.1306 725.8266,-231.6985"/>
 <polygon fill="#191970" stroke="#191970" points="926.8888,-477.3511 937.3158,-475.4732 927.9936,-470.4388 926.8888,-477.3511"/>
 </g>
-<!-- Node172 -->
+<!-- Node174 -->
 <g id="node15" class="node">
-<title>Node172</title>
+<title>Node174</title>
 <g id="a_node15"><a xlink:href="printer_2frame_8h.html" target="_top" xlink:title="include/tvm/script\l/printer/frame.h">
 <polygon fill="#ffffff" stroke="#000000" points="712,-134.5 712,-164.5 816,-164.5 816,-134.5 712,-134.5"/>
 <text text-anchor="start" x="720" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/script</text>
@@ -608,15 +608,15 @@
 </a>
 </g>
 </g>
-<!-- Node13&#45;&gt;Node172 -->
+<!-- Node13&#45;&gt;Node174 -->
 <g id="edge45" class="edge">
-<title>Node13&#45;&gt;Node172</title>
+<title>Node13&#45;&gt;Node174</title>
 <path fill="none" stroke="#191970" d="M942.9553,-472.8853C899.1361,-463.5809 845.8621,-449.401 830,-433 800.1741,-402.1607 806,-382.4027 806,-339.5 806,-339.5 806,-339.5 806,-278 806,-235.2838 784.7719,-188.3567 772.4993,-164.7962"/>
 <polygon fill="#191970" stroke="#191970" points="942.4984,-476.3649 953.0009,-474.9695 943.9205,-469.5109 942.4984,-476.3649"/>
 </g>
-<!-- Node173 -->
+<!-- Node175 -->
 <g id="node16" class="node">
-<title>Node173</title>
+<title>Node175</title>
 <g id="a_node16"><a xlink:href="ir__docsifier_8h.html" target="_top" xlink:title="include/tvm/script\l/printer/ir_docsifier.h">
 <polygon fill="#ffffff" stroke="#000000" points="706,-.5 706,-30.5 822,-30.5 822,-.5 706,-.5"/>
 <text text-anchor="start" x="714" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/script</text>
@@ -624,15 +624,15 @@
 </a>
 </g>
 </g>
-<!-- Node13&#45;&gt;Node173 -->
+<!-- Node13&#45;&gt;Node175 -->
 <g id="edge46" class="edge">
-<title>Node13&#45;&gt;Node173</title>
+<title>Node13&#45;&gt;Node175</title>
 <path fill="none" stroke="#191970" d="M969.8488,-470.9465C922.104,-450.4224 844,-406.5942 844,-339.5 844,-339.5 844,-339.5 844,-149.5 844,-100.459 804.0085,-53.7794 780.5761,-30.6372"/>
 <polygon fill="#191970" stroke="#191970" points="968.6717,-474.2475 979.249,-474.8566 971.3601,-467.7844 968.6717,-474.2475"/>
 </g>
-<!-- Node174 -->
+<!-- Node176 -->
 <g id="node17" class="node">
-<title>Node174</title>
+<title>Node176</title>
 <g id="a_node17"><a xlink:href="var__table_8h.html" target="_top" xlink:title="include/tvm/script\l/printer/var_table.h">
 <polygon fill="#ffffff" stroke="#000000" points="546,-67.5 546,-97.5 656,-97.5 656,-67.5 546,-67.5"/>
 <text text-anchor="start" x="554" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/script</text>
@@ -640,9 +640,9 @@
 </a>
 </g>
 </g>
-<!-- Node13&#45;&gt;Node174 -->
+<!-- Node13&#45;&gt;Node176 -->
 <g id="edge49" class="edge">
-<title>Node13&#45;&gt;Node174</title>
+<title>Node13&#45;&gt;Node176</title>
 <path fill="none" stroke="#191970" d="M927.3829,-476.6333C824.9255,-465.9364 655.7272,-446.5821 630,-433 607.7388,-421.2477 613.3419,-404.3493 592,-391 543.347,-360.5678 503.3855,-400.1786 468,-355 407.3134,-277.518 417.1525,-208.0346 482,-134 498.5383,-115.1186 523.2447,-102.9151 545.8666,-95.1542"/>
 <polygon fill="#191970" stroke="#191970" points="927.0804,-480.1207 937.3886,-477.6726 927.8037,-473.1581 927.0804,-480.1207"/>
 </g>
@@ -678,69 +678,69 @@
 <path fill="none" stroke="#191970" d="M927.2169,-474.3968C865.5058,-465.3451 785.6329,-450.7737 758,-433 704.2446,-398.4241 627.3908,-251.3339 588,-201 578.2372,-188.525 566.7116,-174.77 558.0516,-164.6158"/>
 <polygon fill="#191970" stroke="#191970" points="926.9451,-477.8937 937.3423,-475.8568 927.9442,-470.9654 926.9451,-477.8937"/>
 </g>
-<!-- Node13&#45;&gt;Node177 -->
+<!-- Node13&#45;&gt;Node179 -->
 <g id="edge55" class="edge">
-<title>Node13&#45;&gt;Node177</title>
+<title>Node13&#45;&gt;Node179</title>
 <path fill="none" stroke="#191970" d="M972.2909,-470.1C957.0595,-461.6787 940.5824,-449.429 932,-433 915.3403,-401.1086 873.604,-420.6785 939,-201 943.764,-184.9967 954.6824,-168.8804 962.2012,-159.0435"/>
 <polygon fill="#191970" stroke="#191970" points="971.0381,-473.3914 981.5319,-474.8518 974.2393,-467.1662 971.0381,-473.3914"/>
 </g>
-<!-- Node178 -->
+<!-- Node180 -->
 <g id="node21" class="node">
-<title>Node178</title>
+<title>Node180</title>
 <g id="a_node21"><a xlink:href="relay_2base_8h.html" target="_top" xlink:title="Base classes for the Relay IR. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="1056.5,-73 1056.5,-92 1193.5,-92 1193.5,-73 1056.5,-73"/>
 <text text-anchor="middle" x="1125" y="-80" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/relay/base.h</text>
 </a>
 </g>
 </g>
-<!-- Node13&#45;&gt;Node178 -->
+<!-- Node13&#45;&gt;Node180 -->
 <g id="edge40" class="edge">
-<title>Node13&#45;&gt;Node178</title>
+<title>Node13&#45;&gt;Node180</title>
 <path fill="none" stroke="#191970" d="M1084.8452,-476.8147C1142.1639,-469.4074 1212.7699,-455.8742 1231,-433 1242.634,-418.4022 1237.867,-408.3577 1231,-391 1218.334,-358.9843 1125,-312.4301 1125,-278 1125,-278 1125,-278 1125,-216.5 1125,-169.9722 1125,-114.357 1125,-92.2517"/>
 <polygon fill="#191970" stroke="#191970" points="1084.1683,-473.372 1074.6786,-478.0834 1085.0351,-480.3181 1084.1683,-473.372"/>
 </g>
-<!-- Node184 -->
+<!-- Node186 -->
 <g id="node22" class="node">
-<title>Node184</title>
+<title>Node186</title>
 <g id="a_node22"><a xlink:href="var_8h.html" target="_top" xlink:title="Variables in the TIR. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="948.5,-207 948.5,-226 1063.5,-226 1063.5,-207 948.5,-207"/>
 <text text-anchor="middle" x="1006" y="-214" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/var.h</text>
 </a>
 </g>
 </g>
-<!-- Node13&#45;&gt;Node184 -->
+<!-- Node13&#45;&gt;Node186 -->
 <g id="edge56" class="edge">
-<title>Node13&#45;&gt;Node184</title>
+<title>Node13&#45;&gt;Node186</title>
 <path fill="none" stroke="#191970" d="M976.1551,-469.8367C962.1367,-461.2161 946.8181,-448.8702 939,-433 919.9174,-394.2635 982.815,-262.7893 1001.1386,-226.086"/>
 <polygon fill="#191970" stroke="#191970" points="974.6723,-473.0211 985.0876,-474.9632 978.1567,-466.9499 974.6723,-473.0211"/>
 </g>
-<!-- Node186 -->
+<!-- Node188 -->
 <g id="node23" class="node">
-<title>Node186</title>
+<title>Node188</title>
 <g id="a_node23"><a xlink:href="ir_2span_8h.html" target="_top" xlink:title="Span information for debugging purposes. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="1430.5,-402.5 1430.5,-421.5 1549.5,-421.5 1549.5,-402.5 1430.5,-402.5"/>
 <text text-anchor="middle" x="1490" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/ir/span.h</text>
 </a>
 </g>
 </g>
-<!-- Node13&#45;&gt;Node186 -->
+<!-- Node13&#45;&gt;Node188 -->
 <g id="edge32" class="edge">
-<title>Node13&#45;&gt;Node186</title>
+<title>Node13&#45;&gt;Node188</title>
 <path fill="none" stroke="#191970" d="M1063.603,-473.1663C1072.1021,-471.6617 1080.7665,-470.2203 1089,-469 1233.6316,-447.5647 1272.5056,-461.0557 1416,-433 1430.9413,-430.0787 1447.2499,-425.5948 1460.7452,-421.5193"/>
 <polygon fill="#191970" stroke="#191970" points="1062.9543,-469.7268 1053.7376,-474.952 1064.2011,-476.6148 1062.9543,-469.7268"/>
 </g>
-<!-- Node187 -->
+<!-- Node189 -->
 <g id="node24" class="node">
-<title>Node187</title>
+<title>Node189</title>
 <g id="a_node24"><a xlink:href="ir_2type_8h.html" target="_top" xlink:title="IR/AST nodes for the unified type system in TVM. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="1470,-330 1470,-349 1588,-349 1588,-330 1470,-330"/>
 <text text-anchor="middle" x="1529" y="-337" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/ir/type.h</text>
 </a>
 </g>
 </g>
-<!-- Node13&#45;&gt;Node187 -->
+<!-- Node13&#45;&gt;Node189 -->
 <g id="edge38" class="edge">
-<title>Node13&#45;&gt;Node187</title>
+<title>Node13&#45;&gt;Node189</title>
 <path fill="none" stroke="#191970" d="M1084.8309,-478.5144C1235.4775,-466.8915 1550.119,-441.6505 1558,-433 1580.2049,-408.627 1552.1014,-367.4992 1537.1863,-349.0075"/>
 <polygon fill="#191970" stroke="#191970" points="1084.4138,-475.036 1074.7121,-479.2936 1084.9514,-482.0154 1084.4138,-475.036"/>
 </g>
@@ -760,9 +760,9 @@
 <path fill="none" stroke="#191970" d="M1085.0121,-481.4841C1265.5335,-474.3006 1703.8794,-455.117 1850,-433 1860.0295,-431.4819 1870.6037,-429.3508 1880.7751,-427.0236"/>
 <polygon fill="#191970" stroke="#191970" points="1084.648,-477.9957 1074.7944,-481.8888 1084.9251,-484.9902 1084.648,-477.9957"/>
 </g>
-<!-- Node189 -->
+<!-- Node191 -->
 <g id="node26" class="node">
-<title>Node189</title>
+<title>Node191</title>
 <g id="a_node26"><a xlink:href="traced__object__functor_8h.html" target="_top" xlink:title="include/tvm/script\l/printer/traced_object\l_functor.h">
 <polygon fill="#ffffff" stroke="#000000" points="1720.5,-391.5 1720.5,-432.5 1841.5,-432.5 1841.5,-391.5 1720.5,-391.5"/>
 <text text-anchor="start" x="1728.5" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/script</text>
@@ -771,15 +771,15 @@
 </a>
 </g>
 </g>
-<!-- Node13&#45;&gt;Node189 -->
+<!-- Node13&#45;&gt;Node191 -->
 <g id="edge47" class="edge">
-<title>Node13&#45;&gt;Node189</title>
+<title>Node13&#45;&gt;Node191</title>
 <path fill="none" stroke="#191970" d="M1084.9878,-479.4008C1203.5465,-471.426 1434.4715,-454.6479 1630,-433 1659.8583,-429.6942 1692.9223,-425.1918 1720.4804,-421.2037"/>
 <polygon fill="#191970" stroke="#191970" points="1084.6551,-475.9152 1074.9113,-480.0756 1085.1229,-482.8995 1084.6551,-475.9152"/>
 </g>
-<!-- Node190 -->
+<!-- Node192 -->
 <g id="node27" class="node">
-<title>Node190</title>
+<title>Node192</title>
 <g id="a_node27"><a xlink:href="printer_8h.html" target="_top" xlink:title="include/tvm/script\l/printer.h">
 <polygon fill="#ffffff" stroke="#000000" points="1118,-397 1118,-427 1222,-427 1222,-397 1118,-397"/>
 <text text-anchor="start" x="1126" y="-415" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/script</text>
@@ -787,9 +787,9 @@
 </a>
 </g>
 </g>
-<!-- Node13&#45;&gt;Node190 -->
+<!-- Node13&#45;&gt;Node192 -->
 <g id="edge50" class="edge">
-<title>Node13&#45;&gt;Node190</title>
+<title>Node13&#45;&gt;Node192</title>
 <path fill="none" stroke="#191970" d="M1037.1533,-470.728C1065.3387,-458.268 1106.7644,-439.9548 1135.844,-427.0995"/>
 <polygon fill="#191970" stroke="#191970" points="1035.5089,-467.6281 1027.7779,-474.8726 1038.3393,-474.0304 1035.5089,-467.6281"/>
 </g>
@@ -809,99 +809,99 @@
 <path fill="none" stroke="#191970" d="M946.5362,-473.1073C938.6305,-471.6772 930.6249,-470.2717 923,-469 814.4413,-450.8947 777.9304,-479.1162 678,-433 652.2615,-421.1221 653.4078,-406.9885 630,-391 609.3623,-376.9036 584.3853,-363.9368 564.6826,-354.5405"/>
 <polygon fill="#191970" stroke="#191970" points="945.9939,-476.5662 956.4607,-474.9239 947.2543,-469.6806 945.9939,-476.5662"/>
 </g>
-<!-- Node141&#45;&gt;Node16 -->
+<!-- Node143&#45;&gt;Node16 -->
 <g id="edge12" class="edge">
-<title>Node141&#45;&gt;Node16</title>
+<title>Node143&#45;&gt;Node16</title>
 <path fill="none" stroke="#191970" d="M1376.3567,-266.1343C1425.8889,-254.463 1500.5576,-236.8686 1546.4101,-226.0643"/>
 <polygon fill="#191970" stroke="#191970" points="1375.4256,-262.7578 1366.4949,-268.4581 1377.0311,-269.5712 1375.4256,-262.7578"/>
 </g>
-<!-- Node141&#45;&gt;Node149 -->
+<!-- Node143&#45;&gt;Node151 -->
 <g id="edge13" class="edge">
-<title>Node141&#45;&gt;Node149</title>
+<title>Node143&#45;&gt;Node151</title>
 <path fill="none" stroke="#191970" d="M1344.5262,-261.4875C1354.9812,-252.1689 1367.948,-240.6116 1378.1207,-231.5446"/>
 <polygon fill="#191970" stroke="#191970" points="1341.9176,-259.1241 1336.7813,-268.3906 1346.5752,-264.3497 1341.9176,-259.1241"/>
 </g>
-<!-- Node141&#45;&gt;Node150 -->
+<!-- Node143&#45;&gt;Node152 -->
 <g id="edge16" class="edge">
-<title>Node141&#45;&gt;Node150</title>
+<title>Node143&#45;&gt;Node152</title>
 <path fill="none" stroke="#191970" d="M1324.8256,-257.9447C1324.6379,-241.8928 1326.0958,-219.1446 1334,-201 1340.0696,-187.0668 1351.3196,-174.1918 1361.0522,-164.8001"/>
 <polygon fill="#191970" stroke="#191970" points="1321.3397,-258.4152 1325.1844,-268.2878 1328.3355,-258.1725 1321.3397,-258.4152"/>
 </g>
-<!-- Node141&#45;&gt;Node151 -->
+<!-- Node143&#45;&gt;Node153 -->
 <g id="edge17" class="edge">
-<title>Node141&#45;&gt;Node151</title>
+<title>Node143&#45;&gt;Node153</title>
 <path fill="none" stroke="#191970" d="M1321.0995,-258.4481C1311.3233,-219.4428 1289.583,-132.7037 1280.7951,-97.6418"/>
 <polygon fill="#191970" stroke="#191970" points="1317.7438,-259.4561 1323.5701,-268.3051 1324.5338,-257.7542 1317.7438,-259.4561"/>
 </g>
-<!-- Node141&#45;&gt;Node170 -->
+<!-- Node143&#45;&gt;Node172 -->
 <g id="edge18" class="edge">
-<title>Node141&#45;&gt;Node170</title>
+<title>Node143&#45;&gt;Node172</title>
 <path fill="none" stroke="#191970" d="M1257.1047,-270.9382C1135.8237,-258.5069 886.7939,-232.9814 778.3711,-221.868"/>
 <polygon fill="#191970" stroke="#191970" points="1257.0722,-274.4532 1267.377,-271.9911 1257.7861,-267.4896 1257.0722,-274.4532"/>
 </g>
-<!-- Node141&#45;&gt;Node26 -->
+<!-- Node143&#45;&gt;Node26 -->
 <g id="edge25" class="edge">
-<title>Node141&#45;&gt;Node26</title>
+<title>Node143&#45;&gt;Node26</title>
 <path fill="none" stroke="#191970" d="M1256.958,-274.143C1136.6195,-267.2085 880.6856,-251.5221 665,-232 636.8272,-229.45 605.5097,-226.0109 579.7441,-223.0201"/>
 <polygon fill="#191970" stroke="#191970" points="1256.946,-277.648 1267.1301,-274.7269 1257.3472,-270.6595 1256.946,-277.648"/>
 </g>
-<!-- Node141&#45;&gt;Node177 -->
+<!-- Node143&#45;&gt;Node179 -->
 <g id="edge28" class="edge">
-<title>Node141&#45;&gt;Node177</title>
+<title>Node143&#45;&gt;Node179</title>
 <path fill="none" stroke="#191970" d="M1290.0345,-265.0181C1218.4843,-239.1917 1059.7633,-181.9005 996.6412,-159.1163"/>
 <polygon fill="#191970" stroke="#191970" points="1288.9386,-268.3435 1299.533,-268.4466 1291.3153,-261.7593 1288.9386,-268.3435"/>
 </g>
-<!-- Node141&#45;&gt;Node184 -->
+<!-- Node143&#45;&gt;Node186 -->
 <g id="edge30" class="edge">
-<title>Node141&#45;&gt;Node184</title>
+<title>Node143&#45;&gt;Node186</title>
 <path fill="none" stroke="#191970" d="M1266.5127,-266.5673C1205.7211,-254.8839 1112.3949,-236.9478 1055.5021,-226.0137"/>
 <polygon fill="#191970" stroke="#191970" points="1265.8702,-270.0078 1276.3511,-268.4581 1267.1914,-263.1336 1265.8702,-270.0078"/>
 </g>
-<!-- Node149&#45;&gt;Node150 -->
+<!-- Node151&#45;&gt;Node152 -->
 <g id="edge14" class="edge">
-<title>Node149&#45;&gt;Node150</title>
+<title>Node151&#45;&gt;Node152</title>
 <path fill="none" stroke="#191970" d="M1388.9864,-191.3179C1386.8427,-182.3414 1384.4959,-172.5143 1382.6163,-164.6432"/>
 <polygon fill="#191970" stroke="#191970" points="1385.6423,-192.3831 1391.3694,-201.2967 1392.4508,-190.7572 1385.6423,-192.3831"/>
 </g>
-<!-- Node150&#45;&gt;Node151 -->
+<!-- Node152&#45;&gt;Node153 -->
 <g id="edge15" class="edge">
-<title>Node150&#45;&gt;Node151</title>
+<title>Node152&#45;&gt;Node153</title>
 <path fill="none" stroke="#191970" d="M1347.4791,-128.7951C1332.1955,-118.7558 1314.1652,-106.9124 1300.1673,-97.7177"/>
 <polygon fill="#191970" stroke="#191970" points="1345.8295,-131.8991 1356.1092,-134.4639 1349.6726,-126.0484 1345.8295,-131.8991"/>
 </g>
-<!-- Node170&#45;&gt;Node172 -->
+<!-- Node172&#45;&gt;Node174 -->
 <g id="edge19" class="edge">
-<title>Node170&#45;&gt;Node172</title>
+<title>Node172&#45;&gt;Node174</title>
 <path fill="none" stroke="#191970" d="M739.6212,-192.4837C744.8868,-183.1996 750.753,-172.8565 755.4113,-164.6432"/>
 <polygon fill="#191970" stroke="#191970" points="736.5118,-190.8716 734.6228,-201.2967 742.6007,-194.325 736.5118,-190.8716"/>
 </g>
-<!-- Node170&#45;&gt;Node173 -->
+<!-- Node172&#45;&gt;Node175 -->
 <g id="edge23" class="edge">
-<title>Node170&#45;&gt;Node173</title>
+<title>Node172&#45;&gt;Node175</title>
 <path fill="none" stroke="#191970" d="M712.6119,-192.1504C708.7083,-183.7967 704.9661,-174.2238 703,-165 700.1277,-151.5249 699.7558,-147.3904 703,-134 712.698,-93.9712 738.4008,-52.3175 753.1631,-30.6508"/>
 <polygon fill="#191970" stroke="#191970" points="709.5797,-193.913 717.1718,-201.3029 715.8451,-190.7914 709.5797,-193.913"/>
 </g>
-<!-- Node170&#45;&gt;Node174 -->
+<!-- Node172&#45;&gt;Node176 -->
 <g id="edge24" class="edge">
-<title>Node170&#45;&gt;Node174</title>
+<title>Node172&#45;&gt;Node176</title>
 <path fill="none" stroke="#191970" d="M700.7323,-194.5435C690.8703,-185.648 679.6404,-175.1049 670,-165 648.6043,-142.5734 625.9436,-114.5424 612.6548,-97.605"/>
 <polygon fill="#191970" stroke="#191970" points="698.4986,-197.2411 708.2915,-201.2845 703.1575,-192.0167 698.4986,-197.2411"/>
 </g>
-<!-- Node172&#45;&gt;Node173 -->
+<!-- Node174&#45;&gt;Node175 -->
 <g id="edge20" class="edge">
-<title>Node172&#45;&gt;Node173</title>
+<title>Node174&#45;&gt;Node175</title>
 <path fill="none" stroke="#191970" d="M764,-124.3415C764,-96.8131 764,-53.5714 764,-30.7614"/>
 <polygon fill="#191970" stroke="#191970" points="760.5001,-124.3889 764,-134.389 767.5001,-124.389 760.5001,-124.3889"/>
 </g>
-<!-- Node172&#45;&gt;Node174 -->
+<!-- Node174&#45;&gt;Node176 -->
 <g id="edge21" class="edge">
-<title>Node172&#45;&gt;Node174</title>
+<title>Node174&#45;&gt;Node176</title>
 <path fill="none" stroke="#191970" d="M717.9942,-130.5897C692.6006,-120.1518 661.4935,-107.3654 637.7399,-97.6017"/>
 <polygon fill="#191970" stroke="#191970" points="716.8398,-133.8992 727.4196,-134.4639 719.5011,-127.4248 716.8398,-133.8992"/>
 </g>
-<!-- Node174&#45;&gt;Node173 -->
+<!-- Node176&#45;&gt;Node175 -->
 <g id="edge22" class="edge">
-<title>Node174&#45;&gt;Node173</title>
+<title>Node176&#45;&gt;Node175</title>
 <path fill="none" stroke="#191970" d="M647.0058,-63.5897C672.3994,-53.1518 703.5065,-40.3654 727.2601,-30.6017"/>
 <polygon fill="#191970" stroke="#191970" points="645.4989,-60.4248 637.5804,-67.4639 648.1602,-66.8992 645.4989,-60.4248"/>
 </g>
@@ -917,51 +917,51 @@
 <path fill="none" stroke="#191970" d="M533.0583,-191.6103C535.6261,-182.5553 538.4493,-172.5998 540.7057,-164.6432"/>
 <polygon fill="#191970" stroke="#191970" points="529.6725,-190.7211 530.3114,-201.2967 536.4069,-192.6309 529.6725,-190.7211"/>
 </g>
-<!-- Node177&#45;&gt;Node178 -->
+<!-- Node179&#45;&gt;Node180 -->
 <g id="edge29" class="edge">
-<title>Node177&#45;&gt;Node178</title>
+<title>Node179&#45;&gt;Node180</title>
 <path fill="none" stroke="#191970" d="M1001.4353,-135.9118C1031.5615,-122.8895 1076.2642,-103.5664 1102.8335,-92.0817"/>
 <polygon fill="#191970" stroke="#191970" points="999.9981,-132.72 992.2077,-139.9005 1002.7756,-139.1454 999.9981,-132.72"/>
 </g>
-<!-- Node184&#45;&gt;Node177 -->
+<!-- Node186&#45;&gt;Node179 -->
 <g id="edge31" class="edge">
-<title>Node184&#45;&gt;Node177</title>
+<title>Node186&#45;&gt;Node179</title>
 <path fill="none" stroke="#191970" d="M995.9186,-197.7374C989.1613,-185.1614 980.5446,-169.1246 975.1484,-159.0817"/>
 <polygon fill="#191970" stroke="#191970" points="993.0257,-199.7482 1000.8421,-206.9005 999.192,-196.435 993.0257,-199.7482"/>
 </g>
-<!-- Node186&#45;&gt;Node141 -->
+<!-- Node188&#45;&gt;Node143 -->
 <g id="edge33" class="edge">
-<title>Node186&#45;&gt;Node141</title>
+<title>Node188&#45;&gt;Node143</title>
 <path fill="none" stroke="#191970" d="M1470.2482,-395.8613C1435.733,-367.6599 1365.4743,-310.2534 1337.6876,-287.5496"/>
 <polygon fill="#191970" stroke="#191970" points="1468.2472,-398.7461 1478.2055,-402.3631 1472.6763,-393.3255 1468.2472,-398.7461"/>
 </g>
-<!-- Node186&#45;&gt;Node178 -->
+<!-- Node188&#45;&gt;Node180 -->
 <g id="edge37" class="edge">
-<title>Node186&#45;&gt;Node178</title>
+<title>Node188&#45;&gt;Node180</title>
 <path fill="none" stroke="#191970" d="M1430.7161,-400.3694C1336.4686,-380.1612 1163,-335.1978 1163,-278 1163,-278 1163,-278 1163,-216.5 1163,-168.3794 1140.038,-114.1 1129.8127,-92.3143"/>
 <polygon fill="#191970" stroke="#191970" points="1430.1829,-403.834 1440.6907,-402.4792 1431.6315,-396.9855 1430.1829,-403.834"/>
 </g>
-<!-- Node186&#45;&gt;Node187 -->
+<!-- Node188&#45;&gt;Node189 -->
 <g id="edge34" class="edge">
-<title>Node186&#45;&gt;Node187</title>
+<title>Node188&#45;&gt;Node189</title>
 <path fill="none" stroke="#191970" d="M1500.0377,-393.3401C1507.6417,-379.2046 1517.7794,-360.3588 1523.8236,-349.1228"/>
 <polygon fill="#191970" stroke="#191970" points="1496.9148,-391.7576 1495.2597,-402.2223 1503.0795,-395.0738 1496.9148,-391.7576"/>
 </g>
-<!-- Node187&#45;&gt;Node16 -->
+<!-- Node189&#45;&gt;Node16 -->
 <g id="edge35" class="edge">
-<title>Node187&#45;&gt;Node16</title>
+<title>Node189&#45;&gt;Node16</title>
 <path fill="none" stroke="#191970" d="M1537.82,-320.7956C1550.3576,-294.2071 1572.8749,-246.4549 1582.47,-226.1068"/>
 <polygon fill="#191970" stroke="#191970" points="1534.6506,-319.3107 1533.5512,-329.8484 1540.982,-322.2963 1534.6506,-319.3107"/>
 </g>
-<!-- Node187&#45;&gt;Node141 -->
+<!-- Node189&#45;&gt;Node143 -->
 <g id="edge36" class="edge">
-<title>Node187&#45;&gt;Node141</title>
+<title>Node189&#45;&gt;Node143</title>
 <path fill="none" stroke="#191970" d="M1487.4099,-326.9001C1448.8803,-315.2273 1392.4131,-298.1202 1357.496,-287.5419"/>
 <polygon fill="#191970" stroke="#191970" points="1486.696,-330.3408 1497.2813,-329.8906 1488.7256,-323.6415 1486.696,-330.3408"/>
 </g>
-<!-- Node189&#45;&gt;Node173 -->
+<!-- Node191&#45;&gt;Node175 -->
 <g id="edge48" class="edge">
-<title>Node189&#45;&gt;Node173</title>
+<title>Node191&#45;&gt;Node175</title>
 <path fill="none" stroke="#191970" d="M1851.1449,-390.3676C1946.4187,-359.9605 2105,-305.2548 2105,-278 2105,-278 2105,-278 2105,-149.5 2105,-84.0244 1079.4424,-30.5879 822.0515,-18.2076"/>
 <polygon fill="#191970" stroke="#191970" points="1850.0325,-387.0485 1841.5618,-393.4124 1852.1522,-393.7199 1850.0325,-387.0485"/>
 </g>
@@ -971,261 +971,261 @@
 <path fill="none" stroke="#191970" d="M529.9754,-314.2943C528.9659,-289.4615 527.4567,-252.3355 526.6181,-231.7056"/>
 <polygon fill="#191970" stroke="#191970" points="526.4835,-314.5682 530.3869,-324.4178 533.4778,-314.2838 526.4835,-314.5682"/>
 </g>
-<!-- Node191&#45;&gt;Node13 -->
+<!-- Node193&#45;&gt;Node13 -->
 <g id="edge59" class="edge">
-<title>Node191&#45;&gt;Node13</title>
+<title>Node193&#45;&gt;Node13</title>
 <path fill="none" stroke="#191970" d="M1235.2781,-539.5596C1181.2205,-526.578 1095.8917,-506.0869 1045.9924,-494.1039"/>
 <polygon fill="#191970" stroke="#191970" points="1234.7815,-543.0398 1245.3224,-541.9717 1236.4161,-536.2333 1234.7815,-543.0398"/>
 </g>
-<!-- Node191&#45;&gt;Node131 -->
+<!-- Node193&#45;&gt;Node131 -->
 <g id="edge58" class="edge">
-<title>Node191&#45;&gt;Node131</title>
+<title>Node193&#45;&gt;Node131</title>
 <path fill="none" stroke="#191970" d="M1327.2158,-539.1354C1388.2815,-521.5969 1505.0173,-489.3677 1606,-469 1713.4545,-447.327 1742.4167,-454.0244 1850,-433 1858.8995,-431.2608 1868.2649,-429.2079 1877.4086,-427.0781"/>
 <polygon fill="#191970" stroke="#191970" points="1326.1859,-535.7897 1317.5471,-541.9234 1328.1254,-542.5157 1326.1859,-535.7897"/>
 </g>
-<!-- Node202&#45;&gt;Node203 -->
+<!-- Node204&#45;&gt;Node205 -->
 <g id="edge66" class="edge">
-<title>Node202&#45;&gt;Node203</title>
+<title>Node204&#45;&gt;Node205</title>
 <path fill="none" stroke="#191970" d="M2926.7725,-661.7735C2920.7666,-652.4154 2914.0449,-641.9421 2908.7188,-633.6432"/>
 <polygon fill="#191970" stroke="#191970" points="2923.8958,-663.7712 2932.2426,-670.2967 2929.7869,-659.9903 2923.8958,-663.7712"/>
 </g>
-<!-- Node205&#45;&gt;Node202 -->
+<!-- Node207&#45;&gt;Node204 -->
 <g id="edge69" class="edge">
-<title>Node205&#45;&gt;Node202</title>
+<title>Node207&#45;&gt;Node204</title>
 <path fill="none" stroke="#191970" d="M2942,-727.0249C2942,-718.128 2942,-708.4287 2942,-700.6432"/>
 <polygon fill="#191970" stroke="#191970" points="2938.5001,-727.2966 2942,-737.2967 2945.5001,-727.2967 2938.5001,-727.2966"/>
 </g>
-<!-- Node206&#45;&gt;Node170 -->
+<!-- Node208&#45;&gt;Node172 -->
 <g id="edge92" class="edge">
-<title>Node206&#45;&gt;Node170</title>
+<title>Node208&#45;&gt;Node172</title>
 <path fill="none" stroke="#191970" d="M985.6415,-750.9363C814.2442,-746.5626 375.756,-732.4093 318,-701 277.7063,-679.0872 256,-664.3667 256,-618.5 256,-618.5 256,-618.5 256,-484.5 256,-434.6385 275.6942,-417.3893 318,-391 422.5861,-325.7619 482.9844,-410.5954 593,-355 650.7215,-325.831 697.3381,-261.3335 716.5323,-231.8034"/>
 <polygon fill="#191970" stroke="#191970" points="985.7209,-754.4394 995.8058,-751.1922 985.8971,-747.4416 985.7209,-754.4394"/>
 </g>
-<!-- Node206&#45;&gt;Node177 -->
+<!-- Node208&#45;&gt;Node179 -->
 <g id="edge93" class="edge">
-<title>Node206&#45;&gt;Node177</title>
+<title>Node208&#45;&gt;Node179</title>
 <path fill="none" stroke="#191970" d="M985.6852,-746.9812C805.6576,-731.5269 332,-684.5864 332,-618.5 332,-618.5 332,-618.5 332,-484.5 332,-439.1247 336.8398,-418.4109 373,-391 464.5633,-321.5914 520.7197,-393.4287 629,-355 770.0279,-304.9492 916.4082,-192.5782 958.2228,-159.0928"/>
 <polygon fill="#191970" stroke="#191970" points="985.5711,-750.4841 995.8321,-747.8456 986.1653,-743.5094 985.5711,-750.4841"/>
 </g>
-<!-- Node206&#45;&gt;Node184 -->
+<!-- Node208&#45;&gt;Node186 -->
 <g id="edge94" class="edge">
-<title>Node206&#45;&gt;Node184</title>
+<title>Node208&#45;&gt;Node186</title>
 <path fill="none" stroke="#191970" d="M985.6074,-743.4274C844.5479,-720.6127 523.3074,-646.5326 389,-433 329.2247,-337.9644 482.1175,-434.1981 744,-355 846.4353,-324.0217 955.3742,-251.9515 992.625,-226.0142"/>
 <polygon fill="#191970" stroke="#191970" points="985.4296,-746.9426 995.8532,-745.0452 986.5215,-740.0282 985.4296,-746.9426"/>
 </g>
-<!-- Node206&#45;&gt;Node187 -->
+<!-- Node208&#45;&gt;Node189 -->
 <g id="edge71" class="edge">
-<title>Node206&#45;&gt;Node187</title>
+<title>Node208&#45;&gt;Node189</title>
 <path fill="none" stroke="#191970" d="M1055.3119,-727.291C1056.6702,-710.7319 1059.4249,-688.7464 1065,-670 1086.1073,-599.0257 1099.9502,-573.1548 1164,-536 1262.0178,-479.1406 1299.9081,-495.838 1410,-469 1483.7669,-451.0173 1529.3025,-492.0753 1577,-433 1588.7264,-418.4764 1583.8391,-408.3687 1577,-391 1570.1038,-373.4864 1554.1345,-358.4072 1542.5037,-349.1712"/>
 <polygon fill="#191970" stroke="#191970" points="1051.8182,-727.0788 1054.6054,-737.3005 1058.8009,-727.5718 1051.8182,-727.0788"/>
 </g>
-<!-- Node206&#45;&gt;Node131 -->
+<!-- Node208&#45;&gt;Node131 -->
 <g id="edge72" class="edge">
-<title>Node206&#45;&gt;Node131</title>
+<title>Node208&#45;&gt;Node131</title>
 <path fill="none" stroke="#191970" d="M1122.323,-747.2866C1220.2581,-739.2094 1395.3202,-722.3519 1455,-701 1507.8298,-682.0988 1517.509,-668.1876 1562,-634 1612.6168,-595.1052 1614.3165,-572.046 1667,-536 1723.5998,-497.2744 1743.6971,-497.6629 1806,-469 1837.9029,-454.3228 1874.5692,-438.3241 1900.8109,-427.0228"/>
 <polygon fill="#191970" stroke="#191970" points="1121.9059,-743.8089 1112.2238,-748.111 1122.4754,-750.7857 1121.9059,-743.8089"/>
 </g>
-<!-- Node206&#45;&gt;Node191 -->
+<!-- Node208&#45;&gt;Node193 -->
 <g id="edge73" class="edge">
-<title>Node206&#45;&gt;Node191</title>
+<title>Node208&#45;&gt;Node193</title>
 <path fill="none" stroke="#191970" d="M1059.5146,-727.5695C1068.1045,-694.4717 1087.8111,-636.3327 1126,-603 1151.3578,-580.8667 1186.3814,-568.2176 1217.1045,-561.0021"/>
 <polygon fill="#191970" stroke="#191970" points="1056.0932,-726.8262 1057.107,-737.3725 1062.8912,-728.4959 1056.0932,-726.8262"/>
 </g>
-<!-- Node206&#45;&gt;Node209 -->
+<!-- Node208&#45;&gt;Node211 -->
 <g id="edge74" class="edge">
-<title>Node206&#45;&gt;Node209</title>
+<title>Node208&#45;&gt;Node211</title>
 <path fill="none" stroke="#191970" d="M1122.1244,-738.8343C1183.2774,-726.5671 1272.0736,-708.7547 1329.929,-697.149"/>
 <polygon fill="#191970" stroke="#191970" points="1121.2946,-735.431 1112.1784,-740.8295 1122.6714,-742.2942 1121.2946,-735.431"/>
 </g>
-<!-- Node206&#45;&gt;Node215 -->
+<!-- Node208&#45;&gt;Node217 -->
 <g id="edge91" class="edge">
-<title>Node206&#45;&gt;Node215</title>
+<title>Node208&#45;&gt;Node217</title>
 <path fill="none" stroke="#191970" d="M1122.3449,-747.7493C1234.2959,-739.5139 1451.7374,-721.4905 1526,-701 1583.6596,-685.0906 1646.0937,-652.6951 1680.1747,-633.574"/>
 <polygon fill="#191970" stroke="#191970" points="1121.8946,-744.2728 1112.1761,-748.4918 1122.4044,-751.2542 1121.8946,-744.2728"/>
 </g>
-<!-- Node209&#45;&gt;Node8 -->
+<!-- Node211&#45;&gt;Node8 -->
 <g id="edge75" class="edge">
-<title>Node209&#45;&gt;Node8</title>
+<title>Node211&#45;&gt;Node8</title>
 <path fill="none" stroke="#191970" d="M1377.5902,-660.9825C1368.2285,-642.0424 1352.6146,-616.805 1331,-603 1274.1779,-566.7083 799.3158,-512.1677 733,-500 579.7957,-471.8898 526.8058,-505.6037 389,-433 336.5876,-405.3863 294,-398.7417 294,-339.5 294,-339.5 294,-339.5 294,-278 294,-236.9315 296.0664,-188.5157 297.2288,-164.5303"/>
 <polygon fill="#191970" stroke="#191970" points="1374.5046,-662.6465 1381.9145,-670.219 1380.8442,-659.6784 1374.5046,-662.6465"/>
 </g>
-<!-- Node209&#45;&gt;Node191 -->
+<!-- Node211&#45;&gt;Node193 -->
 <g id="edge76" class="edge">
-<title>Node209&#45;&gt;Node191</title>
+<title>Node211&#45;&gt;Node193</title>
 <path fill="none" stroke="#191970" d="M1385.3419,-660.3486C1382.447,-642.8596 1376.4353,-619.7686 1364,-603 1349.6641,-583.6684 1325.7436,-569.557 1307.9728,-561.0449"/>
 <polygon fill="#191970" stroke="#191970" points="1381.8991,-660.9992 1386.7713,-670.4074 1388.8295,-660.0144 1381.8991,-660.9992"/>
 </g>
-<!-- Node209&#45;&gt;Node211 -->
+<!-- Node211&#45;&gt;Node213 -->
 <g id="edge77" class="edge">
-<title>Node209&#45;&gt;Node211</title>
+<title>Node211&#45;&gt;Node213</title>
 <path fill="none" stroke="#191970" d="M1456.0664,-679.6012C1550.4404,-670.8753 1715.8721,-653.4483 1773,-634 1820.2107,-617.9279 1869.3068,-585.5909 1895.9151,-566.5239"/>
 <polygon fill="#191970" stroke="#191970" points="1455.6807,-676.1218 1446.0419,-680.52 1456.3197,-683.0926 1455.6807,-676.1218"/>
 </g>
-<!-- Node209&#45;&gt;Node215 -->
+<!-- Node211&#45;&gt;Node217 -->
 <g id="edge80" class="edge">
-<title>Node209&#45;&gt;Node215</title>
+<title>Node211&#45;&gt;Node217</title>
 <path fill="none" stroke="#191970" d="M1456.3349,-671.1024C1513.3638,-659.0869 1593.7651,-642.147 1647.7751,-630.7675"/>
 <polygon fill="#191970" stroke="#191970" points="1455.3897,-667.7246 1446.3261,-673.2112 1456.8329,-674.5742 1455.3897,-667.7246"/>
 </g>
-<!-- Node209&#45;&gt;Node221 -->
+<!-- Node211&#45;&gt;Node223 -->
 <g id="edge88" class="edge">
-<title>Node209&#45;&gt;Node221</title>
+<title>Node211&#45;&gt;Node223</title>
 <path fill="none" stroke="#191970" d="M1345.2778,-665.7743C1325.4833,-655.5335 1302.8324,-643.2132 1286.4016,-633.7177"/>
 <polygon fill="#191970" stroke="#191970" points="1343.9356,-669.0192 1354.4315,-670.4639 1347.1274,-662.7892 1343.9356,-669.0192"/>
 </g>
-<!-- Node209&#45;&gt;Node222 -->
+<!-- Node211&#45;&gt;Node224 -->
 <g id="edge90" class="edge">
-<title>Node209&#45;&gt;Node222</title>
+<title>Node211&#45;&gt;Node224</title>
 <path fill="none" stroke="#191970" d="M1418.6246,-664.3469C1433.0578,-654.3776 1449.9467,-642.7121 1463.0763,-633.6432"/>
 <polygon fill="#191970" stroke="#191970" points="1416.2497,-661.7335 1410.0108,-670.2967 1420.228,-667.4932 1416.2497,-661.7335"/>
 </g>
-<!-- Node211&#45;&gt;Node212 -->
+<!-- Node213&#45;&gt;Node214 -->
 <g id="edge78" class="edge">
-<title>Node211&#45;&gt;Node212</title>
+<title>Node213&#45;&gt;Node214</title>
 <path fill="none" stroke="#191970" d="M1984.3908,-537.1807C2039.753,-525.5892 2117.1835,-509.3772 2171.4816,-498.0085"/>
 <polygon fill="#191970" stroke="#191970" points="1983.3913,-533.814 1974.3208,-539.2891 1984.8259,-540.6654 1983.3913,-533.814"/>
 </g>
-<!-- Node211&#45;&gt;Node213 -->
+<!-- Node213&#45;&gt;Node215 -->
 <g id="edge79" class="edge">
-<title>Node211&#45;&gt;Node213</title>
+<title>Node213&#45;&gt;Node215</title>
 <path fill="none" stroke="#191970" d="M1900.7725,-527.7735C1894.7666,-518.4154 1888.0449,-507.9421 1882.7188,-499.6432"/>
 <polygon fill="#191970" stroke="#191970" points="1897.8958,-529.7712 1906.2426,-536.2967 1903.7869,-525.9903 1897.8958,-529.7712"/>
 </g>
-<!-- Node215&#45;&gt;Node6 -->
+<!-- Node217&#45;&gt;Node6 -->
 <g id="edge81" class="edge">
-<title>Node215&#45;&gt;Node6</title>
+<title>Node217&#45;&gt;Node6</title>
 <path fill="none" stroke="#191970" d="M1646.1972,-600.5657C1580.0096,-581.4514 1469.868,-551.8105 1373,-536 1177.1674,-504.0368 1124.714,-525.9932 928,-500 764.922,-478.4514 725.03,-466.5874 564,-433 555.2043,-431.1654 545.95,-429.1286 536.8587,-427.0657"/>
 <polygon fill="#191970" stroke="#191970" points="1645.4558,-603.9949 1656.0352,-603.4229 1647.4081,-597.2727 1645.4558,-603.9949"/>
 </g>
-<!-- Node215&#45;&gt;Node189 -->
+<!-- Node217&#45;&gt;Node191 -->
 <g id="edge87" class="edge">
-<title>Node215&#45;&gt;Node189</title>
+<title>Node217&#45;&gt;Node191</title>
 <path fill="none" stroke="#191970" d="M1683.6665,-595.6413C1676.9688,-587.3061 1670.482,-577.3658 1667,-567 1662.6127,-553.9394 1661.9959,-548.8369 1667,-536 1684.0469,-492.2701 1724.5286,-454.6327 1752.3594,-432.6373"/>
 <polygon fill="#191970" stroke="#191970" points="1681.1594,-598.094 1690.3311,-603.3979 1686.4688,-593.5321 1681.1594,-598.094"/>
 </g>
-<!-- Node215&#45;&gt;Node191 -->
+<!-- Node217&#45;&gt;Node193 -->
 <g id="edge82" class="edge">
-<title>Node215&#45;&gt;Node191</title>
+<title>Node217&#45;&gt;Node193</title>
 <path fill="none" stroke="#191970" d="M1637.993,-608.013C1571.6366,-597.7355 1467.8448,-581.5401 1378,-567 1366.2044,-565.0911 1353.5738,-563.0104 1341.5435,-561.0113"/>
 <polygon fill="#191970" stroke="#191970" points="1637.4915,-611.477 1647.9093,-609.548 1638.5624,-604.5594 1637.4915,-611.477"/>
 </g>
-<!-- Node215&#45;&gt;Node211 -->
+<!-- Node217&#45;&gt;Node213 -->
 <g id="edge83" class="edge">
-<title>Node215&#45;&gt;Node211</title>
+<title>Node217&#45;&gt;Node213</title>
 <path fill="none" stroke="#191970" d="M1762.6928,-600.4123C1795.8618,-589.8298 1837.2547,-576.6235 1868.6664,-566.6017"/>
 <polygon fill="#191970" stroke="#191970" points="1761.5911,-597.0899 1753.1281,-603.4639 1763.7188,-603.7587 1761.5911,-597.0899"/>
 </g>
-<!-- Node215&#45;&gt;Node213 -->
+<!-- Node217&#45;&gt;Node215 -->
 <g id="edge86" class="edge">
-<title>Node215&#45;&gt;Node213</title>
+<title>Node217&#45;&gt;Node215</title>
 <path fill="none" stroke="#191970" d="M1686.8483,-595.3371C1674.6815,-577.6625 1663.5347,-553.4605 1677,-536 1693.8125,-514.1993 1763.3186,-499.8067 1814.809,-491.912"/>
 <polygon fill="#191970" stroke="#191970" points="1684.056,-597.4479 1692.7996,-603.4312 1689.6956,-593.3012 1684.056,-597.4479"/>
 </g>
-<!-- Node215&#45;&gt;Node216 -->
+<!-- Node217&#45;&gt;Node218 -->
 <g id="edge84" class="edge">
-<title>Node215&#45;&gt;Node216</title>
+<title>Node217&#45;&gt;Node218</title>
 <path fill="none" stroke="#191970" d="M1713.2671,-594.1932C1718.0159,-584.9844 1724.0637,-574.771 1729.6578,-566.6432"/>
 <polygon fill="#191970" stroke="#191970" points="1710.0673,-592.7706 1708.862,-603.2967 1716.3684,-595.8197 1710.0673,-592.7706"/>
 </g>
-<!-- Node216&#45;&gt;Node215 -->
+<!-- Node218&#45;&gt;Node217 -->
 <g id="edge85" class="edge">
-<title>Node216&#45;&gt;Node215</title>
+<title>Node218&#45;&gt;Node217</title>
 <path fill="none" stroke="#191970" d="M1736.768,-575.7386C1732.0256,-584.9425 1725.9799,-595.1586 1720.3836,-603.2967"/>
 <polygon fill="#191970" stroke="#191970" points="1739.9637,-577.1697 1741.1649,-566.6432 1733.6614,-574.1231 1739.9637,-577.1697"/>
 </g>
-<!-- Node221&#45;&gt;Node209 -->
+<!-- Node223&#45;&gt;Node211 -->
 <g id="edge89" class="edge">
-<title>Node221&#45;&gt;Node209</title>
+<title>Node223&#45;&gt;Node211</title>
 <path fill="none" stroke="#191970" d="M1306.8329,-638.283C1326.7276,-648.5779 1349.4876,-660.9619 1365.9125,-670.4639"/>
 <polygon fill="#191970" stroke="#191970" points="1308.4223,-635.1647 1297.9267,-633.7177 1305.2292,-641.394 1308.4223,-635.1647"/>
 </g>
-<!-- Node224&#45;&gt;Node169 -->
+<!-- Node226&#45;&gt;Node171 -->
 <g id="edge99" class="edge">
-<title>Node224&#45;&gt;Node169</title>
+<title>Node226&#45;&gt;Node171</title>
 <path fill="none" stroke="#191970" d="M2322,-727.0249C2322,-718.128 2322,-708.4287 2322,-700.6432"/>
 <polygon fill="#191970" stroke="#191970" points="2318.5001,-727.2966 2322,-737.2967 2325.5001,-727.2967 2318.5001,-727.2966"/>
 </g>
-<!-- Node226&#45;&gt;Node13 -->
+<!-- Node228&#45;&gt;Node13 -->
 <g id="edge108" class="edge">
-<title>Node226&#45;&gt;Node13</title>
+<title>Node228&#45;&gt;Node13</title>
 <path fill="none" stroke="#191970" d="M1681.8639,-750.9859C1523.1565,-745.9496 1135.8043,-724.4574 1050,-634 1011.4572,-593.3671 1006.4771,-520.6986 1005.9827,-494.3688"/>
 <polygon fill="#191970" stroke="#191970" points="1681.8793,-754.4879 1691.9814,-751.2944 1682.0927,-747.4911 1681.8793,-754.4879"/>
 </g>
-<!-- Node226&#45;&gt;Node16 -->
+<!-- Node228&#45;&gt;Node16 -->
 <g id="edge103" class="edge">
-<title>Node226&#45;&gt;Node16</title>
+<title>Node228&#45;&gt;Node16</title>
 <path fill="none" stroke="#191970" d="M1802.9694,-733.785C1824.9827,-725.0886 1850.3851,-713.8254 1872,-701 1891.0422,-689.7011 1892.5457,-682.2356 1911,-670 1938.8482,-651.536 1952.1961,-656.3878 1977,-634 2060.4325,-558.6945 2114.3446,-480.9811 2047,-391 1994.4767,-320.8221 1730.4802,-251.0042 1627.8738,-226.0805"/>
 <polygon fill="#191970" stroke="#191970" points="1801.5042,-730.599 1793.4426,-737.4738 1804.0318,-737.1268 1801.5042,-730.599"/>
 </g>
-<!-- Node226&#45;&gt;Node141 -->
+<!-- Node228&#45;&gt;Node143 -->
 <g id="edge104" class="edge">
-<title>Node226&#45;&gt;Node141</title>
+<title>Node228&#45;&gt;Node143</title>
 <path fill="none" stroke="#191970" d="M1681.8688,-747.4449C1578.3509,-739.2369 1386.5601,-721.815 1321,-701 1306.0739,-696.261 1205.1209,-647.3902 1197,-634 1174.4125,-596.7563 1169.1386,-570.2892 1196,-536 1237.2144,-483.3888 1277.2505,-516.5551 1342,-500 1454.6953,-471.1862 1522.8782,-524.2629 1595,-433 1625.0419,-394.985 1628.2632,-361.0171 1597,-324 1583.4325,-307.9354 1460.0928,-292.187 1384.5631,-283.934"/>
 <polygon fill="#191970" stroke="#191970" points="1681.6651,-750.9396 1691.9086,-748.2341 1682.2137,-743.9612 1681.6651,-750.9396"/>
 </g>
-<!-- Node226&#45;&gt;Node186 -->
+<!-- Node228&#45;&gt;Node188 -->
 <g id="edge105" class="edge">
-<title>Node226&#45;&gt;Node186</title>
+<title>Node228&#45;&gt;Node188</title>
 <path fill="none" stroke="#191970" d="M1681.4069,-748.5673C1569.091,-741.5345 1351.1266,-725.2342 1278,-701 1222.2143,-682.5126 1193.1874,-685.009 1164,-634 1142.3684,-596.1958 1139.2751,-571.8576 1164,-536 1181.6595,-510.389 1381.0359,-445.9922 1459.0089,-421.5823"/>
 <polygon fill="#191970" stroke="#191970" points="1681.4122,-752.0742 1691.6094,-749.1988 1681.8448,-745.0876 1681.4122,-752.0742"/>
 </g>
-<!-- Node226&#45;&gt;Node187 -->
+<!-- Node228&#45;&gt;Node189 -->
 <g id="edge106" class="edge">
-<title>Node226&#45;&gt;Node187</title>
+<title>Node228&#45;&gt;Node189</title>
 <path fill="none" stroke="#191970" d="M1786.1114,-732.3471C1801.9772,-723.1821 1820.673,-711.954 1837,-701 1855.9706,-688.2724 1858.7698,-682.3319 1878,-670 1906.4556,-651.7519 1918.7963,-655.3561 1945,-634 1975.0837,-609.4818 1985.0977,-602.4019 2001,-567 2033.258,-495.1867 2073.6902,-449.4934 2021,-391 1992.5111,-359.3734 1713.0858,-345.8455 1588.4667,-341.3516"/>
 <polygon fill="#191970" stroke="#191970" points="1784.0745,-729.4803 1777.1363,-737.4873 1787.5534,-735.5547 1784.0745,-729.4803"/>
 </g>
-<!-- Node226&#45;&gt;Node131 -->
+<!-- Node228&#45;&gt;Node131 -->
 <g id="edge107" class="edge">
-<title>Node226&#45;&gt;Node131</title>
+<title>Node228&#45;&gt;Node131</title>
 <path fill="none" stroke="#191970" d="M1773.0295,-729.99C1790.8248,-712.9454 1816.3332,-689.2592 1840,-670 1900.7521,-620.5622 1947.2975,-636.7155 1983,-567 2007.6097,-518.9453 1968.8673,-455.7852 1947.9581,-427.2085"/>
 <polygon fill="#191970" stroke="#191970" points="1770.2016,-727.8546 1765.4303,-737.3143 1775.0593,-732.8947 1770.2016,-727.8546"/>
 </g>
-<!-- Node226&#45;&gt;Node191 -->
+<!-- Node228&#45;&gt;Node193 -->
 <g id="edge109" class="edge">
-<title>Node226&#45;&gt;Node191</title>
+<title>Node228&#45;&gt;Node193</title>
 <path fill="none" stroke="#191970" d="M1728.9733,-729.838C1696.5477,-696.3921 1630.8476,-634.3934 1562,-603 1551.9445,-598.4148 1418.0003,-574.631 1340.1246,-561.0449"/>
 <polygon fill="#191970" stroke="#191970" points="1726.6722,-732.4957 1736.1147,-737.3009 1731.7297,-727.6561 1726.6722,-732.4957"/>
 </g>
-<!-- Node226&#45;&gt;Node197 -->
+<!-- Node228&#45;&gt;Node199 -->
 <g id="edge110" class="edge">
-<title>Node226&#45;&gt;Node197</title>
+<title>Node228&#45;&gt;Node199</title>
 <path fill="none" stroke="#191970" d="M1817.5642,-734.886C1866.5296,-722.1207 1930.8564,-705.3507 1970.5394,-695.0053"/>
 <polygon fill="#191970" stroke="#191970" points="1816.4694,-731.5543 1807.6758,-737.4639 1818.2354,-738.3279 1816.4694,-731.5543"/>
 </g>
-<!-- Node226&#45;&gt;Node209 -->
+<!-- Node228&#45;&gt;Node211 -->
 <g id="edge113" class="edge">
-<title>Node226&#45;&gt;Node209</title>
+<title>Node228&#45;&gt;Node211</title>
 <path fill="none" stroke="#191970" d="M1681.6439,-739.8485C1613.5705,-727.2492 1510.2132,-708.1196 1446.0172,-696.238"/>
 <polygon fill="#191970" stroke="#191970" points="1681.3684,-743.3568 1691.8384,-741.7353 1682.6424,-736.4737 1681.3684,-743.3568"/>
 </g>
-<!-- Node226&#45;&gt;Node213 -->
+<!-- Node228&#45;&gt;Node215 -->
 <g id="edge115" class="edge">
-<title>Node226&#45;&gt;Node213</title>
+<title>Node228&#45;&gt;Node215</title>
 <path fill="none" stroke="#191970" d="M1761.138,-727.9907C1779.4027,-687.8336 1816.9013,-605.5319 1849,-536 1854.6228,-523.8199 1861.0448,-510.0465 1865.83,-499.8102"/>
 <polygon fill="#191970" stroke="#191970" points="1757.877,-726.7068 1756.9239,-737.2587 1764.2492,-729.6043 1757.877,-726.7068"/>
 </g>
-<!-- Node226&#45;&gt;Node215 -->
+<!-- Node228&#45;&gt;Node217 -->
 <g id="edge114" class="edge">
-<title>Node226&#45;&gt;Node215</title>
+<title>Node228&#45;&gt;Node217</title>
 <path fill="none" stroke="#191970" d="M1737.3016,-727.8496C1733.3157,-719.4752 1729.1411,-709.9662 1726,-701 1717.9601,-678.0502 1711.9757,-650.4119 1708.7371,-633.6627"/>
 <polygon fill="#191970" stroke="#191970" points="1734.2732,-729.6232 1741.8207,-737.0586 1740.5573,-726.5394 1734.2732,-729.6232"/>
 </g>
-<!-- Node226&#45;&gt;Node216 -->
+<!-- Node228&#45;&gt;Node218 -->
 <g id="edge112" class="edge">
-<title>Node226&#45;&gt;Node216</title>
+<title>Node228&#45;&gt;Node218</title>
 <path fill="none" stroke="#191970" d="M1753.7576,-727.3244C1756.3302,-710.9363 1759.992,-689.1289 1764,-670 1767.3821,-653.8581 1770.9849,-650.3689 1773,-634 1774.6834,-620.3255 1776.4944,-616.3273 1773,-603 1769.5648,-589.8983 1761.9619,-576.6154 1755.4026,-566.8216"/>
 <polygon fill="#191970" stroke="#191970" points="1750.2954,-726.8121 1752.2317,-737.2285 1757.2137,-727.8781 1750.2954,-726.8121"/>
 </g>
-<!-- Node226&#45;&gt;Node222 -->
+<!-- Node228&#45;&gt;Node224 -->
 <g id="edge116" class="edge">
-<title>Node226&#45;&gt;Node222</title>
+<title>Node228&#45;&gt;Node224</title>
 <path fill="none" stroke="#191970" d="M1682.0137,-740.3296C1648.0724,-732.3767 1607.2672,-719.8752 1574,-701 1542.4816,-683.117 1513.14,-652.0225 1497.2781,-633.5421"/>
 <polygon fill="#191970" stroke="#191970" points="1681.3247,-743.7621 1691.8504,-742.5536 1682.8684,-736.9344 1681.3247,-743.7621"/>
 </g>
-<!-- Node226&#45;&gt;Node169 -->
+<!-- Node228&#45;&gt;Node171 -->
 <g id="edge111" class="edge">
-<title>Node226&#45;&gt;Node169</title>
+<title>Node228&#45;&gt;Node171</title>
 <path fill="none" stroke="#191970" d="M1818.2269,-744.5084C1931.7642,-731.2094 2157.4961,-704.7688 2263.9176,-692.3034"/>
 <polygon fill="#191970" stroke="#191970" points="1817.7847,-741.0362 1808.2598,-745.6759 1818.5991,-747.9886 1817.7847,-741.0362"/>
 </g>
diff --git a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1ScheduleRule-members.html b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1ScheduleRule-members.html
index 1d909ba5d2..8972d40105 100644
--- a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1ScheduleRule-members.html
+++ b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1ScheduleRule-members.html
@@ -71,26 +71,28 @@ $(function() {
 <table class="directory">
   <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#a3e9b0901b6e01257b060a45e159cc37e">_type_is_nullable</a></td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
   <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#ac88a36846b8653f9ad41218a44bec110">AddRFactor</a>(int max_jobs_per_core, Optional&lt; Integer &gt; max_innermost_factor)</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#a2d76fa1fb628ff276a284e61123589c5">as</a>() const</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a96f8ddfecce77dae00b9c958c2d514f5">AutoBind</a>(int max_threadblocks, Array&lt; Integer &gt; thread_extents, int max_threads_per_block=-1)</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a73a8c07ad4fa26d5c3e28f33c2215f1d">AutoInline</a>(bool into_producer, bool into_consumer, bool inline_const_tensor, bool disallow_if_then_else, bool require_injective, bool require_ordered, Optional&lt; Array&lt; String &gt;&gt; disallow_op)</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="en [...]
-  <tr><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#aa5c355fbb7d2f7402ee360dba8a52cdd">ContainerType</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a14acfc5ea272e2e53f9ac3e1110e53ea">CrossThreadReduction</a>(Array&lt; Integer &gt; thread_extents)</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#ac261cdb80487fb29ac42b28678f8cbef">data_</a></td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">protected</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a77ab3dd14cbfcec7ed059559f7afc372">DefaultCUDA</a>()</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a2abd71c2f3600573784d855d3cd63814">DefaultCUDATensorCore</a>()</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#acd4de1f7ace3a34603f8832ae1b3180b">DefaultHexagon</a>()</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a031b6dcad67f1d985aa30adb13e2b6e8">DefaultLLVM</a>()</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#a17d8d5ad92691f9e18e3e0ae8ef69e4f">defined</a>() const</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#acd04bb22a6861e9952c344ee8547411f">DowncastNoCheck</a>(ObjectRef ref)</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span><span class="mlabel">static</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a2c558d23de2ff6bf298bc7167a210859">FApply</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"></td></tr>
-  <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a4c02153b06f9c5577114d719747a7b1a">FAsString</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a7bed37c51b09c7e58ce8f25d601bc24f">FClone</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"></td></tr>
-  <tr><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#a22e5bb9d64dbc773bb9263b70882239e">FFIClearAfterMove</a>(ObjectRef *ref)</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span><span class="mlabel">static</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a19b2fb7007e375c8fc39168b7ee071aa">FInitializeWithTuneContext</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"></td></tr>
-  <tr><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#aadbc0886ffa80162ff31eefd0431ba09">get</a>() const</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#ae423057ecf93c18714d17f53cd1d318f">get_mutable</a>() const</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#aed593996e4076632450de8fde776707c">GetDataPtr</a>(const ObjectRef &amp;ref)</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span><span class="mlabel">static</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a4f50de278ec0889780dc9d7066cda499">ApplyCustomRule</a>()</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#a2d76fa1fb628ff276a284e61123589c5">as</a>() const</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a96f8ddfecce77dae00b9c958c2d514f5">AutoBind</a>(int max_threadblocks, Array&lt; Integer &gt; thread_extents, int max_threads_per_block=-1)</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a73a8c07ad4fa26d5c3e28f33c2215f1d">AutoInline</a>(bool into_producer, bool into_consumer, bool inline_const_tensor, bool disallow_if_then_else, bool require_injective, bool require_ordered, Optional&lt; Array&lt; String &gt;&gt; disallow_op)</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"><span cl [...]
+  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#aa5c355fbb7d2f7402ee360dba8a52cdd">ContainerType</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"></td></tr>
+  <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a14acfc5ea272e2e53f9ac3e1110e53ea">CrossThreadReduction</a>(Array&lt; Integer &gt; thread_extents)</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#ac261cdb80487fb29ac42b28678f8cbef">data_</a></td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">protected</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a77ab3dd14cbfcec7ed059559f7afc372">DefaultCUDA</a>()</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a2abd71c2f3600573784d855d3cd63814">DefaultCUDATensorCore</a>()</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#acd4de1f7ace3a34603f8832ae1b3180b">DefaultHexagon</a>()</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a031b6dcad67f1d985aa30adb13e2b6e8">DefaultLLVM</a>()</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#a17d8d5ad92691f9e18e3e0ae8ef69e4f">defined</a>() const</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#acd04bb22a6861e9952c344ee8547411f">DowncastNoCheck</a>(ObjectRef ref)</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span><span class="mlabel">static</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a2c558d23de2ff6bf298bc7167a210859">FApply</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a4c02153b06f9c5577114d719747a7b1a">FAsString</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"></td></tr>
+  <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a7bed37c51b09c7e58ce8f25d601bc24f">FClone</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#a22e5bb9d64dbc773bb9263b70882239e">FFIClearAfterMove</a>(ObjectRef *ref)</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span><span class="mlabel">static</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a19b2fb7007e375c8fc39168b7ee071aa">FInitializeWithTuneContext</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#aadbc0886ffa80162ff31eefd0431ba09">get</a>() const</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#ae423057ecf93c18714d17f53cd1d318f">get_mutable</a>() const</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#aed593996e4076632450de8fde776707c">GetDataPtr</a>(const ObjectRef &amp;ref)</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span><span class="mlabel">static</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#abebbe9b3c71f3c7f0346641e0b7e96ad">IsApplyCustomRule</a>(const ScheduleRule &amp;rule)</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_schedule::ScheduleRule</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#aaa910aa414fd65947b08badf1ec7e3fa">MultiLevelTiling</a>(String structure, Optional&lt; Array&lt; String &gt;&gt; tile_binds, Optional&lt; Integer &gt; max_innermost_factor, Optional&lt; Array&lt; Integer &gt;&gt; vector_load_lens, Optional&lt; Map&lt; String, ObjectRef &gt;&gt; reuse_read, Optional&lt; Map&lt; String, ObjectRef &gt;&gt; reuse_write)</td><td class="entry"><a class="el" [...]
   <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a57a6551c51df77b91de6b89661f0e7c9">MultiLevelTilingTensorCore</a>(Array&lt; Map&lt; String, String &gt;&gt; intrin_groups, String structure, Optional&lt; Array&lt; String &gt;&gt; tile_binds, Optional&lt; Integer &gt; max_innermost_factor, Optional&lt; Array&lt; Integer &gt;&gt; vector_load_lens, Optional&lt; Map&lt; String, ObjectRef &gt;&gt; reuse_read, Optional&lt; Map&lt; String, ObjectRef &gt [...]
   <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a08251350067dc524a1362ec723691a18">MultiLevelTilingWideVector</a>(String structure, Integer vector_length_in_bits, Optional&lt; Integer &gt; max_innermost_factor, Optional&lt; Map&lt; String, ObjectRef &gt;&gt; reuse_read, Optional&lt; Map&lt; String, ObjectRef &gt;&gt; reuse_write)</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">tvm::meta_sc [...]
diff --git a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1ScheduleRule.html b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1ScheduleRule.html
index 29876ff848..ba416d16ba 100644
--- a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1ScheduleRule.html
+++ b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1ScheduleRule.html
@@ -79,13 +79,13 @@ $(function() {
 <div class="dynheader">
 Inheritance diagram for tvm::meta_schedule::ScheduleRule:</div>
 <div class="dyncontent">
-<div class="center"><iframe scrolling="no" frameborder="0" src="classtvm_1_1meta__schedule_1_1ScheduleRule__inherit__graph.svg" width="235" height="698"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
+<div class="center"><iframe scrolling="no" frameborder="0" src="classtvm_1_1meta__schedule_1_1ScheduleRule__inherit__graph.svg" width="235" height="639"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
 </div>
 </div>
 <div class="dynheader">
 Collaboration diagram for tvm::meta_schedule::ScheduleRule:</div>
 <div class="dyncontent">
-<div class="center"><iframe scrolling="no" frameborder="0" src="classtvm_1_1meta__schedule_1_1ScheduleRule__coll__graph.svg" width="235" height="986"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
+<div class="center"><iframe scrolling="no" frameborder="0" src="classtvm_1_1meta__schedule_1_1ScheduleRule__coll__graph.svg" width="235" height="927"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
 </div>
 </div>
 <table class="memberdecls">
@@ -148,6 +148,12 @@ Public Member Functions</h2></td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-static-methods"></a>
 Static Public Member Functions</h2></td></tr>
+<tr class="memitem:a4f50de278ec0889780dc9d7066cda499"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">ScheduleRule</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a4f50de278ec0889780dc9d7066cda499">ApplyCustomRule</a> ()</td></tr>
+<tr class="memdesc:a4f50de278ec0889780dc9d7066cda499"><td class="mdescLeft">&#160;</td><td class="mdescRight">Create a rule that applies customized rules registered using block attribute <code>schedule_rule</code>. The rule will be dispatched according to target keys.  <a href="#a4f50de278ec0889780dc9d7066cda499">More...</a><br /></td></tr>
+<tr class="separator:a4f50de278ec0889780dc9d7066cda499"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:abebbe9b3c71f3c7f0346641e0b7e96ad"><td class="memItemLeft" align="right" valign="top">static bool&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#abebbe9b3c71f3c7f0346641e0b7e96ad">IsApplyCustomRule</a> (const <a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">ScheduleRule</a> &amp;rule)</td></tr>
+<tr class="memdesc:abebbe9b3c71f3c7f0346641e0b7e96ad"><td class="mdescLeft">&#160;</td><td class="mdescRight">Check if the rule is <code>ApplyCustomRule</code>  <a href="#abebbe9b3c71f3c7f0346641e0b7e96ad">More...</a><br /></td></tr>
+<tr class="separator:abebbe9b3c71f3c7f0346641e0b7e96ad"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a73a8c07ad4fa26d5c3e28f33c2215f1d"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">ScheduleRule</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html#a73a8c07ad4fa26d5c3e28f33c2215f1d">AutoInline</a> (bool into_producer, bool into_consumer, bool inline_const_tensor, bool disallow_if_then_else, bool require_injective, bool r [...]
 <tr class="memdesc:a73a8c07ad4fa26d5c3e28f33c2215f1d"><td class="mdescLeft">&#160;</td><td class="mdescRight">Create an auto-inline rule that inlines spatial blocks if it satisfies some conditions.  <a href="#a73a8c07ad4fa26d5c3e28f33c2215f1d">More...</a><br /></td></tr>
 <tr class="separator:a73a8c07ad4fa26d5c3e28f33c2215f1d"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -348,6 +354,34 @@ Additional Inherited Members</h2></td></tr>
 </dl>
 <dl class="section return"><dt>Returns</dt><dd>The schedule rule created </dd></dl>
 
+</div>
+</div>
+<a id="a4f50de278ec0889780dc9d7066cda499"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a4f50de278ec0889780dc9d7066cda499">&#9670;&nbsp;</a></span>ApplyCustomRule()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static <a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">ScheduleRule</a> tvm::meta_schedule::ScheduleRule::ApplyCustomRule </td>
+          <td>(</td>
+          <td class="paramname"></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">static</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+<p>Create a rule that applies customized rules registered using block attribute <code>schedule_rule</code>. The rule will be dispatched according to target keys. </p>
+<dl class="section return"><dt>Returns</dt><dd>The created schedule rule. </dd></dl>
+
 </div>
 </div>
 <a id="a96f8ddfecce77dae00b9c958c2d514f5"></a>
@@ -625,6 +659,34 @@ Additional Inherited Members</h2></td></tr>
 
 <p>Create default schedule rules for LLVM. </p>
 
+</div>
+</div>
+<a id="abebbe9b3c71f3c7f0346641e0b7e96ad"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#abebbe9b3c71f3c7f0346641e0b7e96ad">&#9670;&nbsp;</a></span>IsApplyCustomRule()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static bool tvm::meta_schedule::ScheduleRule::IsApplyCustomRule </td>
+          <td>(</td>
+          <td class="paramtype">const <a class="el" href="classtvm_1_1meta__schedule_1_1ScheduleRule.html">ScheduleRule</a> &amp;&#160;</td>
+          <td class="paramname"><em>rule</em></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">static</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+<p>Check if the rule is <code>ApplyCustomRule</code> </p>
+
 </div>
 </div>
 <a id="aaa910aa414fd65947b08badf1ec7e3fa"></a>
diff --git a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1ScheduleRule__coll__graph.svg b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1ScheduleRule__coll__graph.svg
index 71eb46f929..287165a1c1 100644
--- a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1ScheduleRule__coll__graph.svg
+++ b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1ScheduleRule__coll__graph.svg
@@ -4,102 +4,98 @@
 <!-- Generated by graphviz version 2.40.1 (20161225.0304)
  -->
 <!-- Title: tvm::meta_schedule::ScheduleRule Pages: 1 -->
-<svg width="176pt" height="739pt"
- viewBox="0.00 0.00 176.00 739.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
-<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 735)">
+<svg width="176pt" height="695pt"
+ viewBox="0.00 0.00 176.00 695.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 691)">
 <title>tvm::meta_schedule::ScheduleRule</title>
-<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-735 172,-735 172,4 -4,4"/>
+<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-691 172,-691 172,4 -4,4"/>
 <!-- Node2 -->
 <g id="node1" class="node">
 <title>Node2</title>
-<polygon fill="#bfbfbf" stroke="#000000" points="0,-.5 0,-244.5 168,-244.5 168,-.5 0,-.5"/>
-<text text-anchor="start" x="8" y="-232.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::meta_schedule</text>
-<text text-anchor="middle" x="84" y="-221.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">::ScheduleRule</text>
-<polyline fill="none" stroke="#000000" points="0,-214.5 168,-214.5 "/>
-<text text-anchor="middle" x="84" y="-202.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
-<polyline fill="none" stroke="#000000" points="0,-195.5 168,-195.5 "/>
-<text text-anchor="start" x="8" y="-183.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TVM_DEFINE_MUTABLE</text>
-<text text-anchor="start" x="8" y="-172.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_OBJECT_REF_METHODS()</text>
-<text text-anchor="start" x="8" y="-161.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ AutoInline()</text>
-<text text-anchor="start" x="8" y="-150.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ MultiLevelTiling()</text>
-<text text-anchor="start" x="8" y="-139.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ MultiLevelTilingWithIntrin()</text>
-<text text-anchor="start" x="8" y="-128.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ MultiLevelTilingTensorCore()</text>
-<text text-anchor="start" x="8" y="-117.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ MultiLevelTilingWideVector()</text>
-<text text-anchor="start" x="8" y="-106.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ AddRFactor()</text>
-<text text-anchor="start" x="8" y="-95.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CrossThreadReduction()</text>
-<text text-anchor="start" x="8" y="-84.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ RandomComputeLocation()</text>
-<text text-anchor="start" x="8" y="-73.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ParallelizeVectorizeUnroll()</text>
-<text text-anchor="start" x="8" y="-62.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ AutoBind()</text>
-<text text-anchor="start" x="8" y="-51.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ PyScheduleRule()</text>
-<text text-anchor="start" x="8" y="-40.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ DefaultLLVM()</text>
-<text text-anchor="start" x="8" y="-29.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ DefaultCUDA()</text>
-<text text-anchor="start" x="8" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ DefaultCUDATensorCore()</text>
-<text text-anchor="start" x="8" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ DefaultHexagon()</text>
+<polygon fill="#bfbfbf" stroke="#000000" points="0,-.5 0,-200.5 168,-200.5 168,-.5 0,-.5"/>
+<text text-anchor="start" x="8" y="-188.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::meta_schedule</text>
+<text text-anchor="middle" x="84" y="-177.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">::ScheduleRule</text>
+<polyline fill="none" stroke="#000000" points="0,-170.5 168,-170.5 "/>
+<text text-anchor="middle" x="84" y="-158.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+<polyline fill="none" stroke="#000000" points="0,-151.5 168,-151.5 "/>
+<text text-anchor="start" x="8" y="-139.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TVM_DEFINE_MUTABLE</text>
+<text text-anchor="start" x="8" y="-128.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_OBJECT_REF_METHODS()</text>
+<text text-anchor="start" x="8" y="-117.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ApplyCustomRule()</text>
+<text text-anchor="start" x="8" y="-106.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ IsApplyCustomRule()</text>
+<text text-anchor="start" x="8" y="-95.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ AutoInline()</text>
+<text text-anchor="start" x="8" y="-84.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ MultiLevelTiling()</text>
+<text text-anchor="start" x="8" y="-73.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ MultiLevelTilingWithIntrin()</text>
+<text text-anchor="start" x="8" y="-62.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ MultiLevelTilingTensorCore()</text>
+<text text-anchor="start" x="8" y="-51.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ MultiLevelTilingWideVector()</text>
+<text text-anchor="start" x="8" y="-40.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ AddRFactor()</text>
+<text text-anchor="start" x="8" y="-29.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CrossThreadReduction()</text>
+<text text-anchor="start" x="8" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ RandomComputeLocation()</text>
+<text text-anchor="start" x="8" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">and 7 more...</text>
 </g>
 <!-- Node3 -->
 <g id="node2" class="node">
 <title>Node3</title>
 <g id="a_node2"><a xlink:href="classtvm_1_1runtime_1_1ObjectRef.html" target="_top" xlink:title="Base class of all object reference. ">
-<polygon fill="#ffffff" stroke="#000000" points="17,-282.5 17,-504.5 151,-504.5 151,-282.5 17,-282.5"/>
-<text text-anchor="middle" x="84" y="-492.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::ObjectRef</text>
-<polyline fill="none" stroke="#000000" points="17,-485.5 151,-485.5 "/>
-<text text-anchor="start" x="25" y="-473.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_is_nullable</text>
-<polyline fill="none" stroke="#000000" points="17,-466.5 151,-466.5 "/>
-<text text-anchor="start" x="25" y="-454.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ObjectRef()</text>
-<text text-anchor="start" x="25" y="-443.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ObjectRef()</text>
-<text text-anchor="start" x="25" y="-432.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ same_as()</text>
-<text text-anchor="start" x="25" y="-421.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator==()</text>
-<text text-anchor="start" x="25" y="-410.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator!=()</text>
-<text text-anchor="start" x="25" y="-399.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator&lt;()</text>
-<text text-anchor="start" x="25" y="-388.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ defined()</text>
-<text text-anchor="start" x="25" y="-377.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ get()</text>
-<text text-anchor="start" x="25" y="-366.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator&#45;&gt;()</text>
-<text text-anchor="start" x="25" y="-355.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ unique()</text>
-<text text-anchor="start" x="25" y="-344.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ use_count()</text>
-<text text-anchor="start" x="25" y="-333.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ as()</text>
-<text text-anchor="start" x="25" y="-322.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># get_mutable()</text>
-<text text-anchor="start" x="25" y="-311.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># DowncastNoCheck()</text>
-<text text-anchor="start" x="25" y="-300.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># FFIClearAfterMove()</text>
-<text text-anchor="start" x="25" y="-289.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># GetDataPtr()</text>
+<polygon fill="#ffffff" stroke="#000000" points="17,-238.5 17,-460.5 151,-460.5 151,-238.5 17,-238.5"/>
+<text text-anchor="middle" x="84" y="-448.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::ObjectRef</text>
+<polyline fill="none" stroke="#000000" points="17,-441.5 151,-441.5 "/>
+<text text-anchor="start" x="25" y="-429.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_is_nullable</text>
+<polyline fill="none" stroke="#000000" points="17,-422.5 151,-422.5 "/>
+<text text-anchor="start" x="25" y="-410.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ObjectRef()</text>
+<text text-anchor="start" x="25" y="-399.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ObjectRef()</text>
+<text text-anchor="start" x="25" y="-388.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ same_as()</text>
+<text text-anchor="start" x="25" y="-377.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator==()</text>
+<text text-anchor="start" x="25" y="-366.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator!=()</text>
+<text text-anchor="start" x="25" y="-355.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator&lt;()</text>
+<text text-anchor="start" x="25" y="-344.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ defined()</text>
+<text text-anchor="start" x="25" y="-333.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ get()</text>
+<text text-anchor="start" x="25" y="-322.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator&#45;&gt;()</text>
+<text text-anchor="start" x="25" y="-311.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ unique()</text>
+<text text-anchor="start" x="25" y="-300.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ use_count()</text>
+<text text-anchor="start" x="25" y="-289.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ as()</text>
+<text text-anchor="start" x="25" y="-278.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># get_mutable()</text>
+<text text-anchor="start" x="25" y="-267.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># DowncastNoCheck()</text>
+<text text-anchor="start" x="25" y="-256.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># FFIClearAfterMove()</text>
+<text text-anchor="start" x="25" y="-245.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># GetDataPtr()</text>
 </a>
 </g>
 </g>
 <!-- Node3&#45;&gt;Node2 -->
 <g id="edge1" class="edge">
 <title>Node3&#45;&gt;Node2</title>
-<path fill="none" stroke="#191970" d="M84,-272.2451C84,-263.1286 84,-253.9251 84,-244.8022"/>
-<polygon fill="none" stroke="#191970" points="80.5001,-272.492 84,-282.492 87.5001,-272.492 80.5001,-272.492"/>
+<path fill="none" stroke="#191970" d="M84,-228.4182C84,-219.1346 84,-209.8256 84,-200.698"/>
+<polygon fill="none" stroke="#191970" points="80.5001,-228.4721 84,-238.4721 87.5001,-228.4721 80.5001,-228.4721"/>
 </g>
 <!-- Node4 -->
 <g id="node3" class="node">
 <title>Node4</title>
 <g id="a_node3"><a xlink:href="classtvm_1_1runtime_1_1ObjectPtr.html" target="_top" xlink:title="{tvm::runtime::ObjectPtr\l\&lt; tvm::runtime::Object \&gt;\n||+ ObjectPtr()\l+ ObjectPtr()\l+ ObjectPtr()\l+ ObjectPtr()\l+ ObjectPtr()\l+ ObjectPtr()\l+ ~ObjectPtr()\l+ swap()\l+ get()\l+ operator&#45;\&gt;()\land 11 more...\l}">
-<polygon fill="#ffffff" stroke="#000000" points="14,-552.5 14,-730.5 154,-730.5 154,-552.5 14,-552.5"/>
-<text text-anchor="start" x="22" y="-718.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::ObjectPtr</text>
-<text text-anchor="middle" x="84" y="-707.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">&lt; tvm::runtime::Object &gt;</text>
-<polyline fill="none" stroke="#000000" points="14,-700.5 154,-700.5 "/>
-<text text-anchor="middle" x="84" y="-688.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
-<polyline fill="none" stroke="#000000" points="14,-681.5 154,-681.5 "/>
-<text text-anchor="start" x="22" y="-669.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ObjectPtr()</text>
-<text text-anchor="start" x="22" y="-658.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ObjectPtr()</text>
-<text text-anchor="start" x="22" y="-647.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ObjectPtr()</text>
-<text text-anchor="start" x="22" y="-636.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ObjectPtr()</text>
+<polygon fill="#ffffff" stroke="#000000" points="14,-508.5 14,-686.5 154,-686.5 154,-508.5 14,-508.5"/>
+<text text-anchor="start" x="22" y="-674.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::ObjectPtr</text>
+<text text-anchor="middle" x="84" y="-663.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">&lt; tvm::runtime::Object &gt;</text>
+<polyline fill="none" stroke="#000000" points="14,-656.5 154,-656.5 "/>
+<text text-anchor="middle" x="84" y="-644.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+<polyline fill="none" stroke="#000000" points="14,-637.5 154,-637.5 "/>
 <text text-anchor="start" x="22" y="-625.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ObjectPtr()</text>
 <text text-anchor="start" x="22" y="-614.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ObjectPtr()</text>
-<text text-anchor="start" x="22" y="-603.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ~ObjectPtr()</text>
-<text text-anchor="start" x="22" y="-592.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ swap()</text>
-<text text-anchor="start" x="22" y="-581.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ get()</text>
-<text text-anchor="start" x="22" y="-570.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator&#45;&gt;()</text>
-<text text-anchor="start" x="22" y="-559.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">and 11 more...</text>
+<text text-anchor="start" x="22" y="-603.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ObjectPtr()</text>
+<text text-anchor="start" x="22" y="-592.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ObjectPtr()</text>
+<text text-anchor="start" x="22" y="-581.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ObjectPtr()</text>
+<text text-anchor="start" x="22" y="-570.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ObjectPtr()</text>
+<text text-anchor="start" x="22" y="-559.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ~ObjectPtr()</text>
+<text text-anchor="start" x="22" y="-548.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ swap()</text>
+<text text-anchor="start" x="22" y="-537.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ get()</text>
+<text text-anchor="start" x="22" y="-526.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator&#45;&gt;()</text>
+<text text-anchor="start" x="22" y="-515.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">and 11 more...</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node3 -->
 <g id="edge2" class="edge">
 <title>Node4&#45;&gt;Node3</title>
-<path fill="none" stroke="#404040" d="M84,-552.3167C84,-540.8765 84,-529.0062 84,-517.1402"/>
-<polygon fill="none" stroke="#404040" points="84.0001,-516.7944 80,-510.7944 84,-504.7944 88,-510.7943 84.0001,-516.7944"/>
-<text text-anchor="middle" x="103.5" y="-526" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> #data_</text>
+<path fill="none" stroke="#404040" d="M84,-508.3167C84,-496.8765 84,-485.0062 84,-473.1402"/>
+<polygon fill="none" stroke="#404040" points="84.0001,-472.7944 80,-466.7944 84,-460.7944 88,-466.7943 84.0001,-472.7944"/>
+<text text-anchor="middle" x="103.5" y="-482" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> #data_</text>
 </g>
 </g>
 </svg>
diff --git a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1ScheduleRule__inherit__graph.svg b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1ScheduleRule__inherit__graph.svg
index 4826c8ebec..1c0139d91e 100644
--- a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1ScheduleRule__inherit__graph.svg
+++ b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1ScheduleRule__inherit__graph.svg
@@ -4,72 +4,68 @@
 <!-- Generated by graphviz version 2.40.1 (20161225.0304)
  -->
 <!-- Title: tvm::meta_schedule::ScheduleRule Pages: 1 -->
-<svg width="176pt" height="523pt"
- viewBox="0.00 0.00 176.00 523.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
-<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 519)">
+<svg width="176pt" height="479pt"
+ viewBox="0.00 0.00 176.00 479.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 475)">
 <title>tvm::meta_schedule::ScheduleRule</title>
-<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-519 172,-519 172,4 -4,4"/>
+<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-475 172,-475 172,4 -4,4"/>
 <!-- Node0 -->
 <g id="node1" class="node">
 <title>Node0</title>
-<polygon fill="#bfbfbf" stroke="#000000" points="0,-.5 0,-244.5 168,-244.5 168,-.5 0,-.5"/>
-<text text-anchor="start" x="8" y="-232.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::meta_schedule</text>
-<text text-anchor="middle" x="84" y="-221.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">::ScheduleRule</text>
-<polyline fill="none" stroke="#000000" points="0,-214.5 168,-214.5 "/>
-<text text-anchor="middle" x="84" y="-202.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
-<polyline fill="none" stroke="#000000" points="0,-195.5 168,-195.5 "/>
-<text text-anchor="start" x="8" y="-183.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TVM_DEFINE_MUTABLE</text>
-<text text-anchor="start" x="8" y="-172.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_OBJECT_REF_METHODS()</text>
-<text text-anchor="start" x="8" y="-161.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ AutoInline()</text>
-<text text-anchor="start" x="8" y="-150.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ MultiLevelTiling()</text>
-<text text-anchor="start" x="8" y="-139.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ MultiLevelTilingWithIntrin()</text>
-<text text-anchor="start" x="8" y="-128.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ MultiLevelTilingTensorCore()</text>
-<text text-anchor="start" x="8" y="-117.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ MultiLevelTilingWideVector()</text>
-<text text-anchor="start" x="8" y="-106.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ AddRFactor()</text>
-<text text-anchor="start" x="8" y="-95.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CrossThreadReduction()</text>
-<text text-anchor="start" x="8" y="-84.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ RandomComputeLocation()</text>
-<text text-anchor="start" x="8" y="-73.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ParallelizeVectorizeUnroll()</text>
-<text text-anchor="start" x="8" y="-62.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ AutoBind()</text>
-<text text-anchor="start" x="8" y="-51.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ PyScheduleRule()</text>
-<text text-anchor="start" x="8" y="-40.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ DefaultLLVM()</text>
-<text text-anchor="start" x="8" y="-29.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ DefaultCUDA()</text>
-<text text-anchor="start" x="8" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ DefaultCUDATensorCore()</text>
-<text text-anchor="start" x="8" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ DefaultHexagon()</text>
+<polygon fill="#bfbfbf" stroke="#000000" points="0,-.5 0,-200.5 168,-200.5 168,-.5 0,-.5"/>
+<text text-anchor="start" x="8" y="-188.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::meta_schedule</text>
+<text text-anchor="middle" x="84" y="-177.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">::ScheduleRule</text>
+<polyline fill="none" stroke="#000000" points="0,-170.5 168,-170.5 "/>
+<text text-anchor="middle" x="84" y="-158.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+<polyline fill="none" stroke="#000000" points="0,-151.5 168,-151.5 "/>
+<text text-anchor="start" x="8" y="-139.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TVM_DEFINE_MUTABLE</text>
+<text text-anchor="start" x="8" y="-128.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_OBJECT_REF_METHODS()</text>
+<text text-anchor="start" x="8" y="-117.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ApplyCustomRule()</text>
+<text text-anchor="start" x="8" y="-106.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ IsApplyCustomRule()</text>
+<text text-anchor="start" x="8" y="-95.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ AutoInline()</text>
+<text text-anchor="start" x="8" y="-84.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ MultiLevelTiling()</text>
+<text text-anchor="start" x="8" y="-73.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ MultiLevelTilingWithIntrin()</text>
+<text text-anchor="start" x="8" y="-62.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ MultiLevelTilingTensorCore()</text>
+<text text-anchor="start" x="8" y="-51.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ MultiLevelTilingWideVector()</text>
+<text text-anchor="start" x="8" y="-40.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ AddRFactor()</text>
+<text text-anchor="start" x="8" y="-29.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CrossThreadReduction()</text>
+<text text-anchor="start" x="8" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ RandomComputeLocation()</text>
+<text text-anchor="start" x="8" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">and 7 more...</text>
 </g>
 <!-- Node1 -->
 <g id="node2" class="node">
 <title>Node1</title>
 <g id="a_node2"><a xlink:href="classtvm_1_1runtime_1_1ObjectRef.html" target="_top" xlink:title="Base class of all object reference. ">
-<polygon fill="#ffffff" stroke="#000000" points="17,-281.5 17,-514.5 151,-514.5 151,-281.5 17,-281.5"/>
-<text text-anchor="middle" x="84" y="-502.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::ObjectRef</text>
-<polyline fill="none" stroke="#000000" points="17,-495.5 151,-495.5 "/>
-<text text-anchor="start" x="25" y="-483.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_is_nullable</text>
-<text text-anchor="start" x="25" y="-472.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># data_</text>
-<polyline fill="none" stroke="#000000" points="17,-465.5 151,-465.5 "/>
-<text text-anchor="start" x="25" y="-453.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ObjectRef()</text>
-<text text-anchor="start" x="25" y="-442.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ObjectRef()</text>
-<text text-anchor="start" x="25" y="-431.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ same_as()</text>
-<text text-anchor="start" x="25" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator==()</text>
-<text text-anchor="start" x="25" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator!=()</text>
-<text text-anchor="start" x="25" y="-398.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator&lt;()</text>
-<text text-anchor="start" x="25" y="-387.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ defined()</text>
-<text text-anchor="start" x="25" y="-376.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ get()</text>
-<text text-anchor="start" x="25" y="-365.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator&#45;&gt;()</text>
-<text text-anchor="start" x="25" y="-354.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ unique()</text>
-<text text-anchor="start" x="25" y="-343.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ use_count()</text>
-<text text-anchor="start" x="25" y="-332.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ as()</text>
-<text text-anchor="start" x="25" y="-321.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># get_mutable()</text>
-<text text-anchor="start" x="25" y="-310.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># DowncastNoCheck()</text>
-<text text-anchor="start" x="25" y="-299.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># FFIClearAfterMove()</text>
-<text text-anchor="start" x="25" y="-288.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># GetDataPtr()</text>
+<polygon fill="#ffffff" stroke="#000000" points="17,-237.5 17,-470.5 151,-470.5 151,-237.5 17,-237.5"/>
+<text text-anchor="middle" x="84" y="-458.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::ObjectRef</text>
+<polyline fill="none" stroke="#000000" points="17,-451.5 151,-451.5 "/>
+<text text-anchor="start" x="25" y="-439.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_is_nullable</text>
+<text text-anchor="start" x="25" y="-428.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># data_</text>
+<polyline fill="none" stroke="#000000" points="17,-421.5 151,-421.5 "/>
+<text text-anchor="start" x="25" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ObjectRef()</text>
+<text text-anchor="start" x="25" y="-398.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ObjectRef()</text>
+<text text-anchor="start" x="25" y="-387.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ same_as()</text>
+<text text-anchor="start" x="25" y="-376.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator==()</text>
+<text text-anchor="start" x="25" y="-365.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator!=()</text>
+<text text-anchor="start" x="25" y="-354.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator&lt;()</text>
+<text text-anchor="start" x="25" y="-343.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ defined()</text>
+<text text-anchor="start" x="25" y="-332.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ get()</text>
+<text text-anchor="start" x="25" y="-321.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator&#45;&gt;()</text>
+<text text-anchor="start" x="25" y="-310.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ unique()</text>
+<text text-anchor="start" x="25" y="-299.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ use_count()</text>
+<text text-anchor="start" x="25" y="-288.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ as()</text>
+<text text-anchor="start" x="25" y="-277.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># get_mutable()</text>
+<text text-anchor="start" x="25" y="-266.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># DowncastNoCheck()</text>
+<text text-anchor="start" x="25" y="-255.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># FFIClearAfterMove()</text>
+<text text-anchor="start" x="25" y="-244.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># GetDataPtr()</text>
 </a>
 </g>
 </g>
 <!-- Node1&#45;&gt;Node0 -->
 <g id="edge1" class="edge">
 <title>Node1&#45;&gt;Node0</title>
-<path fill="none" stroke="#191970" d="M84,-270.9732C84,-262.2632 84,-253.4957 84,-244.811"/>
-<polygon fill="none" stroke="#191970" points="80.5001,-271.1632 84,-281.1632 87.5001,-271.1632 80.5001,-271.1632"/>
+<path fill="none" stroke="#191970" d="M84,-227.2283C84,-218.3287 84,-209.4293 84,-200.7056"/>
+<polygon fill="none" stroke="#191970" points="80.5001,-227.2668 84,-237.2668 87.5001,-227.2669 80.5001,-227.2668"/>
 </g>
 </g>
 </svg>
diff --git a/docs/reference/api/doxygen/data__type_8h__dep__incl.svg b/docs/reference/api/doxygen/data__type_8h__dep__incl.svg
index f587f3a5ee..c95836002c 100644
--- a/docs/reference/api/doxygen/data__type_8h__dep__incl.svg
+++ b/docs/reference/api/doxygen/data__type_8h__dep__incl.svg
@@ -47,9 +47,9 @@
 <path fill="none" stroke="#191970" d="M1430.5134,-1008.8628C1667.1294,-1006.3486 2448.9674,-995.3485 2697,-958 2925.7734,-923.5515 3059.029,-1007.4425 3200,-824 3208.3953,-813.0754 3207.6842,-804.4359 3200,-793 3146.9313,-714.0211 2502.0242,-492.2787 2324.8064,-432.5044"/>
 <polygon fill="#191970" stroke="#191970" points="1430.2885,-1005.3649 1420.3257,-1008.9694 1430.3618,-1012.3645 1430.2885,-1005.3649"/>
 </g>
-<!-- Node174 -->
+<!-- Node176 -->
 <g id="node30" class="node">
-<title>Node174</title>
+<title>Node176</title>
 <g id="a_node30"><a xlink:href="doc_8h.html" target="_top" xlink:title="include/tvm/script\l/printer/doc.h">
 <polygon fill="#ffffff" stroke="#ff0000" points="1398,-536.5 1398,-566.5 1502,-566.5 1502,-536.5 1398,-536.5"/>
 <text text-anchor="start" x="1406" y="-554.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/script</text>
@@ -57,60 +57,60 @@
 </a>
 </g>
 </g>
-<!-- Node8&#45;&gt;Node174 -->
+<!-- Node8&#45;&gt;Node176 -->
 <g id="edge154" class="edge">
-<title>Node8&#45;&gt;Node174</title>
+<title>Node8&#45;&gt;Node176</title>
 <path fill="none" stroke="#191970" d="M1362,-984.1092C1362,-957.1594 1362,-913.3265 1362,-875.5 1362,-875.5 1362,-875.5 1362,-674.5 1362,-628.5509 1403.1224,-587.9449 1429.1452,-566.849"/>
 <polygon fill="#191970" stroke="#191970" points="1358.5001,-984.227 1362,-994.2271 1365.5001,-984.2271 1358.5001,-984.227"/>
 </g>
-<!-- Node181 -->
+<!-- Node183 -->
 <g id="node31" class="node">
-<title>Node181</title>
+<title>Node183</title>
 <g id="a_node31"><a xlink:href="tir_2expr_8h.html" target="_top" xlink:title="TIR expressions. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="667.5,-475 667.5,-494 788.5,-494 788.5,-475 667.5,-475"/>
 <text text-anchor="middle" x="728" y="-482" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/expr.h</text>
 </a>
 </g>
 </g>
-<!-- Node8&#45;&gt;Node181 -->
+<!-- Node8&#45;&gt;Node183 -->
 <g id="edge155" class="edge">
-<title>Node8&#45;&gt;Node181</title>
+<title>Node8&#45;&gt;Node183</title>
 <path fill="none" stroke="#191970" d="M1293.2343,-1006.1878C1166.9786,-999.5841 903.0807,-983.2098 815,-958 736.1758,-935.4395 650,-957.4892 650,-875.5 650,-875.5 650,-875.5 650,-613 650,-577.7513 650.6498,-566.0955 669,-536 679.8296,-518.2388 698.7888,-503.3123 712.292,-494.1764"/>
 <polygon fill="#191970" stroke="#191970" points="1293.3815,-1009.7 1303.5491,-1006.7215 1293.7434,-1002.7094 1293.3815,-1009.7"/>
 </g>
-<!-- Node188 -->
+<!-- Node190 -->
 <g id="node32" class="node">
-<title>Node188</title>
+<title>Node190</title>
 <g id="a_node32"><a xlink:href="var_8h.html" target="_top" xlink:title="Variables in the TIR. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="678.5,-542 678.5,-561 793.5,-561 793.5,-542 678.5,-542"/>
 <text text-anchor="middle" x="736" y="-549" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/var.h</text>
 </a>
 </g>
 </g>
-<!-- Node8&#45;&gt;Node188 -->
+<!-- Node8&#45;&gt;Node190 -->
 <g id="edge156" class="edge">
-<title>Node8&#45;&gt;Node188</title>
+<title>Node8&#45;&gt;Node190</title>
 <path fill="none" stroke="#191970" d="M1293.6906,-1008.0834C1139.7375,-1003.0405 779,-980.0674 779,-875.5 779,-875.5 779,-875.5 779,-674.5 779,-630.1619 753.7942,-581.6162 741.979,-561.2959"/>
 <polygon fill="#191970" stroke="#191970" points="1293.8642,-1011.5904 1303.9683,-1008.4034 1294.0821,-1004.5938 1293.8642,-1011.5904"/>
 </g>
-<!-- Node190 -->
+<!-- Node192 -->
 <g id="node33" class="node">
-<title>Node190</title>
+<title>Node192</title>
 <g id="a_node33"><a xlink:href="reflection_8h.html" target="_top" xlink:title="Reflection and serialization of compiler IR/AST nodes. ">
 <polygon fill="#ffffff" stroke="#000000" points="2078.5,-799 2078.5,-818 2237.5,-818 2237.5,-799 2078.5,-799"/>
 <text text-anchor="middle" x="2158" y="-806" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/node/reflection.h</text>
 </a>
 </g>
 </g>
-<!-- Node8&#45;&gt;Node190 -->
+<!-- Node8&#45;&gt;Node192 -->
 <g id="edge80" class="edge">
-<title>Node8&#45;&gt;Node190</title>
+<title>Node8&#45;&gt;Node192</title>
 <path fill="none" stroke="#191970" d="M1430.3209,-1008.881C1654.3196,-1006.4996 2365.0448,-996.0599 2591,-958 2693.0503,-940.8106 2752.242,-974.0402 2814,-891 2822.2221,-879.9445 2823.3533,-870.1164 2814,-860 2794.6842,-839.1084 2408.0428,-819.5397 2237.571,-811.8904"/>
 <polygon fill="#191970" stroke="#191970" points="1429.989,-1005.3842 1420.0261,-1008.9884 1430.0621,-1012.3838 1429.989,-1005.3842"/>
 </g>
-<!-- Node201 -->
+<!-- Node203 -->
 <g id="node41" class="node">
-<title>Node201</title>
+<title>Node203</title>
 <g id="a_node41"><a xlink:href="structural__equal_8h.html" target="_top" xlink:title="Structural equality comparison. ">
 <polygon fill="#ffffff" stroke="#000000" points="955.5,-860.5 955.5,-890.5 1106.5,-890.5 1106.5,-860.5 955.5,-860.5"/>
 <text text-anchor="start" x="963.5" y="-878.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/node/structural</text>
@@ -118,15 +118,15 @@
 </a>
 </g>
 </g>
-<!-- Node8&#45;&gt;Node201 -->
+<!-- Node8&#45;&gt;Node203 -->
 <g id="edge115" class="edge">
-<title>Node8&#45;&gt;Node201</title>
+<title>Node8&#45;&gt;Node203</title>
 <path fill="none" stroke="#191970" d="M1315.2927,-990.5913C1248.9743,-963.7434 1128.8534,-915.1144 1068.1901,-890.5558"/>
 <polygon fill="#191970" stroke="#191970" points="1314.0909,-993.8806 1324.6735,-994.389 1316.7177,-987.3922 1314.0909,-993.8806"/>
 </g>
-<!-- Node202 -->
+<!-- Node204 -->
 <g id="node42" class="node">
-<title>Node202</title>
+<title>Node204</title>
 <g id="a_node42"><a xlink:href="structural__hash_8h.html" target="_top" xlink:title="include/tvm/node/structural\l_hash.h">
 <polygon fill="#ffffff" stroke="#000000" points="1182.5,-860.5 1182.5,-890.5 1333.5,-890.5 1333.5,-860.5 1182.5,-860.5"/>
 <text text-anchor="start" x="1190.5" y="-878.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/node/structural</text>
@@ -134,15 +134,15 @@
 </a>
 </g>
 </g>
-<!-- Node8&#45;&gt;Node202 -->
+<!-- Node8&#45;&gt;Node204 -->
 <g id="edge119" class="edge">
-<title>Node8&#45;&gt;Node202</title>
+<title>Node8&#45;&gt;Node204</title>
 <path fill="none" stroke="#191970" d="M1343.8739,-986.1452C1322.6014,-958.7365 1287.9396,-914.076 1269.8447,-890.7614"/>
 <polygon fill="#191970" stroke="#191970" points="1341.3758,-988.635 1350.272,-994.389 1346.9057,-984.3431 1341.3758,-988.635"/>
 </g>
-<!-- Node203 -->
+<!-- Node205 -->
 <g id="node43" class="node">
-<title>Node203</title>
+<title>Node205</title>
 <g id="a_node43"><a xlink:href="ndarray_8h.html" target="_top" xlink:title="A device&#45;independent managed NDArray abstraction. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="2466,-927.5 2466,-957.5 2582,-957.5 2582,-927.5 2466,-927.5"/>
 <text text-anchor="start" x="2474" y="-945.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -150,15 +150,15 @@
 </a>
 </g>
 </g>
-<!-- Node8&#45;&gt;Node203 -->
+<!-- Node8&#45;&gt;Node205 -->
 <g id="edge123" class="edge">
-<title>Node8&#45;&gt;Node203</title>
+<title>Node8&#45;&gt;Node205</title>
 <path fill="none" stroke="#191970" d="M1430.2617,-1005.5641C1639.6544,-993.4907 2270.4805,-957.1177 2465.7461,-945.8589"/>
 <polygon fill="#191970" stroke="#191970" points="1429.8032,-1002.0846 1420.0213,-1006.1545 1430.2062,-1009.073 1429.8032,-1002.0846"/>
 </g>
-<!-- Node209 -->
+<!-- Node211 -->
 <g id="node47" class="node">
-<title>Node209</title>
+<title>Node211</title>
 <g id="a_node47"><a xlink:href="packed__func_8h.html" target="_top" xlink:title="Type&#45;erased function used across TVM API. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="2057,-860.5 2057,-890.5 2173,-890.5 2173,-860.5 2057,-860.5"/>
 <text text-anchor="start" x="2065" y="-878.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -166,15 +166,15 @@
 </a>
 </g>
 </g>
-<!-- Node8&#45;&gt;Node209 -->
+<!-- Node8&#45;&gt;Node211 -->
 <g id="edge152" class="edge">
-<title>Node8&#45;&gt;Node209</title>
+<title>Node8&#45;&gt;Node211</title>
 <path fill="none" stroke="#191970" d="M1430.3635,-997.3344C1577.0281,-971.2347 1919.665,-910.2608 2056.7719,-885.862"/>
 <polygon fill="#191970" stroke="#191970" points="1429.3799,-993.9543 1420.1478,-999.1523 1430.6064,-1000.8461 1429.3799,-993.9543"/>
 </g>
-<!-- Node217 -->
+<!-- Node219 -->
 <g id="node50" class="node">
-<title>Node217</title>
+<title>Node219</title>
 <g id="a_node50"><a xlink:href="bytecode_8h.html" target="_top" xlink:title="The bytecode for Relay virtual machine. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="3285,-927.5 3285,-957.5 3401,-957.5 3401,-927.5 3285,-927.5"/>
 <text text-anchor="start" x="3293" y="-945.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -182,9 +182,9 @@
 </a>
 </g>
 </g>
-<!-- Node8&#45;&gt;Node217 -->
+<!-- Node8&#45;&gt;Node219 -->
 <g id="edge153" class="edge">
-<title>Node8&#45;&gt;Node217</title>
+<title>Node8&#45;&gt;Node219</title>
 <path fill="none" stroke="#191970" d="M1430.2724,-1009.3517C1726.7139,-1008.447 2903.7929,-1002.0937 3271,-958 3275.5703,-957.4512 3280.279,-956.7502 3284.9968,-955.9508"/>
 <polygon fill="#191970" stroke="#191970" points="1430.0909,-1005.8521 1420.1013,-1009.3816 1430.1116,-1012.8521 1430.0909,-1005.8521"/>
 </g>
@@ -309,9 +309,9 @@
 <path fill="none" stroke="#191970" d="M442.914,-654.7384C451.4253,-596.1135 475.1463,-425.7687 467,-402 461.274,-385.2932 447.6731,-369.8239 437.921,-360.2878"/>
 <polygon fill="#191970" stroke="#191970" points="439.4123,-654.4961 441.4285,-664.8973 446.3386,-655.509 439.4123,-654.4961"/>
 </g>
-<!-- Node145 -->
+<!-- Node147 -->
 <g id="node25" class="node">
-<title>Node145</title>
+<title>Node147</title>
 <g id="a_node25"><a xlink:href="affine__type_8h.html" target="_top" xlink:title="Quantized Tensor Types. ">
 <polygon fill="#ffffff" stroke="#000000" points="506.5,-536.5 506.5,-566.5 621.5,-566.5 621.5,-536.5 506.5,-536.5"/>
 <text text-anchor="start" x="514.5" y="-554.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/ir/affine</text>
@@ -319,30 +319,30 @@
 </a>
 </g>
 </g>
-<!-- Node9&#45;&gt;Node145 -->
+<!-- Node9&#45;&gt;Node147 -->
 <g id="edge43" class="edge">
-<title>Node9&#45;&gt;Node145</title>
+<title>Node9&#45;&gt;Node147</title>
 <path fill="none" stroke="#191970" d="M457.0805,-657.5952C467.2299,-647.5474 480.3535,-634.5501 492,-623 511.6381,-603.5245 534.2691,-581.0448 548.8358,-566.5707"/>
 <polygon fill="#191970" stroke="#191970" points="454.6174,-655.1086 449.9727,-664.6311 459.5419,-660.0835 454.6174,-655.1086"/>
 </g>
-<!-- Node146 -->
+<!-- Node148 -->
 <g id="node26" class="node">
-<title>Node146</title>
+<title>Node148</title>
 <g id="a_node26"><a xlink:href="ir_2expr_8h.html" target="_top" xlink:title="Base expr nodes in TVM. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="953.5,-603.5 953.5,-622.5 1070.5,-622.5 1070.5,-603.5 953.5,-603.5"/>
 <text text-anchor="middle" x="1012" y="-610.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/ir/expr.h</text>
 </a>
 </g>
 </g>
-<!-- Node9&#45;&gt;Node146 -->
+<!-- Node9&#45;&gt;Node148 -->
 <g id="edge44" class="edge">
-<title>Node9&#45;&gt;Node146</title>
+<title>Node9&#45;&gt;Node148</title>
 <path fill="none" stroke="#191970" d="M509.0333,-667.0777C622.596,-654.8677 846.8235,-630.7594 953.2617,-619.3154"/>
 <polygon fill="#191970" stroke="#191970" points="508.6281,-663.601 499.0596,-668.1501 509.3765,-670.5609 508.6281,-663.601"/>
 </g>
-<!-- Node171 -->
+<!-- Node173 -->
 <g id="node28" class="node">
-<title>Node171</title>
+<title>Node173</title>
 <g id="a_node28"><a xlink:href="tensor__type_8h.html" target="_top" xlink:title="Polymorphic tensor types. ">
 <polygon fill="#ffffff" stroke="#000000" points="256,-536.5 256,-566.5 374,-566.5 374,-536.5 256,-536.5"/>
 <text text-anchor="start" x="264" y="-554.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/ir/tensor</text>
@@ -350,9 +350,9 @@
 </a>
 </g>
 </g>
-<!-- Node9&#45;&gt;Node171 -->
+<!-- Node9&#45;&gt;Node173 -->
 <g id="edge73" class="edge">
-<title>Node9&#45;&gt;Node171</title>
+<title>Node9&#45;&gt;Node173</title>
 <path fill="none" stroke="#191970" d="M423.0359,-657.8073C398.6459,-633.8075 354.024,-589.8996 330.4842,-566.7365"/>
 <polygon fill="#191970" stroke="#191970" points="420.6087,-660.3293 430.1915,-664.8484 425.5184,-655.3398 420.6087,-660.3293"/>
 </g>
@@ -726,90 +726,90 @@
 <path fill="none" stroke="#191970" d="M1798.1009,-406.6918C1893.2795,-394.3591 2034.4167,-374.7672 2043,-366 2097.6509,-310.1783 2083.1262,-204.0061 2075.3677,-164.6621"/>
 <polygon fill="#191970" stroke="#191970" points="1797.6461,-403.2214 1788.1766,-407.9731 1798.5424,-410.1638 1797.6461,-403.2214"/>
 </g>
-<!-- Node146&#45;&gt;Node10 -->
+<!-- Node148&#45;&gt;Node10 -->
 <g id="edge45" class="edge">
-<title>Node146&#45;&gt;Node10</title>
+<title>Node148&#45;&gt;Node10</title>
 <path fill="none" stroke="#191970" d="M1074.6029,-601.6345C1123.5235,-592.624 1193.188,-579.5095 1254,-567 1263.022,-565.1441 1272.6771,-563.066 1281.8455,-561.0493"/>
 <polygon fill="#191970" stroke="#191970" points="1073.6919,-598.2432 1064.4893,-603.4932 1074.9573,-605.1279 1073.6919,-598.2432"/>
 </g>
-<!-- Node146&#45;&gt;Node11 -->
+<!-- Node148&#45;&gt;Node11 -->
 <g id="edge53" class="edge">
-<title>Node146&#45;&gt;Node11</title>
+<title>Node148&#45;&gt;Node11</title>
 <path fill="none" stroke="#191970" d="M1054.6348,-600.3849C1080.0183,-592.2158 1112.5305,-580.5831 1140,-567 1162.7646,-555.7434 1163.9199,-544.0677 1188,-536 1247.3236,-516.1244 1660.4713,-495.1397 1824.2979,-487.501"/>
 <polygon fill="#191970" stroke="#191970" points="1053.4121,-597.1007 1044.9317,-603.4515 1055.5216,-603.7753 1053.4121,-597.1007"/>
 </g>
-<!-- Node146&#45;&gt;Node102 -->
+<!-- Node148&#45;&gt;Node102 -->
 <g id="edge54" class="edge">
-<title>Node146&#45;&gt;Node102</title>
+<title>Node148&#45;&gt;Node102</title>
 <path fill="none" stroke="#191970" d="M942.8712,-602.9382C909.189,-596.0819 868.9494,-584.8918 836,-567 818.0292,-557.2417 820.8499,-543.9282 802,-536 672.5349,-481.5476 282.5959,-585.2797 171,-500 125.552,-465.2694 120.9861,-387.6027 120.8415,-360.2552"/>
 <polygon fill="#191970" stroke="#191970" points="942.6138,-606.4538 953.0984,-604.9295 943.9517,-599.5828 942.6138,-606.4538"/>
 </g>
-<!-- Node146&#45;&gt;Node107 -->
+<!-- Node148&#45;&gt;Node107 -->
 <g id="edge68" class="edge">
-<title>Node146&#45;&gt;Node107</title>
+<title>Node148&#45;&gt;Node107</title>
 <path fill="none" stroke="#191970" d="M1023.3471,-594.4967C1027.9045,-586.3278 1032.8212,-576.4501 1036,-567 1047.8574,-531.75 1050,-521.6909 1050,-484.5 1050,-484.5 1050,-484.5 1050,-417.5 1050,-360.1862 639.3963,-307.3445 487.0239,-289.7084"/>
 <polygon fill="#191970" stroke="#191970" points="1020.3015,-592.7715 1018.2877,-603.1732 1026.3485,-596.2977 1020.3015,-592.7715"/>
 </g>
-<!-- Node146&#45;&gt;Node130 -->
+<!-- Node148&#45;&gt;Node130 -->
 <g id="edge62" class="edge">
-<title>Node146&#45;&gt;Node130</title>
+<title>Node148&#45;&gt;Node130</title>
 <path fill="none" stroke="#191970" d="M1016.7552,-593.3494C1025.5459,-551.6897 1039.0968,-454.0615 988,-402 966.7,-380.2978 785.4908,-363.7661 678.4092,-355.7846"/>
 <polygon fill="#191970" stroke="#191970" points="1013.2928,-592.7938 1014.5109,-603.3184 1020.1218,-594.3313 1013.2928,-592.7938"/>
 </g>
-<!-- Node146&#45;&gt;Node132 -->
+<!-- Node148&#45;&gt;Node132 -->
 <g id="edge63" class="edge">
-<title>Node146&#45;&gt;Node132</title>
+<title>Node148&#45;&gt;Node132</title>
 <path fill="none" stroke="#191970" d="M1000.4412,-594.7251C988.8919,-577.6724 969.7589,-552.4546 948,-536 867.2066,-474.902 565.6617,-388.5033 462.1463,-360.0166"/>
 <polygon fill="#191970" stroke="#191970" points="997.6282,-596.8176 1006.0458,-603.2513 1003.4776,-592.9725 997.6282,-596.8176"/>
 </g>
-<!-- Node146&#45;&gt;Node37 -->
+<!-- Node148&#45;&gt;Node37 -->
 <g id="edge61" class="edge">
-<title>Node146&#45;&gt;Node37</title>
+<title>Node148&#45;&gt;Node37</title>
 <path fill="none" stroke="#191970" d="M1030.2296,-595.9469C1052.7147,-572.9703 1088,-529.7112 1088,-484.5 1088,-484.5 1088,-484.5 1088,-283.5 1088,-192.0149 1764.4109,-160.1178 1995.606,-151.8874"/>
 <polygon fill="#191970" stroke="#191970" points="1027.5436,-593.6804 1022.8889,-603.1981 1032.4629,-598.6604 1027.5436,-593.6804"/>
 </g>
-<!-- Node146&#45;&gt;Node38 -->
+<!-- Node148&#45;&gt;Node38 -->
 <g id="edge57" class="edge">
-<title>Node146&#45;&gt;Node38</title>
+<title>Node148&#45;&gt;Node38</title>
 <path fill="none" stroke="#191970" d="M1060.4878,-600.8655C1093.7872,-592.215 1138.8696,-579.8772 1178,-567 1214.6148,-554.9507 1221.2857,-543.966 1259,-536 1567.2716,-470.8867 1660.1076,-571.3316 1967,-500 2020.7168,-487.5145 2142.4868,-418.8447 2195,-402 2247.4112,-385.1881 2308.1146,-372.0778 2354.8308,-363.2927"/>
 <polygon fill="#191970" stroke="#191970" points="1059.3604,-597.5416 1050.5514,-603.4282 1061.1087,-604.3198 1059.3604,-597.5416"/>
 </g>
-<!-- Node146&#45;&gt;Node139 -->
+<!-- Node148&#45;&gt;Node139 -->
 <g id="edge58" class="edge">
-<title>Node146&#45;&gt;Node139</title>
+<title>Node148&#45;&gt;Node139</title>
 <path fill="none" stroke="#191970" d="M1043.1424,-599.0917C1060.9012,-590.6723 1083.2798,-579.222 1102,-567 1120.2506,-555.0846 1120.8583,-546.4239 1140,-536 1388.5796,-400.6318 1731.2562,-363.8865 1881.8317,-354.0399"/>
 <polygon fill="#191970" stroke="#191970" points="1041.3902,-596.0468 1033.8042,-603.443 1044.3468,-602.3918 1041.3902,-596.0468"/>
 </g>
-<!-- Node146&#45;&gt;Node145 -->
+<!-- Node148&#45;&gt;Node147 -->
 <g id="edge46" class="edge">
-<title>Node146&#45;&gt;Node145</title>
+<title>Node148&#45;&gt;Node147</title>
 <path fill="none" stroke="#191970" d="M943.4842,-604.0827C874.2217,-595.0044 764.1237,-580.399 669,-567 653.6729,-564.841 637.0992,-562.4301 621.7473,-560.1649"/>
 <polygon fill="#191970" stroke="#191970" points="943.0305,-607.5531 953.4004,-605.3812 943.9395,-600.6124 943.0305,-607.5531"/>
 </g>
-<!-- Node151 -->
+<!-- Node153 -->
 <g id="node27" class="node">
-<title>Node151</title>
+<title>Node153</title>
 <g id="a_node27"><a xlink:href="ir_2attrs_8h.html" target="_top" xlink:title="Helpers for attribute objects. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="303.5,-475 303.5,-494 422.5,-494 422.5,-475 303.5,-475"/>
 <text text-anchor="middle" x="363" y="-482" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/ir/attrs.h</text>
 </a>
 </g>
 </g>
-<!-- Node146&#45;&gt;Node151 -->
+<!-- Node148&#45;&gt;Node153 -->
 <g id="edge47" class="edge">
-<title>Node146&#45;&gt;Node151</title>
+<title>Node148&#45;&gt;Node153</title>
 <path fill="none" stroke="#191970" d="M981.0081,-599.2992C931.5105,-577.4372 839.7577,-537.0014 836,-536 759.3748,-515.5807 531.6249,-496.8216 422.539,-488.7182"/>
 <polygon fill="#191970" stroke="#191970" points="979.7441,-602.5671 990.3056,-603.4068 982.5729,-596.1641 979.7441,-602.5671"/>
 </g>
-<!-- Node146&#45;&gt;Node171 -->
+<!-- Node148&#45;&gt;Node173 -->
 <g id="edge55" class="edge">
-<title>Node146&#45;&gt;Node171</title>
+<title>Node148&#45;&gt;Node173</title>
 <path fill="none" stroke="#191970" d="M943.1384,-606.924C806.4716,-594.8651 503.084,-568.0957 374.4729,-556.7476"/>
 <polygon fill="#191970" stroke="#191970" points="943.2053,-610.4434 953.4743,-607.836 943.8206,-603.4705 943.2053,-610.4434"/>
 </g>
-<!-- Node144 -->
+<!-- Node146 -->
 <g id="node29" class="node">
-<title>Node144</title>
+<title>Node146</title>
 <g id="a_node29"><a xlink:href="schedule__rule_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/schedule_rule.h">
 <polygon fill="#ffffff" stroke="#000000" points="1710,-536.5 1710,-566.5 1862,-566.5 1862,-536.5 1710,-536.5"/>
 <text text-anchor="start" x="1718" y="-554.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
@@ -817,183 +817,183 @@
 </a>
 </g>
 </g>
-<!-- Node146&#45;&gt;Node144 -->
+<!-- Node148&#45;&gt;Node146 -->
 <g id="edge59" class="edge">
-<title>Node146&#45;&gt;Node144</title>
+<title>Node148&#45;&gt;Node146</title>
 <path fill="none" stroke="#191970" d="M1080.8847,-607.5266C1225.4131,-596.0428 1559.3615,-569.5081 1709.9358,-557.5439"/>
 <polygon fill="#191970" stroke="#191970" points="1080.5024,-604.0459 1070.8111,-608.327 1081.0569,-611.0239 1080.5024,-604.0459"/>
 </g>
-<!-- Node146&#45;&gt;Node174 -->
+<!-- Node148&#45;&gt;Node176 -->
 <g id="edge64" class="edge">
-<title>Node146&#45;&gt;Node174</title>
+<title>Node148&#45;&gt;Node176</title>
 <path fill="none" stroke="#191970" d="M1081.1811,-607.2245C1157.1042,-600.2677 1282.3794,-586.9667 1389,-567 1391.8998,-566.457 1394.8665,-565.856 1397.8537,-565.2155"/>
 <polygon fill="#191970" stroke="#191970" points="1080.4463,-603.7766 1070.8027,-608.164 1081.0775,-610.7481 1080.4463,-603.7766"/>
 </g>
-<!-- Node146&#45;&gt;Node181 -->
+<!-- Node148&#45;&gt;Node183 -->
 <g id="edge65" class="edge">
-<title>Node146&#45;&gt;Node181</title>
+<title>Node148&#45;&gt;Node183</title>
 <path fill="none" stroke="#191970" d="M992.8481,-596.8304C968.9034,-576.8713 928.9977,-544.5179 912,-536 870.9254,-515.4167 820.4384,-502.0312 782.7459,-494.08"/>
 <polygon fill="#191970" stroke="#191970" points="990.6749,-599.5757 1000.5883,-603.3139 995.1698,-594.2095 990.6749,-599.5757"/>
 </g>
-<!-- Node146&#45;&gt;Node188 -->
+<!-- Node148&#45;&gt;Node190 -->
 <g id="edge69" class="edge">
-<title>Node146&#45;&gt;Node188</title>
+<title>Node148&#45;&gt;Node190</title>
 <path fill="none" stroke="#191970" d="M959.404,-601.2802C907.0303,-589.61 827.6128,-571.9137 778.9226,-561.0643"/>
 <polygon fill="#191970" stroke="#191970" points="958.656,-604.6993 969.1778,-603.4581 960.1785,-597.8669 958.656,-604.6993"/>
 </g>
-<!-- Node151&#45;&gt;Node101 -->
+<!-- Node153&#45;&gt;Node101 -->
 <g id="edge49" class="edge">
-<title>Node151&#45;&gt;Node101</title>
+<title>Node153&#45;&gt;Node101</title>
 <path fill="none" stroke="#191970" d="M374.2717,-466.0804C380.7301,-455.5264 388.7328,-442.4489 394.8076,-432.5218"/>
 <polygon fill="#191970" stroke="#191970" points="371.1086,-464.544 368.8743,-474.9005 377.0794,-468.1977 371.1086,-464.544"/>
 </g>
-<!-- Node151&#45;&gt;Node102 -->
+<!-- Node153&#45;&gt;Node102 -->
 <g id="edge48" class="edge">
-<title>Node151&#45;&gt;Node102</title>
+<title>Node153&#45;&gt;Node102</title>
 <path fill="none" stroke="#191970" d="M336.7091,-469.9422C286.8996,-442.3617 180.0305,-383.1863 138.2463,-360.0496"/>
 <polygon fill="#191970" stroke="#191970" points="335.1521,-473.0808 345.596,-474.8631 338.543,-466.9569 335.1521,-473.0808"/>
 </g>
-<!-- Node151&#45;&gt;Node130 -->
+<!-- Node153&#45;&gt;Node130 -->
 <g id="edge50" class="edge">
-<title>Node151&#45;&gt;Node130</title>
+<title>Node153&#45;&gt;Node130</title>
 <path fill="none" stroke="#191970" d="M393.0893,-470.5363C414.0694,-460.5736 442.5681,-446.5863 467,-433 511.1696,-408.4379 561.365,-376.0164 585.5132,-360.1224"/>
 <polygon fill="#191970" stroke="#191970" points="391.5712,-467.3825 384.0204,-474.8146 394.5579,-473.7133 391.5712,-467.3825"/>
 </g>
-<!-- Node151&#45;&gt;Node131 -->
+<!-- Node153&#45;&gt;Node131 -->
 <g id="edge51" class="edge">
-<title>Node151&#45;&gt;Node131</title>
+<title>Node153&#45;&gt;Node131</title>
 <path fill="none" stroke="#191970" d="M349.8579,-466.5301C328.8046,-437.743 288.2365,-382.2724 271.9841,-360.0496"/>
 <polygon fill="#191970" stroke="#191970" points="347.2238,-468.8575 355.9521,-474.8631 352.874,-464.7252 347.2238,-468.8575"/>
 </g>
-<!-- Node151&#45;&gt;Node132 -->
+<!-- Node153&#45;&gt;Node132 -->
 <g id="edge52" class="edge">
-<title>Node151&#45;&gt;Node132</title>
+<title>Node153&#45;&gt;Node132</title>
 <path fill="none" stroke="#191970" d="M350.1415,-466.1067C339.6991,-448.6938 328.4425,-422.2721 340,-402 351.8228,-381.2626 375.6376,-368.024 395.3661,-360.1694"/>
 <polygon fill="#191970" stroke="#191970" points="347.3539,-468.2426 355.709,-474.7574 353.2402,-464.4542 347.3539,-468.2426"/>
 </g>
-<!-- Node171&#45;&gt;Node131 -->
+<!-- Node173&#45;&gt;Node131 -->
 <g id="edge56" class="edge">
-<title>Node171&#45;&gt;Node131</title>
+<title>Node173&#45;&gt;Node131</title>
 <path fill="none" stroke="#191970" d="M245.8791,-534.9025C213.655,-525.6272 180.353,-513.2782 171,-500 134.9826,-448.8672 216.0588,-384.2933 250.4821,-360.1699"/>
 <polygon fill="#191970" stroke="#191970" points="245.2443,-538.3592 255.8173,-537.68 247.1285,-531.6175 245.2443,-538.3592"/>
 </g>
-<!-- Node144&#45;&gt;Node42 -->
+<!-- Node146&#45;&gt;Node42 -->
 <g id="edge60" class="edge">
-<title>Node144&#45;&gt;Node42</title>
+<title>Node146&#45;&gt;Node42</title>
 <path fill="none" stroke="#191970" d="M1764.6911,-528.5519C1757.4864,-520.0173 1749.8052,-509.9746 1744,-500 1731.1723,-477.9592 1721.5254,-449.8137 1716.3279,-432.748"/>
 <polygon fill="#191970" stroke="#191970" points="1762.1667,-530.9826 1771.3759,-536.221 1767.4435,-526.383 1762.1667,-530.9826"/>
 </g>
-<!-- Node181&#45;&gt;Node107 -->
+<!-- Node183&#45;&gt;Node107 -->
 <g id="edge67" class="edge">
-<title>Node181&#45;&gt;Node107</title>
+<title>Node183&#45;&gt;Node107</title>
 <path fill="none" stroke="#191970" d="M728.6887,-464.3495C728.6174,-432.0858 723.4515,-369.0144 687,-335 658.6192,-308.5167 553.6519,-294.4195 487.2626,-287.9882"/>
 <polygon fill="#191970" stroke="#191970" points="725.1834,-464.68 728.5329,-474.7314 732.1826,-464.7851 725.1834,-464.68"/>
 </g>
-<!-- Node181&#45;&gt;Node131 -->
+<!-- Node183&#45;&gt;Node131 -->
 <g id="edge66" class="edge">
-<title>Node181&#45;&gt;Node131</title>
+<title>Node183&#45;&gt;Node131</title>
 <path fill="none" stroke="#191970" d="M690.0302,-471.7556C639.4083,-454.9356 546.9021,-424.794 467,-402 411.1415,-386.065 345.9241,-369.8679 305.0394,-360.0009"/>
 <polygon fill="#191970" stroke="#191970" points="688.9863,-475.0969 699.5799,-474.9359 691.1981,-468.4555 688.9863,-475.0969"/>
 </g>
-<!-- Node188&#45;&gt;Node181 -->
+<!-- Node190&#45;&gt;Node183 -->
 <g id="edge70" class="edge">
-<title>Node188&#45;&gt;Node181</title>
+<title>Node190&#45;&gt;Node183</title>
 <path fill="none" stroke="#191970" d="M733.6248,-531.6079C732.145,-519.214 730.3076,-503.8263 729.1441,-494.0817"/>
 <polygon fill="#191970" stroke="#191970" points="730.1928,-532.3861 734.8538,-541.9005 737.1434,-531.5561 730.1928,-532.3861"/>
 </g>
-<!-- Node190&#45;&gt;Node135 -->
+<!-- Node192&#45;&gt;Node135 -->
 <g id="edge85" class="edge">
-<title>Node190&#45;&gt;Node135</title>
+<title>Node192&#45;&gt;Node135</title>
 <path fill="none" stroke="#191970" d="M2164.0945,-788.9675C2185.392,-720.711 2256.23,-493.681 2275.2596,-432.6927"/>
 <polygon fill="#191970" stroke="#191970" points="2160.6533,-788.2461 2161.0157,-798.8348 2167.3355,-790.3312 2160.6533,-788.2461"/>
 </g>
-<!-- Node190&#45;&gt;Node136 -->
+<!-- Node192&#45;&gt;Node136 -->
 <g id="edge87" class="edge">
-<title>Node190&#45;&gt;Node136</title>
+<title>Node192&#45;&gt;Node136</title>
 <path fill="none" stroke="#191970" d="M2151.4206,-789.1138C2148.5187,-779.5663 2145.4431,-767.8098 2144,-757 2142.1768,-743.3434 2143.5295,-739.7697 2144,-726 2147.9103,-611.5648 2133.4411,-579.8833 2162,-469 2179.2294,-402.1047 2219.5292,-329.4902 2237.7563,-298.6831"/>
 <polygon fill="#191970" stroke="#191970" points="2148.1368,-790.3354 2154.5269,-798.7863 2154.8016,-788.1951 2148.1368,-790.3354"/>
 </g>
-<!-- Node190&#45;&gt;Node39 -->
+<!-- Node192&#45;&gt;Node39 -->
 <g id="edge100" class="edge">
-<title>Node190&#45;&gt;Node39</title>
+<title>Node192&#45;&gt;Node39</title>
 <path fill="none" stroke="#191970" d="M2247.6597,-800.2632C2366.0653,-788.9642 2561.0022,-768.7267 2572,-757 2680.028,-641.8122 2605.4482,-552.3032 2557,-402 2533.9483,-330.4854 2474.039,-261.3679 2445.9485,-231.6802"/>
 <polygon fill="#191970" stroke="#191970" points="2247.1321,-796.7974 2237.5081,-801.2279 2247.7944,-803.766 2247.1321,-796.7974"/>
 </g>
-<!-- Node190&#45;&gt;Node35 -->
+<!-- Node192&#45;&gt;Node35 -->
 <g id="edge91" class="edge">
-<title>Node190&#45;&gt;Node35</title>
+<title>Node192&#45;&gt;Node35</title>
 <path fill="none" stroke="#191970" d="M2247.5776,-802.1868C2395.6108,-791.4332 2676.8869,-769.5142 2695,-757 2736.4144,-728.3871 2895,-400.8374 2895,-350.5 2895,-350.5 2895,-350.5 2895,-216.5 2895,-138.0786 2797.3989,-105.285 2725.1512,-91.7527"/>
 <polygon fill="#191970" stroke="#191970" points="2247.3104,-798.6969 2237.5894,-802.9101 2247.8161,-805.6786 2247.3104,-798.6969"/>
 </g>
-<!-- Node190&#45;&gt;Node36 -->
+<!-- Node192&#45;&gt;Node36 -->
 <g id="edge102" class="edge">
-<title>Node190&#45;&gt;Node36</title>
+<title>Node192&#45;&gt;Node36</title>
 <path fill="none" stroke="#191970" d="M2247.5413,-804.1802C2410.7773,-795.8743 2744.561,-776.7446 2794,-757 3014.5974,-668.8993 3141,-588.0393 3141,-350.5 3141,-350.5 3141,-350.5 3141,-149.5 3141,-64.3471 2863.1679,-31.5033 2725.0658,-20.4375"/>
 <polygon fill="#191970" stroke="#191970" points="2247.3269,-800.6864 2237.5166,-804.6875 2247.6808,-807.6775 2247.3269,-800.6864"/>
 </g>
-<!-- Node190&#45;&gt;Node37 -->
+<!-- Node192&#45;&gt;Node37 -->
 <g id="edge103" class="edge">
-<title>Node190&#45;&gt;Node37</title>
+<title>Node192&#45;&gt;Node37</title>
 <path fill="none" stroke="#191970" d="M2160.0431,-788.7994C2165.0789,-736.476 2176.6522,-589.7715 2162,-469 2147.2258,-347.2231 2136.7677,-316.6974 2096,-201 2091.6072,-188.5333 2085.2374,-175.0074 2080.1639,-164.9468"/>
 <polygon fill="#191970" stroke="#191970" points="2156.5356,-788.7017 2159.0269,-798.9995 2163.5011,-789.3958 2156.5356,-788.7017"/>
 </g>
-<!-- Node190&#45;&gt;Node38 -->
+<!-- Node192&#45;&gt;Node38 -->
 <g id="edge88" class="edge">
-<title>Node190&#45;&gt;Node38</title>
+<title>Node192&#45;&gt;Node38</title>
 <path fill="none" stroke="#191970" d="M2247.7574,-802.7511C2356.9062,-794.8407 2529.1297,-779.0343 2549,-757 2591.8773,-709.4532 2553,-677.0247 2553,-613 2553,-613 2553,-613 2553,-551.5 2553,-484.5757 2573.6351,-456.646 2535,-402 2523.0197,-385.0549 2503.8442,-373.4084 2485.2028,-365.54"/>
 <polygon fill="#191970" stroke="#191970" points="2247.4507,-799.264 2237.726,-803.4688 2247.9503,-806.2462 2247.4507,-799.264"/>
 </g>
-<!-- Node190&#45;&gt;Node137 -->
+<!-- Node192&#45;&gt;Node137 -->
 <g id="edge92" class="edge">
-<title>Node190&#45;&gt;Node137</title>
+<title>Node192&#45;&gt;Node137</title>
 <path fill="none" stroke="#191970" d="M2247.6638,-804.3323C2379.5145,-797.4081 2613.3718,-781.7911 2643,-757 2672.2867,-732.4947 2667,-712.6867 2667,-674.5 2667,-674.5 2667,-674.5 2667,-484.5 2667,-440.5323 2652.3775,-390.1326 2644.2666,-365.5584"/>
 <polygon fill="#191970" stroke="#191970" points="2247.3771,-800.8424 2237.5719,-804.8558 2247.7398,-807.833 2247.3771,-800.8424"/>
 </g>
-<!-- Node190&#45;&gt;Node138 -->
+<!-- Node192&#45;&gt;Node138 -->
 <g id="edge90" class="edge">
-<title>Node190&#45;&gt;Node138</title>
+<title>Node192&#45;&gt;Node138</title>
 <path fill="none" stroke="#191970" d="M2247.6604,-804.2348C2384.6325,-797.0047 2633.6997,-780.7732 2667,-757 2699.8558,-733.5441 2705,-714.8693 2705,-674.5 2705,-674.5 2705,-674.5 2705,-551.5 2705,-453.8913 2736.2321,-339.3736 2748.3102,-298.7368"/>
 <polygon fill="#191970" stroke="#191970" points="2247.3967,-800.7437 2237.5929,-804.7607 2247.7619,-807.7342 2247.3967,-800.7437"/>
 </g>
-<!-- Node190&#45;&gt;Node139 -->
+<!-- Node192&#45;&gt;Node139 -->
 <g id="edge98" class="edge">
-<title>Node190&#45;&gt;Node139</title>
+<title>Node192&#45;&gt;Node139</title>
 <path fill="none" stroke="#191970" d="M2126.9278,-793.9644C2112.0624,-785.3563 2095.477,-772.983 2086,-757 2004.9411,-620.2941 2129.2055,-535.5203 2043,-402 2032.5181,-385.765 2014.9383,-373.8782 1998.4776,-365.6424"/>
 <polygon fill="#191970" stroke="#191970" points="2125.4256,-797.1315 2135.8824,-798.8361 2128.771,-790.9826 2125.4256,-797.1315"/>
 </g>
-<!-- Node190&#45;&gt;Node34 -->
+<!-- Node192&#45;&gt;Node34 -->
 <g id="edge86" class="edge">
-<title>Node190&#45;&gt;Node34</title>
+<title>Node192&#45;&gt;Node34</title>
 <path fill="none" stroke="#191970" d="M2247.6692,-804.6144C2403.7643,-797.2877 2713.0572,-780.0706 2757,-757 2904.4655,-679.5786 3003.8232,-487.8242 3030.0709,-432.6029"/>
 <polygon fill="#191970" stroke="#191970" points="2247.4398,-801.1211 2237.6134,-805.0826 2247.7654,-808.1136 2247.4398,-801.1211"/>
 </g>
-<!-- Node190&#45;&gt;Node40 -->
+<!-- Node192&#45;&gt;Node40 -->
 <g id="edge89" class="edge">
-<title>Node190&#45;&gt;Node40</title>
+<title>Node192&#45;&gt;Node40</title>
 <path fill="none" stroke="#191970" d="M2138.367,-792.3428C2127.7678,-782.8705 2115.0417,-770.1974 2106,-757 2085.8764,-727.6272 1989.7787,-494.9231 1964.2429,-432.7355"/>
 <polygon fill="#191970" stroke="#191970" points="2136.1104,-795.0185 2145.9616,-798.918 2140.6923,-789.7264 2136.1104,-795.0185"/>
 </g>
-<!-- Node190&#45;&gt;Node41 -->
+<!-- Node192&#45;&gt;Node41 -->
 <g id="edge97" class="edge">
-<title>Node190&#45;&gt;Node41</title>
+<title>Node192&#45;&gt;Node41</title>
 <path fill="none" stroke="#191970" d="M2247.8412,-802.3245C2353.0528,-794.1602 2515.4329,-778.2835 2534,-757 2556.1528,-731.6061 2476.7152,-494.9459 2455.2238,-432.531"/>
 <polygon fill="#191970" stroke="#191970" points="2247.548,-798.8367 2237.8447,-803.0905 2248.0829,-805.8162 2247.548,-798.8367"/>
 </g>
-<!-- Node190&#45;&gt;Node42 -->
+<!-- Node192&#45;&gt;Node42 -->
 <g id="edge101" class="edge">
-<title>Node190&#45;&gt;Node42</title>
+<title>Node192&#45;&gt;Node42</title>
 <path fill="none" stroke="#191970" d="M2133.8648,-793.3113C2119.3522,-783.6852 2100.874,-770.5324 2086,-757 1989.7582,-669.4392 1998.055,-614.115 1894,-536 1863.4657,-513.0776 1848.2492,-520.4394 1816,-500 1797.8796,-488.5154 1795.7305,-482.429 1779,-469 1763.4397,-456.5103 1745.5031,-442.7563 1732.1064,-432.6062"/>
 <polygon fill="#191970" stroke="#191970" points="2132.0317,-796.2943 2142.3208,-798.8211 2135.8532,-790.4294 2132.0317,-796.2943"/>
 </g>
-<!-- Node190&#45;&gt;Node144 -->
+<!-- Node192&#45;&gt;Node146 -->
 <g id="edge99" class="edge">
-<title>Node190&#45;&gt;Node144</title>
+<title>Node192&#45;&gt;Node146</title>
 <path fill="none" stroke="#191970" d="M2095.356,-796.6301C2062.9265,-788.6639 2023.6161,-776.0594 1992,-757 1969.1246,-743.2098 1843.4399,-611.9397 1800.436,-566.7195"/>
 <polygon fill="#191970" stroke="#191970" points="2094.6461,-800.0587 2105.1833,-798.9555 2096.258,-793.2468 2094.6461,-800.0587"/>
 </g>
-<!-- Node191 -->
+<!-- Node193 -->
 <g id="node34" class="node">
-<title>Node191</title>
+<title>Node193</title>
 <g id="a_node34"><a xlink:href="env__func_8h.html" target="_top" xlink:title="Serializable global function used in IR. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="180.5,-469.5 180.5,-499.5 285.5,-499.5 285.5,-469.5 180.5,-469.5"/>
 <text text-anchor="start" x="188.5" y="-487.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/ir/env</text>
@@ -1001,24 +1001,24 @@
 </a>
 </g>
 </g>
-<!-- Node190&#45;&gt;Node191 -->
+<!-- Node192&#45;&gt;Node193 -->
 <g id="edge81" class="edge">
-<title>Node190&#45;&gt;Node191</title>
+<title>Node192&#45;&gt;Node193</title>
 <path fill="none" stroke="#191970" d="M2068.0657,-807.5084C1865.3244,-804.6619 1357.2773,-794.1069 934,-757 830.3699,-747.9152 805.2701,-738.5306 702,-726 555.5374,-708.2286 505.2952,-753.2416 372,-690 301.5822,-656.5904 283.0456,-636.1056 247,-567 235.7799,-545.4891 233.2408,-516.9352 232.8314,-499.6811"/>
 <polygon fill="#191970" stroke="#191970" points="2068.3147,-811.0121 2078.3619,-807.65 2068.411,-804.0127 2068.3147,-811.0121"/>
 </g>
-<!-- Node192 -->
+<!-- Node194 -->
 <g id="node35" class="node">
-<title>Node192</title>
+<title>Node194</title>
 <g id="a_node35"><a xlink:href="instrument_8h.html" target="_top" xlink:title="include/tvm/ir/instrument.h">
 <polygon fill="#ffffff" stroke="#ff0000" points="2223.5,-732 2223.5,-751 2372.5,-751 2372.5,-732 2223.5,-732"/>
 <text text-anchor="middle" x="2298" y="-739" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/ir/instrument.h</text>
 </a>
 </g>
 </g>
-<!-- Node190&#45;&gt;Node192 -->
+<!-- Node192&#45;&gt;Node194 -->
 <g id="edge84" class="edge">
-<title>Node190&#45;&gt;Node192</title>
+<title>Node192&#45;&gt;Node194</title>
 <path fill="none" stroke="#191970" d="M2187.1029,-794.5722C2214.3019,-781.5555 2254.1884,-762.467 2277.9786,-751.0817"/>
 <polygon fill="#191970" stroke="#191970" points="2185.5679,-791.4266 2178.0586,-798.9005 2188.5898,-797.7407 2185.5679,-791.4266"/>
 </g>
@@ -1032,9 +1032,9 @@
 </a>
 </g>
 </g>
-<!-- Node190&#45;&gt;Node142 -->
+<!-- Node192&#45;&gt;Node142 -->
 <g id="edge93" class="edge">
-<title>Node190&#45;&gt;Node142</title>
+<title>Node192&#45;&gt;Node142</title>
 <path fill="none" stroke="#191970" d="M2070.3145,-797.7347C1985.9252,-787.359 1854.726,-771.1883 1741,-757 1725.8362,-755.1082 1709.6224,-753.0735 1694.1076,-751.1207"/>
 <polygon fill="#191970" stroke="#191970" points="2070.0171,-801.2245 2080.3695,-798.9708 2070.8713,-794.2768 2070.0171,-801.2245"/>
 </g>
@@ -1048,30 +1048,30 @@
 </a>
 </g>
 </g>
-<!-- Node190&#45;&gt;Node143 -->
+<!-- Node192&#45;&gt;Node143 -->
 <g id="edge95" class="edge">
-<title>Node190&#45;&gt;Node143</title>
+<title>Node192&#45;&gt;Node143</title>
 <path fill="none" stroke="#191970" d="M2100.795,-796.9556C2045.4289,-785.7823 1961.2083,-768.786 1900.6026,-756.5553"/>
 <polygon fill="#191970" stroke="#191970" points="2100.2903,-800.4243 2110.7851,-798.9717 2101.6751,-793.5626 2100.2903,-800.4243"/>
 </g>
-<!-- Node193 -->
+<!-- Node195 -->
 <g id="node38" class="node">
-<title>Node193</title>
+<title>Node195</title>
 <g id="a_node38"><a xlink:href="node_8h.html" target="_top" xlink:title="Definitions and helper macros for IR/AST nodes. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="943.5,-732 943.5,-751 1080.5,-751 1080.5,-732 943.5,-732"/>
 <text text-anchor="middle" x="1012" y="-739" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/node/node.h</text>
 </a>
 </g>
 </g>
-<!-- Node190&#45;&gt;Node193 -->
+<!-- Node192&#45;&gt;Node195 -->
 <g id="edge104" class="edge">
-<title>Node190&#45;&gt;Node193</title>
+<title>Node192&#45;&gt;Node195</title>
 <path fill="none" stroke="#191970" d="M2068.0406,-803.2406C1846.4965,-790.2882 1278.2644,-757.0669 1080.8285,-745.524"/>
 <polygon fill="#191970" stroke="#191970" points="2068.1381,-806.7522 2078.3254,-803.8419 2068.5467,-799.7641 2068.1381,-806.7522"/>
 </g>
-<!-- Node198 -->
+<!-- Node200 -->
 <g id="node39" class="node">
-<title>Node198</title>
+<title>Node200</title>
 <g id="a_node39"><a xlink:href="traced__object_8h.html" target="_top" xlink:title="include/tvm/script\l/printer/traced_object.h">
 <polygon fill="#ffffff" stroke="#ff0000" points="1749,-659.5 1749,-689.5 1879,-689.5 1879,-659.5 1749,-659.5"/>
 <text text-anchor="start" x="1757" y="-677.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/script</text>
@@ -1079,15 +1079,15 @@
 </a>
 </g>
 </g>
-<!-- Node190&#45;&gt;Node198 -->
+<!-- Node192&#45;&gt;Node200 -->
 <g id="edge112" class="edge">
-<title>Node190&#45;&gt;Node198</title>
+<title>Node192&#45;&gt;Node200</title>
 <path fill="none" stroke="#191970" d="M2085.2877,-797.0633C2045.2942,-789.088 1995.5132,-776.3391 1954,-757 1932.6439,-747.0512 1931.0065,-738.4419 1911,-726 1889.6278,-712.7089 1864.5064,-699.432 1845.1843,-689.6883"/>
 <polygon fill="#191970" stroke="#191970" points="2084.7507,-800.5243 2095.2337,-798.9892 2086.0815,-793.6519 2084.7507,-800.5243"/>
 </g>
-<!-- Node199 -->
+<!-- Node201 -->
 <g id="node40" class="node">
-<title>Node199</title>
+<title>Node201</title>
 <g id="a_node40"><a xlink:href="instruction_8h.html" target="_top" xlink:title="include/tvm/tir/schedule\l/instruction.h">
 <polygon fill="#ffffff" stroke="#ff0000" points="2391,-726.5 2391,-756.5 2525,-756.5 2525,-726.5 2391,-726.5"/>
 <text text-anchor="start" x="2399" y="-744.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/schedule</text>
@@ -1095,21 +1095,21 @@
 </a>
 </g>
 </g>
-<!-- Node190&#45;&gt;Node199 -->
+<!-- Node192&#45;&gt;Node201 -->
 <g id="edge114" class="edge">
-<title>Node190&#45;&gt;Node199</title>
+<title>Node192&#45;&gt;Node201</title>
 <path fill="none" stroke="#191970" d="M2210.6453,-796.7425C2260.7717,-785.5476 2336.4346,-768.6496 2390.873,-756.4917"/>
 <polygon fill="#191970" stroke="#191970" points="2209.6608,-793.3761 2200.6641,-798.9717 2211.1866,-800.2078 2209.6608,-793.3761"/>
 </g>
-<!-- Node191&#45;&gt;Node101 -->
+<!-- Node193&#45;&gt;Node101 -->
 <g id="edge82" class="edge">
-<title>Node191&#45;&gt;Node101</title>
+<title>Node193&#45;&gt;Node101</title>
 <path fill="none" stroke="#191970" d="M280.912,-465.7275C307.616,-455.2645 340.4277,-442.4084 365.4569,-432.6017"/>
 <polygon fill="#191970" stroke="#191970" points="279.4097,-462.5569 271.3757,-469.4639 281.9634,-469.0745 279.4097,-462.5569"/>
 </g>
-<!-- Node191&#45;&gt;Node131 -->
+<!-- Node193&#45;&gt;Node131 -->
 <g id="edge83" class="edge">
-<title>Node191&#45;&gt;Node131</title>
+<title>Node193&#45;&gt;Node131</title>
 <path fill="none" stroke="#191970" d="M238.9343,-459.6499C246.0962,-429.6597 257.7887,-380.6972 262.7065,-360.1042"/>
 <polygon fill="#191970" stroke="#191970" points="235.5271,-458.8495 236.6086,-469.389 242.3357,-460.4755 235.5271,-458.8495"/>
 </g>
@@ -1125,111 +1125,111 @@
 <path fill="none" stroke="#191970" d="M1861.8654,-720.4127C1872.2575,-712.3306 1882.3237,-702.0941 1888,-690 1905.5845,-652.5338 1916.8812,-594.8881 1871,-536 1844.7756,-502.3411 1816.5835,-524.9924 1782,-500 1755.9661,-481.1861 1733.4791,-450.6964 1721.4164,-432.5234"/>
 <polygon fill="#191970" stroke="#191970" points="1859.596,-717.7343 1853.5389,-726.4269 1863.6948,-723.4089 1859.596,-717.7343"/>
 </g>
-<!-- Node193&#45;&gt;Node9 -->
+<!-- Node195&#45;&gt;Node9 -->
 <g id="edge107" class="edge">
-<title>Node193&#45;&gt;Node9</title>
+<title>Node195&#45;&gt;Node9</title>
 <path fill="none" stroke="#191970" d="M933.0554,-732.253C817.0248,-718.662 603.0063,-693.5934 499.3505,-681.4519"/>
 <polygon fill="#191970" stroke="#191970" points="932.9484,-735.7643 943.2877,-733.4515 933.7629,-728.8118 932.9484,-735.7643"/>
 </g>
-<!-- Node193&#45;&gt;Node10 -->
+<!-- Node195&#45;&gt;Node10 -->
 <g id="edge105" class="edge">
-<title>Node193&#45;&gt;Node10</title>
+<title>Node195&#45;&gt;Node10</title>
 <path fill="none" stroke="#191970" d="M1046.9063,-728.2663C1095.2152,-709.0725 1184.0633,-670.5503 1251,-623 1277.3214,-604.3019 1303.1136,-576.0517 1315.8254,-561.282"/>
 <polygon fill="#191970" stroke="#191970" points="1045.5467,-725.04 1037.5194,-731.9547 1048.1067,-731.5551 1045.5467,-725.04"/>
 </g>
-<!-- Node193&#45;&gt;Node135 -->
+<!-- Node195&#45;&gt;Node135 -->
 <g id="edge108" class="edge">
-<title>Node193&#45;&gt;Node135</title>
+<title>Node195&#45;&gt;Node135</title>
 <path fill="none" stroke="#191970" d="M1085.2644,-730.428C1234.6888,-707.0147 1584.694,-647.9129 1871,-567 2012.1396,-527.1125 2174.2449,-461.8247 2244.4862,-432.5405"/>
 <polygon fill="#191970" stroke="#191970" points="1084.6261,-726.9851 1075.2853,-731.9852 1085.7054,-733.9014 1084.6261,-726.9851"/>
 </g>
-<!-- Node193&#45;&gt;Node146 -->
+<!-- Node195&#45;&gt;Node148 -->
 <g id="edge106" class="edge">
-<title>Node193&#45;&gt;Node146</title>
+<title>Node195&#45;&gt;Node148</title>
 <path fill="none" stroke="#191970" d="M1012,-721.6733C1012,-693.7236 1012,-643.8037 1012,-622.7705"/>
 <polygon fill="#191970" stroke="#191970" points="1008.5001,-721.8416 1012,-731.8416 1015.5001,-721.8416 1008.5001,-721.8416"/>
 </g>
-<!-- Node193&#45;&gt;Node174 -->
+<!-- Node195&#45;&gt;Node176 -->
 <g id="edge109" class="edge">
-<title>Node193&#45;&gt;Node174</title>
+<title>Node195&#45;&gt;Node176</title>
 <path fill="none" stroke="#191970" d="M1060.0476,-729.401C1094.3247,-720.1592 1141.2046,-706.2667 1181,-690 1258.4682,-658.3342 1273.5335,-641.1941 1348,-603 1372.0056,-590.6875 1399.2923,-576.9223 1419.599,-566.7229"/>
 <polygon fill="#191970" stroke="#191970" points="1059.0475,-726.0452 1050.2843,-731.9997 1060.8481,-732.8097 1059.0475,-726.0452"/>
 </g>
-<!-- Node193&#45;&gt;Node181 -->
+<!-- Node195&#45;&gt;Node183 -->
 <g id="edge110" class="edge">
-<title>Node193&#45;&gt;Node181</title>
+<title>Node195&#45;&gt;Node183</title>
 <path fill="none" stroke="#191970" d="M996.1142,-724.1011C962.2041,-687.3922 879.496,-600.1277 802,-536 782.7057,-520.0341 758.6231,-503.9084 743.2743,-494.0593"/>
 <polygon fill="#191970" stroke="#191970" points="993.956,-726.9249 1003.301,-731.917 999.1088,-722.1869 993.956,-726.9249"/>
 </g>
-<!-- Node193&#45;&gt;Node188 -->
+<!-- Node195&#45;&gt;Node190 -->
 <g id="edge111" class="edge">
-<title>Node193&#45;&gt;Node188</title>
+<title>Node195&#45;&gt;Node190</title>
 <path fill="none" stroke="#191970" d="M969.2508,-728.8649C943.389,-720.139 910.4189,-707.0355 884,-690 836.9747,-659.677 833.9852,-641.093 793,-603 777.1744,-588.2912 758.5523,-571.575 747.0105,-561.2821"/>
 <polygon fill="#191970" stroke="#191970" points="968.1932,-732.2015 978.786,-731.994 970.3758,-725.5504 968.1932,-732.2015"/>
 </g>
-<!-- Node198&#45;&gt;Node174 -->
+<!-- Node200&#45;&gt;Node176 -->
 <g id="edge113" class="edge">
-<title>Node198&#45;&gt;Node174</title>
+<title>Node200&#45;&gt;Node176</title>
 <path fill="none" stroke="#191970" d="M1759.5448,-656.0989C1687.321,-631.6936 1561.212,-589.0799 1494.4287,-566.513"/>
 <polygon fill="#191970" stroke="#191970" points="1758.7722,-659.5322 1769.3665,-659.4178 1761.0132,-652.9006 1758.7722,-659.5322"/>
 </g>
-<!-- Node201&#45;&gt;Node151 -->
+<!-- Node203&#45;&gt;Node153 -->
 <g id="edge116" class="edge">
-<title>Node201&#45;&gt;Node151</title>
+<title>Node203&#45;&gt;Node153</title>
 <path fill="none" stroke="#191970" d="M944.9418,-875.0518C738.1216,-873.3209 225.3958,-864.9062 160,-824 95.3853,-783.5824 76,-750.7144 76,-674.5 76,-674.5 76,-674.5 76,-613 76,-556.9066 125.7988,-561.0261 176,-536 201.3902,-523.3426 274.3709,-505.0361 321.2938,-494.0005"/>
 <polygon fill="#191970" stroke="#191970" points="945.1232,-878.5533 955.1511,-875.1339 945.1796,-871.5535 945.1232,-878.5533"/>
 </g>
-<!-- Node201&#45;&gt;Node190 -->
+<!-- Node203&#45;&gt;Node192 -->
 <g id="edge118" class="edge">
-<title>Node201&#45;&gt;Node190</title>
+<title>Node203&#45;&gt;Node192</title>
 <path fill="none" stroke="#191970" d="M1116.9759,-865.326C1135.4401,-863.3621 1154.8548,-861.4635 1173,-860 1509.0385,-832.8969 1910.7766,-817.0222 2078.4204,-811.1352"/>
 <polygon fill="#191970" stroke="#191970" points="1116.3964,-861.8682 1106.8299,-866.4215 1117.1479,-868.8277 1116.3964,-861.8682"/>
 </g>
-<!-- Node201&#45;&gt;Node193 -->
+<!-- Node203&#45;&gt;Node195 -->
 <g id="edge117" class="edge">
-<title>Node201&#45;&gt;Node193</title>
+<title>Node203&#45;&gt;Node195</title>
 <path fill="none" stroke="#191970" d="M1012.1428,-852.2678C1006.4433,-843.8686 1000.9243,-833.9839 998,-824 990.4861,-798.3468 1001.0059,-766.9505 1007.5176,-751.3192"/>
 <polygon fill="#191970" stroke="#191970" points="1009.3481,-854.376 1018.0465,-860.4249 1015.0188,-850.2719 1009.3481,-854.376"/>
 </g>
-<!-- Node202&#45;&gt;Node151 -->
+<!-- Node204&#45;&gt;Node153 -->
 <g id="edge120" class="edge">
-<title>Node202&#45;&gt;Node151</title>
+<title>Node204&#45;&gt;Node153</title>
 <path fill="none" stroke="#191970" d="M1172.1994,-866.0163C915.9638,-837.5531 174,-754.0507 174,-741.5 174,-741.5 174,-741.5 174,-674.5 174,-610.4287 169.8791,-582.4595 214,-536 229.125,-520.0733 283.8521,-503.8741 322.5788,-494.0044"/>
 <polygon fill="#191970" stroke="#191970" points="1171.8809,-869.5024 1182.2061,-867.1273 1172.6534,-862.5451 1171.8809,-869.5024"/>
 </g>
-<!-- Node202&#45;&gt;Node190 -->
+<!-- Node204&#45;&gt;Node192 -->
 <g id="edge122" class="edge">
-<title>Node202&#45;&gt;Node190</title>
+<title>Node204&#45;&gt;Node192</title>
 <path fill="none" stroke="#191970" d="M1343.6992,-869.1202C1517.5778,-856.1759 1909.589,-826.9928 2078.3079,-814.4326"/>
 <polygon fill="#191970" stroke="#191970" points="1343.2838,-865.6413 1333.5713,-869.8741 1343.8035,-872.622 1343.2838,-865.6413"/>
 </g>
-<!-- Node202&#45;&gt;Node193 -->
+<!-- Node204&#45;&gt;Node195 -->
 <g id="edge121" class="edge">
-<title>Node202&#45;&gt;Node193</title>
+<title>Node204&#45;&gt;Node195</title>
 <path fill="none" stroke="#191970" d="M1221.4035,-855.5653C1167.6964,-826.3102 1069.5828,-772.8663 1029.6315,-751.1042"/>
 <polygon fill="#191970" stroke="#191970" points="1219.8029,-858.679 1230.2589,-860.389 1223.1514,-852.5318 1219.8029,-858.679"/>
 </g>
-<!-- Node203&#45;&gt;Node138 -->
+<!-- Node205&#45;&gt;Node138 -->
 <g id="edge126" class="edge">
-<title>Node203&#45;&gt;Node138</title>
+<title>Node205&#45;&gt;Node138</title>
 <path fill="none" stroke="#191970" d="M2592.2587,-935.603C2756.2823,-918.3322 3161.8729,-871.2157 3200,-824 3208.6559,-813.2807 3205.2459,-805.74 3200,-793 3178.5511,-740.9098 2876.2125,-478.5014 2843,-433 2814.0518,-393.3407 2818.2474,-376.4962 2792,-335 2783.9662,-322.2989 2773.6573,-308.7839 2765.6562,-298.7871"/>
 <polygon fill="#191970" stroke="#191970" points="2591.8603,-932.1255 2582.2794,-936.6483 2592.5896,-939.0874 2591.8603,-932.1255"/>
 </g>
-<!-- Node203&#45;&gt;Node34 -->
+<!-- Node205&#45;&gt;Node34 -->
 <g id="edge125" class="edge">
-<title>Node203&#45;&gt;Node34</title>
+<title>Node205&#45;&gt;Node34</title>
 <path fill="none" stroke="#191970" d="M2592.4171,-940.8658C2753.1328,-936.5162 3144.9942,-922.7521 3195,-891 3234.4368,-865.9589 3245.5951,-837.9854 3233,-793 3216.3847,-733.656 3161,-736.1261 3161,-674.5 3161,-674.5 3161,-674.5 3161,-551.5 3161,-496.0315 3103.0547,-453.7806 3066.3863,-432.5968"/>
 <polygon fill="#191970" stroke="#191970" points="2592.0746,-937.3736 2582.1714,-941.1387 2592.2611,-944.3711 2592.0746,-937.3736"/>
 </g>
-<!-- Node203&#45;&gt;Node190 -->
+<!-- Node205&#45;&gt;Node192 -->
 <g id="edge127" class="edge">
-<title>Node203&#45;&gt;Node190</title>
+<title>Node205&#45;&gt;Node192</title>
 <path fill="none" stroke="#191970" d="M2455.8034,-936.8404C2378.6228,-929.4219 2258.6118,-914.5178 2220,-891 2190.6206,-873.1055 2170.4353,-835.6465 2162.1789,-818.0354"/>
 <polygon fill="#191970" stroke="#191970" points="2455.6447,-940.3409 2465.9289,-937.7945 2456.3014,-933.3717 2455.6447,-940.3409"/>
 </g>
-<!-- Node203&#45;&gt;Node202 -->
+<!-- Node205&#45;&gt;Node204 -->
 <g id="edge128" class="edge">
-<title>Node203&#45;&gt;Node202</title>
+<title>Node205&#45;&gt;Node204</title>
 <path fill="none" stroke="#191970" d="M2455.8903,-938.8955C2238.4087,-927.3858 1561.5884,-891.5667 1333.7368,-879.5082"/>
 <polygon fill="#191970" stroke="#191970" points="2455.7247,-942.3915 2465.8957,-939.425 2456.0947,-935.4013 2455.7247,-942.3915"/>
 </g>
@@ -1243,15 +1243,15 @@
 </a>
 </g>
 </g>
-<!-- Node203&#45;&gt;Node21 -->
+<!-- Node205&#45;&gt;Node21 -->
 <g id="edge124" class="edge">
-<title>Node203&#45;&gt;Node21</title>
+<title>Node205&#45;&gt;Node21</title>
 <path fill="none" stroke="#191970" d="M2464.8776,-924.4123C2430.287,-913.8298 2387.1201,-900.6235 2354.3622,-890.6017"/>
 <polygon fill="#191970" stroke="#191970" points="2464.2657,-927.8852 2474.8521,-927.4639 2466.3136,-921.1914 2464.2657,-927.8852"/>
 </g>
-<!-- Node204 -->
+<!-- Node206 -->
 <g id="node45" class="node">
-<title>Node204</title>
+<title>Node206</title>
 <g id="a_node45"><a xlink:href="runtime_2debug_8h.html" target="_top" xlink:title="Helpers for debugging at runtime. ">
 <polygon fill="#ffffff" stroke="#000000" points="2399,-860.5 2399,-890.5 2515,-890.5 2515,-860.5 2399,-860.5"/>
 <text text-anchor="start" x="2407" y="-878.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -1259,15 +1259,15 @@
 </a>
 </g>
 </g>
-<!-- Node203&#45;&gt;Node204 -->
+<!-- Node205&#45;&gt;Node206 -->
 <g id="edge129" class="edge">
-<title>Node203&#45;&gt;Node204</title>
+<title>Node205&#45;&gt;Node206</title>
 <path fill="none" stroke="#191970" d="M2501.7113,-920.2113C2492.0009,-910.5009 2480.8698,-899.3698 2472.1432,-890.6432"/>
 <polygon fill="#191970" stroke="#191970" points="2499.2507,-922.7004 2508.7967,-927.2967 2504.2004,-917.7507 2499.2507,-922.7004"/>
 </g>
-<!-- Node205 -->
+<!-- Node207 -->
 <g id="node46" class="node">
-<title>Node205</title>
+<title>Node207</title>
 <g id="a_node46"><a xlink:href="device__api_8h.html" target="_top" xlink:title="Abstract device memory management API. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="3075,-793.5 3075,-823.5 3191,-823.5 3191,-793.5 3075,-793.5"/>
 <text text-anchor="start" x="3083" y="-811.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -1275,21 +1275,21 @@
 </a>
 </g>
 </g>
-<!-- Node203&#45;&gt;Node205 -->
+<!-- Node205&#45;&gt;Node207 -->
 <g id="edge130" class="edge">
-<title>Node203&#45;&gt;Node205</title>
+<title>Node205&#45;&gt;Node207</title>
 <path fill="none" stroke="#191970" d="M2591.9698,-931.9545C2650.9262,-922.4443 2738.4294,-907.4564 2814,-891 2908.3214,-870.4604 3016.9045,-841.1516 3079.9629,-823.5595"/>
 <polygon fill="#191970" stroke="#191970" points="2591.3512,-928.5089 2582.0321,-933.5493 2592.4604,-935.4205 2591.3512,-928.5089"/>
 </g>
-<!-- Node203&#45;&gt;Node209 -->
+<!-- Node205&#45;&gt;Node211 -->
 <g id="edge131" class="edge">
-<title>Node203&#45;&gt;Node209</title>
+<title>Node205&#45;&gt;Node211</title>
 <path fill="none" stroke="#191970" d="M2455.7934,-933.309C2387.574,-923.8269 2279.7414,-908.0528 2187,-891 2182.4728,-890.1676 2177.7951,-889.2627 2173.0986,-888.3223"/>
 <polygon fill="#191970" stroke="#191970" points="2455.5934,-936.8147 2465.9789,-934.7189 2456.5532,-929.8808 2455.5934,-936.8147"/>
 </g>
-<!-- Node215 -->
+<!-- Node217 -->
 <g id="node48" class="node">
-<title>Node215</title>
+<title>Node217</title>
 <g id="a_node48"><a xlink:href="serializer_8h.html" target="_top" xlink:title="Serializer extension to support TVM data types Include this file to enable serialization of DLDataTyp...">
 <polygon fill="#ffffff" stroke="#000000" points="2533,-860.5 2533,-890.5 2649,-890.5 2649,-860.5 2533,-860.5"/>
 <text text-anchor="start" x="2541" y="-878.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -1297,15 +1297,15 @@
 </a>
 </g>
 </g>
-<!-- Node203&#45;&gt;Node215 -->
+<!-- Node205&#45;&gt;Node217 -->
 <g id="edge149" class="edge">
-<title>Node203&#45;&gt;Node215</title>
+<title>Node205&#45;&gt;Node217</title>
 <path fill="none" stroke="#191970" d="M2540.4236,-919.6385C2549.417,-910.0653 2560.5116,-899.1987 2570.1033,-890.6432"/>
 <polygon fill="#191970" stroke="#191970" points="2537.5928,-917.5485 2533.4425,-927.2967 2542.766,-922.2643 2537.5928,-917.5485"/>
 </g>
-<!-- Node216 -->
+<!-- Node218 -->
 <g id="node49" class="node">
-<title>Node216</title>
+<title>Node218</title>
 <g id="a_node49"><a xlink:href="memory__manager_8h.html" target="_top" xlink:title="Abstract device memory management API. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="2667.5,-860.5 2667.5,-890.5 2804.5,-890.5 2804.5,-860.5 2667.5,-860.5"/>
 <text text-anchor="start" x="2675.5" y="-878.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/runtime</text>
@@ -1313,117 +1313,117 @@
 </a>
 </g>
 </g>
-<!-- Node203&#45;&gt;Node216 -->
+<!-- Node205&#45;&gt;Node218 -->
 <g id="edge151" class="edge">
-<title>Node203&#45;&gt;Node216</title>
+<title>Node205&#45;&gt;Node218</title>
 <path fill="none" stroke="#191970" d="M2581.2327,-924.4123C2614.7176,-913.8298 2656.5048,-900.6235 2688.2156,-890.6017"/>
 <polygon fill="#191970" stroke="#191970" points="2580.0574,-921.1131 2571.5769,-927.4639 2582.1668,-927.7877 2580.0574,-921.1131"/>
 </g>
-<!-- Node209&#45;&gt;Node136 -->
+<!-- Node211&#45;&gt;Node136 -->
 <g id="edge134" class="edge">
-<title>Node209&#45;&gt;Node136</title>
+<title>Node211&#45;&gt;Node136</title>
 <path fill="none" stroke="#191970" d="M2046.8808,-873.858C1901.9328,-869.7408 1573.0307,-856.8522 1535,-824 1501.603,-795.1505 1511.9308,-769.5624 1519,-726 1531.157,-651.0853 1567.4032,-448.9917 1627,-402 1699.864,-344.5473 1952.7672,-387.6338 2043,-366 2107.8274,-350.4573 2179.0622,-317.7714 2217.8482,-298.5349"/>
 <polygon fill="#191970" stroke="#191970" points="2046.8991,-877.3598 2056.9927,-874.1394 2047.0939,-870.3625 2046.8991,-877.3598"/>
 </g>
-<!-- Node209&#45;&gt;Node39 -->
+<!-- Node211&#45;&gt;Node39 -->
 <g id="edge143" class="edge">
-<title>Node209&#45;&gt;Node39</title>
+<title>Node211&#45;&gt;Node39</title>
 <path fill="none" stroke="#191970" d="M2183.3711,-864.962C2195.5574,-863.2108 2208.144,-861.4841 2220,-860 2375.1653,-840.577 2420.5592,-870.0539 2570,-824 2703.289,-782.9236 2759.25,-752.519 2811,-623 2869.7101,-476.0614 2944.8249,-384.7316 2838,-268 2832.4554,-261.9412 2623.5017,-237.8658 2507.214,-224.8901"/>
 <polygon fill="#191970" stroke="#191970" points="2182.6454,-861.5307 2173.2536,-866.4341 2183.6533,-868.4578 2182.6454,-861.5307"/>
 </g>
-<!-- Node209&#45;&gt;Node35 -->
+<!-- Node211&#45;&gt;Node35 -->
 <g id="edge137" class="edge">
-<title>Node209&#45;&gt;Node35</title>
+<title>Node211&#45;&gt;Node35</title>
 <path fill="none" stroke="#191970" d="M2183.3539,-864.8204C2195.542,-863.0844 2208.1337,-861.3994 2220,-860 2400.3453,-838.7312 2453.1704,-876.5362 2627,-824 2843.4636,-758.5786 2933,-643.6337 2933,-417.5 2933,-417.5 2933,-417.5 2933,-216.5 2933,-171.4217 2911.5201,-158.9858 2874,-134 2849.7351,-117.8412 2780.3703,-103.4143 2725.0098,-93.9704"/>
 <polygon fill="#191970" stroke="#191970" points="2182.6307,-861.3886 2173.2355,-866.2856 2183.6339,-868.3163 2182.6307,-861.3886"/>
 </g>
-<!-- Node209&#45;&gt;Node36 -->
+<!-- Node211&#45;&gt;Node36 -->
 <g id="edge145" class="edge">
-<title>Node209&#45;&gt;Node36</title>
+<title>Node211&#45;&gt;Node36</title>
 <path fill="none" stroke="#191970" d="M2183.3079,-864.3907C2195.501,-862.7007 2208.1062,-861.1425 2220,-860 2397.1423,-842.9839 2854.1111,-888.4191 3020,-824 3032.0779,-819.3098 3199,-687.4566 3199,-674.5 3199,-674.5 3199,-674.5 3199,-149.5 3199,-53.1249 2876.1887,-25.7697 2725.1862,-18.2457"/>
 <polygon fill="#191970" stroke="#191970" points="2182.5925,-860.9572 2173.1873,-865.835 2183.5815,-867.887 2182.5925,-860.9572"/>
 </g>
-<!-- Node209&#45;&gt;Node37 -->
+<!-- Node211&#45;&gt;Node37 -->
 <g id="edge146" class="edge">
-<title>Node209&#45;&gt;Node37</title>
+<title>Node211&#45;&gt;Node37</title>
 <path fill="none" stroke="#191970" d="M2046.8274,-874.7714C1896.2437,-872.4539 1545.7246,-862.8948 1510,-824 1480.5366,-791.922 1509.863,-769.5553 1510,-726 1510.2655,-641.5548 1517.7057,-620.1789 1511,-536 1506.7644,-482.8291 1492,-470.8394 1492,-417.5 1492,-417.5 1492,-417.5 1492,-350.5 1492,-245.0627 1838.5213,-182.5332 1995.8771,-159.574"/>
 <polygon fill="#191970" stroke="#191970" points="2046.8337,-878.2718 2056.8843,-874.92 2046.9372,-871.2726 2046.8337,-878.2718"/>
 </g>
-<!-- Node209&#45;&gt;Node38 -->
+<!-- Node211&#45;&gt;Node38 -->
 <g id="edge135" class="edge">
-<title>Node209&#45;&gt;Node38</title>
+<title>Node211&#45;&gt;Node38</title>
 <path fill="none" stroke="#191970" d="M2183.0314,-865.1298C2274.4047,-851.1366 2426.8091,-827.5427 2439,-824 2511.0678,-803.0566 2526.4226,-789.645 2594,-757 2650.7573,-729.582 2686.6259,-744.0838 2719,-690 2794.4285,-563.9899 2677.7033,-485.6584 2557,-402 2534.1784,-386.1825 2505.7623,-374.0357 2481.695,-365.5221"/>
 <polygon fill="#191970" stroke="#191970" points="2182.3787,-861.6888 2173.0233,-866.6614 2183.4377,-868.6082 2182.3787,-861.6888"/>
 </g>
-<!-- Node209&#45;&gt;Node138 -->
+<!-- Node211&#45;&gt;Node138 -->
 <g id="edge136" class="edge">
-<title>Node209&#45;&gt;Node138</title>
+<title>Node211&#45;&gt;Node138</title>
 <path fill="none" stroke="#191970" d="M2183.1879,-865.273C2195.4388,-863.4815 2208.0947,-861.6607 2220,-860 2460.8164,-826.4076 2564.9494,-858.7551 2740,-690 2767.7361,-663.2614 2782.0187,-659.273 2795,-623 2837.7248,-503.6166 2779.9737,-347.2286 2759.621,-298.5932"/>
 <polygon fill="#191970" stroke="#191970" points="2182.4029,-861.8507 2173.0177,-866.7668 2183.4202,-868.7764 2182.4029,-861.8507"/>
 </g>
-<!-- Node209&#45;&gt;Node139 -->
+<!-- Node211&#45;&gt;Node139 -->
 <g id="edge141" class="edge">
-<title>Node209&#45;&gt;Node139</title>
+<title>Node211&#45;&gt;Node139</title>
 <path fill="none" stroke="#191970" d="M2046.4015,-872.1988C1909.5224,-865.1493 1612.1374,-847.3458 1572,-824 1540.5579,-805.7118 1535.9528,-791.6857 1525,-757 1520.8513,-743.8617 1523.5774,-739.7041 1525,-726 1528.7981,-689.4123 1566.1367,-427.1286 1593,-402 1613.7389,-382.6003 1781.1915,-365.344 1881.937,-356.5852"/>
 <polygon fill="#191970" stroke="#191970" points="2046.5999,-875.7134 2056.7653,-872.7278 2046.9567,-868.7225 2046.5999,-875.7134"/>
 </g>
-<!-- Node209&#45;&gt;Node34 -->
+<!-- Node211&#45;&gt;Node34 -->
 <g id="edge133" class="edge">
-<title>Node209&#45;&gt;Node34</title>
+<title>Node211&#45;&gt;Node34</title>
 <path fill="none" stroke="#191970" d="M2183.3099,-864.4111C2195.5028,-862.7189 2208.1074,-861.1547 2220,-860 2389.3915,-843.5528 2827.2561,-887.8816 2985,-824 3005.8685,-815.5489 3048.9324,-776.0076 3061,-757 3081.4675,-724.7618 3085,-712.6867 3085,-674.5 3085,-674.5 3085,-674.5 3085,-551.5 3085,-506.031 3060.2031,-456.6991 3046.2549,-432.5465"/>
 <polygon fill="#191970" stroke="#191970" points="2182.5941,-860.9777 2173.1894,-865.8564 2183.5838,-867.9074 2182.5941,-860.9777"/>
 </g>
-<!-- Node209&#45;&gt;Node41 -->
+<!-- Node211&#45;&gt;Node41 -->
 <g id="edge140" class="edge">
-<title>Node209&#45;&gt;Node41</title>
+<title>Node211&#45;&gt;Node41</title>
 <path fill="none" stroke="#191970" d="M2183.3745,-862.3546C2304.6605,-838.3807 2547.1204,-787.2459 2572,-757 2657.5434,-653.0058 2513.3494,-484.2273 2464.9681,-432.8328"/>
 <polygon fill="#191970" stroke="#191970" points="2182.5755,-858.9446 2173.4403,-864.311 2183.9282,-865.8127 2182.5755,-858.9446"/>
 </g>
-<!-- Node209&#45;&gt;Node42 -->
+<!-- Node211&#45;&gt;Node42 -->
 <g id="edge144" class="edge">
-<title>Node209&#45;&gt;Node42</title>
+<title>Node211&#45;&gt;Node42</title>
 <path fill="none" stroke="#191970" d="M2046.6012,-873.4516C1904.3991,-867.4517 1585.6655,-844.3292 1533,-757 1525.8848,-745.2017 1529.4197,-739.3045 1533,-726 1541.8299,-693.1877 1667.1137,-489.8427 1702.6597,-432.5247"/>
 <polygon fill="#191970" stroke="#191970" points="2046.8049,-876.9627 2056.9384,-873.8706 2047.0884,-869.9684 2046.8049,-876.9627"/>
 </g>
-<!-- Node209&#45;&gt;Node151 -->
+<!-- Node211&#45;&gt;Node153 -->
 <g id="edge132" class="edge">
-<title>Node209&#45;&gt;Node151</title>
+<title>Node211&#45;&gt;Node153</title>
 <path fill="none" stroke="#191970" d="M2046.7074,-871.2955C1917.5825,-863.208 1628.807,-844.4646 1386,-824 1159.5106,-804.9107 572.6735,-796.7331 372,-690 294.1703,-648.6044 198.9491,-609.9065 247,-536 261.2926,-514.0168 287.3627,-501.343 311.1727,-494.0739"/>
 <polygon fill="#191970" stroke="#191970" points="2046.6438,-874.7982 2056.8427,-871.9289 2047.0805,-867.8118 2046.6438,-874.7982"/>
 </g>
-<!-- Node209&#45;&gt;Node144 -->
+<!-- Node211&#45;&gt;Node146 -->
 <g id="edge142" class="edge">
-<title>Node209&#45;&gt;Node144</title>
+<title>Node211&#45;&gt;Node146</title>
 <path fill="none" stroke="#191970" d="M2046.621,-863.7216C1948.9676,-845.3473 1777.3093,-806.7491 1741,-757 1697.2918,-697.1133 1751.198,-602.9849 1775.392,-566.5767"/>
 <polygon fill="#191970" stroke="#191970" points="2046.23,-867.2088 2056.7013,-865.596 2047.5097,-860.3268 2046.23,-867.2088"/>
 </g>
-<!-- Node209&#45;&gt;Node190 -->
+<!-- Node211&#45;&gt;Node192 -->
 <g id="edge147" class="edge">
-<title>Node209&#45;&gt;Node190</title>
+<title>Node211&#45;&gt;Node192</title>
 <path fill="none" stroke="#191970" d="M2130.183,-851.8428C2137.6505,-840.2073 2146.2338,-826.8334 2151.8677,-818.055"/>
 <polygon fill="#191970" stroke="#191970" points="2127.2131,-849.9903 2124.7574,-860.2967 2133.1042,-853.7712 2127.2131,-849.9903"/>
 </g>
-<!-- Node209&#45;&gt;Node142 -->
+<!-- Node211&#45;&gt;Node142 -->
 <g id="edge138" class="edge">
-<title>Node209&#45;&gt;Node142</title>
+<title>Node211&#45;&gt;Node142</title>
 <path fill="none" stroke="#191970" d="M2046.9803,-871.8842C1947.2273,-865.7442 1766.7286,-851.2124 1708,-824 1674.5361,-808.4942 1644.9844,-775.9537 1629.4863,-756.6983"/>
 <polygon fill="#191970" stroke="#191970" points="2046.7711,-875.3778 2056.964,-872.4872 2047.1933,-868.3905 2046.7711,-875.3778"/>
 </g>
-<!-- Node209&#45;&gt;Node143 -->
+<!-- Node211&#45;&gt;Node143 -->
 <g id="edge139" class="edge">
-<title>Node209&#45;&gt;Node143</title>
+<title>Node211&#45;&gt;Node143</title>
 <path fill="none" stroke="#191970" d="M2073.1461,-856.0937C2015.0284,-829.1464 1911.1114,-780.9634 1858.4711,-756.5558"/>
 <polygon fill="#191970" stroke="#191970" points="2071.8653,-859.3577 2082.4098,-860.389 2074.8099,-853.0071 2071.8653,-859.3577"/>
 </g>
-<!-- Node209&#45;&gt;Node205 -->
+<!-- Node211&#45;&gt;Node207 -->
 <g id="edge148" class="edge">
-<title>Node209&#45;&gt;Node205</title>
+<title>Node211&#45;&gt;Node207</title>
 <path fill="none" stroke="#191970" d="M2183.3058,-864.3689C2195.4991,-862.6812 2208.1049,-861.1295 2220,-860 2592.4448,-824.6348 2689.9125,-871.5381 3061,-824 3065.5658,-823.4151 3070.2713,-822.6882 3074.9869,-821.8708"/>
 <polygon fill="#191970" stroke="#191970" points="2182.5908,-860.9354 2173.1851,-865.8122 2183.5791,-867.8652 2182.5908,-860.9354"/>
 </g>
-<!-- Node215&#45;&gt;Node203 -->
+<!-- Node217&#45;&gt;Node205 -->
 <g id="edge150" class="edge">
-<title>Node215&#45;&gt;Node203</title>
+<title>Node217&#45;&gt;Node205</title>
 <path fill="none" stroke="#191970" d="M2574.6396,-898.2943C2565.656,-907.8614 2554.5628,-918.7303 2544.9642,-927.2967"/>
 <polygon fill="#191970" stroke="#191970" points="2577.4628,-900.3924 2581.6104,-890.6432 2572.2883,-895.678 2577.4628,-900.3924"/>
 </g>
diff --git a/docs/reference/api/doxygen/dir_000003_000017.html b/docs/reference/api/doxygen/dir_000003_000020.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000003_000017.html
rename to docs/reference/api/doxygen/dir_000003_000020.html
diff --git a/docs/reference/api/doxygen/dir_000003_000031.html b/docs/reference/api/doxygen/dir_000003_000034.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000003_000031.html
rename to docs/reference/api/doxygen/dir_000003_000034.html
diff --git a/docs/reference/api/doxygen/dir_000003_000032.html b/docs/reference/api/doxygen/dir_000003_000035.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000003_000032.html
rename to docs/reference/api/doxygen/dir_000003_000035.html
diff --git a/docs/reference/api/doxygen/dir_000004_000011.html b/docs/reference/api/doxygen/dir_000004_000011.html
index cda03dd014..8c62a2fa0d 100644
--- a/docs/reference/api/doxygen/dir_000004_000011.html
+++ b/docs/reference/api/doxygen/dir_000004_000011.html
@@ -62,7 +62,7 @@ $(function() {
 </div>
 </div><!-- top -->
 <div class="contents">
-<h3>meta_schedule &rarr; tir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/meta_schedule</th><th class="dirtab">Includes file in include/tvm/tir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="arg__info_8h.html">arg_info.h</a></td><td class="dirtab"><a class="el" href="tir_2function_8h.html">function.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="meta__schedule_2cost__model_8h.html">cost_model.h</ [...]
+<h3>meta_schedule &rarr; tir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/meta_schedule</th><th class="dirtab">Includes file in include/tvm/tir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="arg__info_8h.html">arg_info.h</a></td><td class="dirtab"><a class="el" href="tir_2function_8h.html">function.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="meta__schedule_2cost__model_8h.html">cost_model.h</ [...]
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
 Generated by &#160;<a href="http://www.doxygen.org/index.html">
diff --git a/docs/reference/api/doxygen/dir_000004_000017.html b/docs/reference/api/doxygen/dir_000004_000020.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000004_000017.html
rename to docs/reference/api/doxygen/dir_000004_000020.html
diff --git a/docs/reference/api/doxygen/dir_000004_000031.html b/docs/reference/api/doxygen/dir_000004_000034.html
similarity index 100%
copy from docs/reference/api/doxygen/dir_000004_000031.html
copy to docs/reference/api/doxygen/dir_000004_000034.html
diff --git a/docs/reference/api/doxygen/dir_000005_000017.html b/docs/reference/api/doxygen/dir_000005_000020.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000005_000017.html
rename to docs/reference/api/doxygen/dir_000005_000020.html
diff --git a/docs/reference/api/doxygen/dir_000005_000031.html b/docs/reference/api/doxygen/dir_000005_000034.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000005_000031.html
rename to docs/reference/api/doxygen/dir_000005_000034.html
diff --git a/docs/reference/api/doxygen/dir_000005_000032.html b/docs/reference/api/doxygen/dir_000005_000035.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000005_000032.html
rename to docs/reference/api/doxygen/dir_000005_000035.html
diff --git a/docs/reference/api/doxygen/dir_000006_000031.html b/docs/reference/api/doxygen/dir_000006_000034.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000006_000031.html
rename to docs/reference/api/doxygen/dir_000006_000034.html
diff --git a/docs/reference/api/doxygen/dir_000006_000032.html b/docs/reference/api/doxygen/dir_000006_000035.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000006_000032.html
rename to docs/reference/api/doxygen/dir_000006_000035.html
diff --git a/docs/reference/api/doxygen/dir_000007_000017.html b/docs/reference/api/doxygen/dir_000007_000020.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000007_000017.html
rename to docs/reference/api/doxygen/dir_000007_000020.html
diff --git a/docs/reference/api/doxygen/dir_000007_000018.html b/docs/reference/api/doxygen/dir_000007_000021.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000007_000018.html
rename to docs/reference/api/doxygen/dir_000007_000021.html
diff --git a/docs/reference/api/doxygen/dir_000007_000031.html b/docs/reference/api/doxygen/dir_000007_000034.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000007_000031.html
rename to docs/reference/api/doxygen/dir_000007_000034.html
diff --git a/docs/reference/api/doxygen/dir_000011_000017.html b/docs/reference/api/doxygen/dir_000011_000020.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000011_000017.html
rename to docs/reference/api/doxygen/dir_000011_000020.html
diff --git a/docs/reference/api/doxygen/dir_000011_000031.html b/docs/reference/api/doxygen/dir_000011_000034.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000011_000031.html
rename to docs/reference/api/doxygen/dir_000011_000034.html
diff --git a/docs/reference/api/doxygen/dir_000014_000031.html b/docs/reference/api/doxygen/dir_000014_000034.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000014_000031.html
rename to docs/reference/api/doxygen/dir_000014_000034.html
diff --git a/docs/reference/api/doxygen/dir_000015_000031.html b/docs/reference/api/doxygen/dir_000015_000034.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000015_000031.html
rename to docs/reference/api/doxygen/dir_000015_000034.html
diff --git a/docs/reference/api/doxygen/dir_000016_000031.html b/docs/reference/api/doxygen/dir_000016_000034.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000016_000031.html
rename to docs/reference/api/doxygen/dir_000016_000034.html
diff --git a/docs/reference/api/doxygen/dir_000016_000032.html b/docs/reference/api/doxygen/dir_000016_000035.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000016_000032.html
rename to docs/reference/api/doxygen/dir_000016_000035.html
diff --git a/docs/reference/api/doxygen/dir_000017_000008.html b/docs/reference/api/doxygen/dir_000017_000008.html
deleted file mode 100644
index ec3402ae9e..0000000000
--- a/docs/reference/api/doxygen/dir_000017_000008.html
+++ /dev/null
@@ -1,73 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head>
-<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
-<meta http-equiv="X-UA-Compatible" content="IE=9"/>
-<meta name="generator" content="Doxygen 1.8.13"/>
-<meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/node -&gt; runtime Relation</title>
-<link href="tabs.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="jquery.js"></script>
-<script type="text/javascript" src="dynsections.js"></script>
-<link href="search/search.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="search/searchdata.js"></script>
-<script type="text/javascript" src="search/search.js"></script>
-<link href="doxygen.css" rel="stylesheet" type="text/css" />
-</head>
-<body>
-<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
-<div id="titlearea">
-<table cellspacing="0" cellpadding="0">
- <tbody>
- <tr style="height: 56px;">
-  <td id="projectalign" style="padding-left: 0.5em;">
-   <div id="projectname">tvm
-   </div>
-  </td>
- </tr>
- </tbody>
-</table>
-</div>
-<!-- end header part -->
-<!-- Generated by Doxygen 1.8.13 -->
-<script type="text/javascript">
-var searchBox = new SearchBox("searchBox", "search",false,'Search');
-</script>
-<script type="text/javascript" src="menudata.js"></script>
-<script type="text/javascript" src="menu.js"></script>
-<script type="text/javascript">
-$(function() {
-  initMenu('',true,false,'search.php','Search');
-  $(document).ready(function() { init_search(); });
-});
-</script>
-<div id="main-nav"></div>
-<!-- window showing the filter options -->
-<div id="MSearchSelectWindow"
-     onmouseover="return searchBox.OnSearchSelectShow()"
-     onmouseout="return searchBox.OnSearchSelectHide()"
-     onkeydown="return searchBox.OnSearchSelectKey(event)">
-</div>
-
-<!-- iframe showing the search results (closed by default) -->
-<div id="MSearchResultsWindow">
-<iframe src="javascript:void(0)" frameborder="0" 
-        name="MSearchResults" id="MSearchResults">
-</iframe>
-</div>
-
-<div id="nav-path" class="navpath">
-  <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_6cd4295f6ad5aa17e5b568d0e5b190e5.html">node</a></li>  </ul>
-</div>
-</div><!-- top -->
-<div class="contents">
-<h3>node &rarr; runtime Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/node</th><th class="dirtab">Includes file in include/tvm/runtime</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="attr__registry__map_8h.html">attr_registry_map.h</a></td><td class="dirtab"><a class="el" href="dir_5603323dd0cfe2b74c32570eba23c68c.html">container</a>&#160;/&#160;<a class="el" href="string_8h.html">string.h</a></td></tr><tr class="dirtab">< [...]
-<!-- start footer part -->
-<hr class="footer"/><address class="footer"><small>
-Generated by &#160;<a href="http://www.doxygen.org/index.html">
-<img class="footer" src="doxygen.png" alt="doxygen"/>
-</a> 1.8.13
-</small></address>
-</body>
-</html>
diff --git a/docs/reference/api/doxygen/dir_000037_000038.html b/docs/reference/api/doxygen/dir_000017_000011.html
similarity index 67%
copy from docs/reference/api/doxygen/dir_000037_000038.html
copy to docs/reference/api/doxygen/dir_000017_000011.html
index ffd59c2f70..0d175d7593 100644
--- a/docs/reference/api/doxygen/dir_000037_000038.html
+++ b/docs/reference/api/doxygen/dir_000017_000011.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=9"/>
 <meta name="generator" content="Doxygen 1.8.13"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/topi/cuda -&gt; nn Relation</title>
+<title>tvm: include/tvm/meta_schedule/schedule -&gt; tir Relation</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -58,11 +58,11 @@ $(function() {
 
 <div id="nav-path" class="navpath">
   <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_8e4e25e66b8623d88c5b5dd2040bca97.html">topi</a></li><li class="navelem"><a class="el" href="dir_ac57496531ccbad72f774fa62e6de987.html">cuda</a></li>  </ul>
+<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_4378f18824ae7d4ad48f8d7785cd7ac8.html">meta_schedule</a></li><li class="navelem"><a class="el" href="dir_af4961563c20a83bf971a498792e6dee.html">schedule</a></li>  </ul>
 </div>
 </div><!-- top -->
 <div class="contents">
-<h3>cuda &rarr; nn Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/topi/cuda</th><th class="dirtab">Includes file in include/tvm/topi/nn</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="cuda_2dense_8h.html">dense.h</a></td><td class="dirtab"><a class="el" href="nn_2dense_8h.html">dense.h</a></td></tr></table></div><!-- contents -->
+<h3>schedule &rarr; tir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/meta_schedule/schedule</th><th class="dirtab">Includes file in include/tvm/tir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="dir_c20c9fad5dedbc870b2ada04754d1b9b.html">cuda</a>&#160;/&#160;<a class="el" href="thread__bind_8h.html">thread_bind.h</a></td><td class="dirtab"><a class="el" href="dir_006b1f4ac353a18abb55f74cc4796db6.html">schedule</a>&#160;/ [...]
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
 Generated by &#160;<a href="http://www.doxygen.org/index.html">
diff --git a/docs/reference/api/doxygen/dir_000018_000007.html b/docs/reference/api/doxygen/dir_000018_000007.html
deleted file mode 100644
index ffa20f89a6..0000000000
--- a/docs/reference/api/doxygen/dir_000018_000007.html
+++ /dev/null
@@ -1,73 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head>
-<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
-<meta http-equiv="X-UA-Compatible" content="IE=9"/>
-<meta name="generator" content="Doxygen 1.8.13"/>
-<meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/parser -&gt; ir Relation</title>
-<link href="tabs.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="jquery.js"></script>
-<script type="text/javascript" src="dynsections.js"></script>
-<link href="search/search.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="search/searchdata.js"></script>
-<script type="text/javascript" src="search/search.js"></script>
-<link href="doxygen.css" rel="stylesheet" type="text/css" />
-</head>
-<body>
-<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
-<div id="titlearea">
-<table cellspacing="0" cellpadding="0">
- <tbody>
- <tr style="height: 56px;">
-  <td id="projectalign" style="padding-left: 0.5em;">
-   <div id="projectname">tvm
-   </div>
-  </td>
- </tr>
- </tbody>
-</table>
-</div>
-<!-- end header part -->
-<!-- Generated by Doxygen 1.8.13 -->
-<script type="text/javascript">
-var searchBox = new SearchBox("searchBox", "search",false,'Search');
-</script>
-<script type="text/javascript" src="menudata.js"></script>
-<script type="text/javascript" src="menu.js"></script>
-<script type="text/javascript">
-$(function() {
-  initMenu('',true,false,'search.php','Search');
-  $(document).ready(function() { init_search(); });
-});
-</script>
-<div id="main-nav"></div>
-<!-- window showing the filter options -->
-<div id="MSearchSelectWindow"
-     onmouseover="return searchBox.OnSearchSelectShow()"
-     onmouseout="return searchBox.OnSearchSelectHide()"
-     onkeydown="return searchBox.OnSearchSelectKey(event)">
-</div>
-
-<!-- iframe showing the search results (closed by default) -->
-<div id="MSearchResultsWindow">
-<iframe src="javascript:void(0)" frameborder="0" 
-        name="MSearchResults" id="MSearchResults">
-</iframe>
-</div>
-
-<div id="nav-path" class="navpath">
-  <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_fafc18f54a755f417c55c769623cbfef.html">parser</a></li>  </ul>
-</div>
-</div><!-- top -->
-<div class="contents">
-<h3>parser &rarr; ir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/parser</th><th class="dirtab">Includes file in include/tvm/ir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="parser_8h.html">parser.h</a></td><td class="dirtab"><a class="el" href="ir_2module_8h.html">module.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="parser_8h.html">parser.h</a></td><td class="dirtab"><a class="el" href="ir_2t [...]
-<!-- start footer part -->
-<hr class="footer"/><address class="footer"><small>
-Generated by &#160;<a href="http://www.doxygen.org/index.html">
-<img class="footer" src="doxygen.png" alt="doxygen"/>
-</a> 1.8.13
-</small></address>
-</body>
-</html>
diff --git a/docs/reference/api/doxygen/dir_000037_000036.html b/docs/reference/api/doxygen/dir_000018_000011.html
similarity index 76%
copy from docs/reference/api/doxygen/dir_000037_000036.html
copy to docs/reference/api/doxygen/dir_000018_000011.html
index eca8758fb7..150fcbd15d 100644
--- a/docs/reference/api/doxygen/dir_000037_000036.html
+++ b/docs/reference/api/doxygen/dir_000018_000011.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=9"/>
 <meta name="generator" content="Doxygen 1.8.13"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/topi/cuda -&gt; contrib Relation</title>
+<title>tvm: include/tvm/meta_schedule/schedule/cuda -&gt; tir Relation</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -58,11 +58,11 @@ $(function() {
 
 <div id="nav-path" class="navpath">
   <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_8e4e25e66b8623d88c5b5dd2040bca97.html">topi</a></li><li class="navelem"><a class="el" href="dir_ac57496531ccbad72f774fa62e6de987.html">cuda</a></li>  </ul>
+<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_4378f18824ae7d4ad48f8d7785cd7ac8.html">meta_schedule</a></li><li class="navelem"><a class="el" href="dir_af4961563c20a83bf971a498792e6dee.html">schedule</a></li><li class="navelem"><a class="el" href="dir_c20c9fad5dedbc870b2ada04754d1b9b.html">cuda</a></ [...]
 </div>
 </div><!-- top -->
 <div class="contents">
-<h3>cuda &rarr; contrib Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/topi/cuda</th><th class="dirtab">Includes file in include/tvm/topi/contrib</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="cuda_2dense_8h.html">dense.h</a></td><td class="dirtab"><a class="el" href="cublas_8h.html">cublas.h</a></td></tr></table></div><!-- contents -->
+<h3>cuda &rarr; tir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/meta_schedule/schedule/cuda</th><th class="dirtab">Includes file in include/tvm/tir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="thread__bind_8h.html">thread_bind.h</a></td><td class="dirtab"><a class="el" href="dir_006b1f4ac353a18abb55f74cc4796db6.html">schedule</a>&#160;/&#160;<a class="el" href="tir_2schedule_2schedule_8h.html">schedule.h</a></td></tr> [...]
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
 Generated by &#160;<a href="http://www.doxygen.org/index.html">
diff --git a/docs/reference/api/doxygen/dir_000019_000009.html b/docs/reference/api/doxygen/dir_000019_000009.html
deleted file mode 100644
index 467d80f875..0000000000
--- a/docs/reference/api/doxygen/dir_000019_000009.html
+++ /dev/null
@@ -1,73 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head>
-<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
-<meta http-equiv="X-UA-Compatible" content="IE=9"/>
-<meta name="generator" content="Doxygen 1.8.13"/>
-<meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/runtime/vm -&gt; container Relation</title>
-<link href="tabs.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="jquery.js"></script>
-<script type="text/javascript" src="dynsections.js"></script>
-<link href="search/search.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="search/searchdata.js"></script>
-<script type="text/javascript" src="search/search.js"></script>
-<link href="doxygen.css" rel="stylesheet" type="text/css" />
-</head>
-<body>
-<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
-<div id="titlearea">
-<table cellspacing="0" cellpadding="0">
- <tbody>
- <tr style="height: 56px;">
-  <td id="projectalign" style="padding-left: 0.5em;">
-   <div id="projectname">tvm
-   </div>
-  </td>
- </tr>
- </tbody>
-</table>
-</div>
-<!-- end header part -->
-<!-- Generated by Doxygen 1.8.13 -->
-<script type="text/javascript">
-var searchBox = new SearchBox("searchBox", "search",false,'Search');
-</script>
-<script type="text/javascript" src="menudata.js"></script>
-<script type="text/javascript" src="menu.js"></script>
-<script type="text/javascript">
-$(function() {
-  initMenu('',true,false,'search.php','Search');
-  $(document).ready(function() { init_search(); });
-});
-</script>
-<div id="main-nav"></div>
-<!-- window showing the filter options -->
-<div id="MSearchSelectWindow"
-     onmouseover="return searchBox.OnSearchSelectShow()"
-     onmouseout="return searchBox.OnSearchSelectHide()"
-     onkeydown="return searchBox.OnSearchSelectKey(event)">
-</div>
-
-<!-- iframe showing the search results (closed by default) -->
-<div id="MSearchResultsWindow">
-<iframe src="javascript:void(0)" frameborder="0" 
-        name="MSearchResults" id="MSearchResults">
-</iframe>
-</div>
-
-<div id="nav-path" class="navpath">
-  <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_404558507ed35459f0d7a6d81d8c508d.html">runtime</a></li><li class="navelem"><a class="el" href="dir_5baffeed82c1190bfdf7a4f918ab5ac6.html">vm</a></li>  </ul>
-</div>
-</div><!-- top -->
-<div class="contents">
-<h3>vm &rarr; container Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/runtime/vm</th><th class="dirtab">Includes file in include/tvm/runtime/container</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="executable_8h.html">executable.h</a></td><td class="dirtab"><a class="el" href="map_8h.html">map.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="executable_8h.html">executable.h</a></td><td class="dirta [...]
-<!-- start footer part -->
-<hr class="footer"/><address class="footer"><small>
-Generated by &#160;<a href="http://www.doxygen.org/index.html">
-<img class="footer" src="doxygen.png" alt="doxygen"/>
-</a> 1.8.13
-</small></address>
-</body>
-</html>
diff --git a/docs/reference/api/doxygen/dir_000033_000007.html b/docs/reference/api/doxygen/dir_000019_000011.html
similarity index 74%
copy from docs/reference/api/doxygen/dir_000033_000007.html
copy to docs/reference/api/doxygen/dir_000019_000011.html
index 91b24f88f7..74a0682eec 100644
--- a/docs/reference/api/doxygen/dir_000033_000007.html
+++ b/docs/reference/api/doxygen/dir_000019_000011.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=9"/>
 <meta name="generator" content="Doxygen 1.8.13"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/tir/schedule -&gt; ir Relation</title>
+<title>tvm: include/tvm/meta_schedule/schedule/generic -&gt; tir Relation</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -58,11 +58,11 @@ $(function() {
 
 <div id="nav-path" class="navpath">
   <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_72c2f11201cd7636dc7624de0754daa5.html">tir</a></li><li class="navelem"><a class="el" href="dir_006b1f4ac353a18abb55f74cc4796db6.html">schedule</a></li>  </ul>
+<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_4378f18824ae7d4ad48f8d7785cd7ac8.html">meta_schedule</a></li><li class="navelem"><a class="el" href="dir_af4961563c20a83bf971a498792e6dee.html">schedule</a></li><li class="navelem"><a class="el" href="dir_437a885699bf6787e92bcac6040bb86f.html">generic</a [...]
 </div>
 </div><!-- top -->
 <div class="contents">
-<h3>schedule &rarr; ir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/tir/schedule</th><th class="dirtab">Includes file in include/tvm/ir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="state_8h.html">state.h</a></td><td class="dirtab"><a class="el" href="ir_2module_8h.html">module.h</a></td></tr></table></div><!-- contents -->
+<h3>generic &rarr; tir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/meta_schedule/schedule/generic</th><th class="dirtab">Includes file in include/tvm/tir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="winograd_8h.html">winograd.h</a></td><td class="dirtab"><a class="el" href="dir_006b1f4ac353a18abb55f74cc4796db6.html">schedule</a>&#160;/&#160;<a class="el" href="tir_2schedule_2schedule_8h.html">schedule.h</a></td></tr>< [...]
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
 Generated by &#160;<a href="http://www.doxygen.org/index.html">
diff --git a/docs/reference/api/doxygen/dir_000020_000008.html b/docs/reference/api/doxygen/dir_000020_000008.html
index 728ff1aaec..ec3402ae9e 100644
--- a/docs/reference/api/doxygen/dir_000020_000008.html
+++ b/docs/reference/api/doxygen/dir_000020_000008.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=9"/>
 <meta name="generator" content="Doxygen 1.8.13"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/script -&gt; runtime Relation</title>
+<title>tvm: include/tvm/node -&gt; runtime Relation</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -58,11 +58,11 @@ $(function() {
 
 <div id="nav-path" class="navpath">
   <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_84875704194fd544d29fe0c7fedd8939.html">script</a></li>  </ul>
+<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_6cd4295f6ad5aa17e5b568d0e5b190e5.html">node</a></li>  </ul>
 </div>
 </div><!-- top -->
 <div class="contents">
-<h3>script &rarr; runtime Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/script</th><th class="dirtab">Includes file in include/tvm/runtime</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="dir_a59a89c7dd2e4e6561fe59bf359ce2f3.html">printer</a>&#160;/&#160;<a class="el" href="doc_8h.html">doc.h</a></td><td class="dirtab"><a class="el" href="data__type_8h.html">data_type.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a  [...]
+<h3>node &rarr; runtime Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/node</th><th class="dirtab">Includes file in include/tvm/runtime</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="attr__registry__map_8h.html">attr_registry_map.h</a></td><td class="dirtab"><a class="el" href="dir_5603323dd0cfe2b74c32570eba23c68c.html">container</a>&#160;/&#160;<a class="el" href="string_8h.html">string.h</a></td></tr><tr class="dirtab">< [...]
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
 Generated by &#160;<a href="http://www.doxygen.org/index.html">
diff --git a/docs/reference/api/doxygen/dir_000021_000007.html b/docs/reference/api/doxygen/dir_000021_000007.html
index 4353af73c5..ffa20f89a6 100644
--- a/docs/reference/api/doxygen/dir_000021_000007.html
+++ b/docs/reference/api/doxygen/dir_000021_000007.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=9"/>
 <meta name="generator" content="Doxygen 1.8.13"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/script/ir_builder -&gt; ir Relation</title>
+<title>tvm: include/tvm/parser -&gt; ir Relation</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -58,11 +58,11 @@ $(function() {
 
 <div id="nav-path" class="navpath">
   <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_84875704194fd544d29fe0c7fedd8939.html">script</a></li><li class="navelem"><a class="el" href="dir_9e615ec4a59e46584bcc4e2226e148a2.html">ir_builder</a></li>  </ul>
+<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_fafc18f54a755f417c55c769623cbfef.html">parser</a></li>  </ul>
 </div>
 </div><!-- top -->
 <div class="contents">
-<h3>ir_builder &rarr; ir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/script/ir_builder</th><th class="dirtab">Includes file in include/tvm/ir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="script_2ir__builder_2base_8h.html">base.h</a></td><td class="dirtab"><a class="el" href="ir_2expr_8h.html">expr.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="script_2ir__builder_2base_8h.html">base.h</a></td [...]
+<h3>parser &rarr; ir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/parser</th><th class="dirtab">Includes file in include/tvm/ir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="parser_8h.html">parser.h</a></td><td class="dirtab"><a class="el" href="ir_2module_8h.html">module.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="parser_8h.html">parser.h</a></td><td class="dirtab"><a class="el" href="ir_2t [...]
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
 Generated by &#160;<a href="http://www.doxygen.org/index.html">
diff --git a/docs/reference/api/doxygen/dir_000018_000008.html b/docs/reference/api/doxygen/dir_000021_000008.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000018_000008.html
rename to docs/reference/api/doxygen/dir_000021_000008.html
diff --git a/docs/reference/api/doxygen/dir_000022_000009.html b/docs/reference/api/doxygen/dir_000022_000009.html
index 507155aaa2..467d80f875 100644
--- a/docs/reference/api/doxygen/dir_000022_000009.html
+++ b/docs/reference/api/doxygen/dir_000022_000009.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=9"/>
 <meta name="generator" content="Doxygen 1.8.13"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/runtime/contrib -&gt; container Relation</title>
+<title>tvm: include/tvm/runtime/vm -&gt; container Relation</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -58,11 +58,11 @@ $(function() {
 
 <div id="nav-path" class="navpath">
   <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_404558507ed35459f0d7a6d81d8c508d.html">runtime</a></li><li class="navelem"><a class="el" href="dir_536029070df27a3ee03a4230630922c5.html">contrib</a></li>  </ul>
+<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_404558507ed35459f0d7a6d81d8c508d.html">runtime</a></li><li class="navelem"><a class="el" href="dir_5baffeed82c1190bfdf7a4f918ab5ac6.html">vm</a></li>  </ul>
 </div>
 </div><!-- top -->
 <div class="contents">
-<h3>contrib &rarr; container Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/runtime/contrib</th><th class="dirtab">Includes file in include/tvm/runtime/container</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="papi_8h.html">papi.h</a></td><td class="dirtab"><a class="el" href="array_8h.html">array.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="papi_8h.html">papi.h</a></td><td class="dirtab"><a clas [...]
+<h3>vm &rarr; container Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/runtime/vm</th><th class="dirtab">Includes file in include/tvm/runtime/container</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="executable_8h.html">executable.h</a></td><td class="dirtab"><a class="el" href="map_8h.html">map.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="executable_8h.html">executable.h</a></td><td class="dirta [...]
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
 Generated by &#160;<a href="http://www.doxygen.org/index.html">
diff --git a/docs/reference/api/doxygen/dir_000020_000007.html b/docs/reference/api/doxygen/dir_000023_000007.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000020_000007.html
rename to docs/reference/api/doxygen/dir_000023_000007.html
diff --git a/docs/reference/api/doxygen/dir_000020_000008.html b/docs/reference/api/doxygen/dir_000023_000008.html
similarity index 100%
copy from docs/reference/api/doxygen/dir_000020_000008.html
copy to docs/reference/api/doxygen/dir_000023_000008.html
diff --git a/docs/reference/api/doxygen/dir_000020_000011.html b/docs/reference/api/doxygen/dir_000023_000011.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000020_000011.html
rename to docs/reference/api/doxygen/dir_000023_000011.html
diff --git a/docs/reference/api/doxygen/dir_000020_000013.html b/docs/reference/api/doxygen/dir_000023_000013.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000020_000013.html
rename to docs/reference/api/doxygen/dir_000023_000013.html
diff --git a/docs/reference/api/doxygen/dir_000020_000017.html b/docs/reference/api/doxygen/dir_000023_000020.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000020_000017.html
rename to docs/reference/api/doxygen/dir_000023_000020.html
diff --git a/docs/reference/api/doxygen/dir_000021_000007.html b/docs/reference/api/doxygen/dir_000024_000007.html
similarity index 100%
copy from docs/reference/api/doxygen/dir_000021_000007.html
copy to docs/reference/api/doxygen/dir_000024_000007.html
diff --git a/docs/reference/api/doxygen/dir_000021_000011.html b/docs/reference/api/doxygen/dir_000024_000011.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000021_000011.html
rename to docs/reference/api/doxygen/dir_000024_000011.html
diff --git a/docs/reference/api/doxygen/dir_000021_000017.html b/docs/reference/api/doxygen/dir_000024_000020.html
similarity index 100%
copy from docs/reference/api/doxygen/dir_000021_000017.html
copy to docs/reference/api/doxygen/dir_000024_000020.html
diff --git a/docs/reference/api/doxygen/dir_000022_000009.html b/docs/reference/api/doxygen/dir_000025_000009.html
similarity index 100%
copy from docs/reference/api/doxygen/dir_000022_000009.html
copy to docs/reference/api/doxygen/dir_000025_000009.html
diff --git a/docs/reference/api/doxygen/dir_000028_000007.html b/docs/reference/api/doxygen/dir_000028_000007.html
deleted file mode 100644
index 9c4cd2bd32..0000000000
--- a/docs/reference/api/doxygen/dir_000028_000007.html
+++ /dev/null
@@ -1,73 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head>
-<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
-<meta http-equiv="X-UA-Compatible" content="IE=9"/>
-<meta name="generator" content="Doxygen 1.8.13"/>
-<meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/script/ir_builder/ir -&gt; ir Relation</title>
-<link href="tabs.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="jquery.js"></script>
-<script type="text/javascript" src="dynsections.js"></script>
-<link href="search/search.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="search/searchdata.js"></script>
-<script type="text/javascript" src="search/search.js"></script>
-<link href="doxygen.css" rel="stylesheet" type="text/css" />
-</head>
-<body>
-<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
-<div id="titlearea">
-<table cellspacing="0" cellpadding="0">
- <tbody>
- <tr style="height: 56px;">
-  <td id="projectalign" style="padding-left: 0.5em;">
-   <div id="projectname">tvm
-   </div>
-  </td>
- </tr>
- </tbody>
-</table>
-</div>
-<!-- end header part -->
-<!-- Generated by Doxygen 1.8.13 -->
-<script type="text/javascript">
-var searchBox = new SearchBox("searchBox", "search",false,'Search');
-</script>
-<script type="text/javascript" src="menudata.js"></script>
-<script type="text/javascript" src="menu.js"></script>
-<script type="text/javascript">
-$(function() {
-  initMenu('',true,false,'search.php','Search');
-  $(document).ready(function() { init_search(); });
-});
-</script>
-<div id="main-nav"></div>
-<!-- window showing the filter options -->
-<div id="MSearchSelectWindow"
-     onmouseover="return searchBox.OnSearchSelectShow()"
-     onmouseout="return searchBox.OnSearchSelectHide()"
-     onkeydown="return searchBox.OnSearchSelectKey(event)">
-</div>
-
-<!-- iframe showing the search results (closed by default) -->
-<div id="MSearchResultsWindow">
-<iframe src="javascript:void(0)" frameborder="0" 
-        name="MSearchResults" id="MSearchResults">
-</iframe>
-</div>
-
-<div id="nav-path" class="navpath">
-  <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_84875704194fd544d29fe0c7fedd8939.html">script</a></li><li class="navelem"><a class="el" href="dir_9e615ec4a59e46584bcc4e2226e148a2.html">ir_builder</a></li><li class="navelem"><a class="el" href="dir_e4a1a856a30057b9b1543256279fc7a1.html">ir</a></li>  </ul>
-</div>
-</div><!-- top -->
-<div class="contents">
-<h3>ir &rarr; ir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/script/ir_builder/ir</th><th class="dirtab">Includes file in include/tvm/ir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="ir__builder_2ir_2frame_8h.html">frame.h</a></td><td class="dirtab"><a class="el" href="ir_2expr_8h.html">expr.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="ir__builder_2ir_2frame_8h.html">frame.h</a></td><td clas [...]
-<!-- start footer part -->
-<hr class="footer"/><address class="footer"><small>
-Generated by &#160;<a href="http://www.doxygen.org/index.html">
-<img class="footer" src="doxygen.png" alt="doxygen"/>
-</a> 1.8.13
-</small></address>
-</body>
-</html>
diff --git a/docs/reference/api/doxygen/dir_000029_000011.html b/docs/reference/api/doxygen/dir_000029_000011.html
deleted file mode 100644
index c5dd4b458c..0000000000
--- a/docs/reference/api/doxygen/dir_000029_000011.html
+++ /dev/null
@@ -1,73 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head>
-<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
-<meta http-equiv="X-UA-Compatible" content="IE=9"/>
-<meta name="generator" content="Doxygen 1.8.13"/>
-<meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/script/ir_builder/tir -&gt; tir Relation</title>
-<link href="tabs.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="jquery.js"></script>
-<script type="text/javascript" src="dynsections.js"></script>
-<link href="search/search.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="search/searchdata.js"></script>
-<script type="text/javascript" src="search/search.js"></script>
-<link href="doxygen.css" rel="stylesheet" type="text/css" />
-</head>
-<body>
-<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
-<div id="titlearea">
-<table cellspacing="0" cellpadding="0">
- <tbody>
- <tr style="height: 56px;">
-  <td id="projectalign" style="padding-left: 0.5em;">
-   <div id="projectname">tvm
-   </div>
-  </td>
- </tr>
- </tbody>
-</table>
-</div>
-<!-- end header part -->
-<!-- Generated by Doxygen 1.8.13 -->
-<script type="text/javascript">
-var searchBox = new SearchBox("searchBox", "search",false,'Search');
-</script>
-<script type="text/javascript" src="menudata.js"></script>
-<script type="text/javascript" src="menu.js"></script>
-<script type="text/javascript">
-$(function() {
-  initMenu('',true,false,'search.php','Search');
-  $(document).ready(function() { init_search(); });
-});
-</script>
-<div id="main-nav"></div>
-<!-- window showing the filter options -->
-<div id="MSearchSelectWindow"
-     onmouseover="return searchBox.OnSearchSelectShow()"
-     onmouseout="return searchBox.OnSearchSelectHide()"
-     onkeydown="return searchBox.OnSearchSelectKey(event)">
-</div>
-
-<!-- iframe showing the search results (closed by default) -->
-<div id="MSearchResultsWindow">
-<iframe src="javascript:void(0)" frameborder="0" 
-        name="MSearchResults" id="MSearchResults">
-</iframe>
-</div>
-
-<div id="nav-path" class="navpath">
-  <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_84875704194fd544d29fe0c7fedd8939.html">script</a></li><li class="navelem"><a class="el" href="dir_9e615ec4a59e46584bcc4e2226e148a2.html">ir_builder</a></li><li class="navelem"><a class="el" href="dir_67fdee7a5e0396034822418fa5baa4b4.html">tir</a></li>  </ul>
-</div>
-</div><!-- top -->
-<div class="contents">
-<h3>tir &rarr; tir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/script/ir_builder/tir</th><th class="dirtab">Includes file in include/tvm/tir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="ir__builder_2tir_2frame_8h.html">frame.h</a></td><td class="dirtab"><a class="el" href="stmt_8h.html">stmt.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="tir_2ir_8h.html">ir.h</a></td><td class="dirtab"><a cla [...]
-<!-- start footer part -->
-<hr class="footer"/><address class="footer"><small>
-Generated by &#160;<a href="http://www.doxygen.org/index.html">
-<img class="footer" src="doxygen.png" alt="doxygen"/>
-</a> 1.8.13
-</small></address>
-</body>
-</html>
diff --git a/docs/reference/api/doxygen/dir_000030_000007.html b/docs/reference/api/doxygen/dir_000030_000007.html
deleted file mode 100644
index 77bf207f04..0000000000
--- a/docs/reference/api/doxygen/dir_000030_000007.html
+++ /dev/null
@@ -1,73 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head>
-<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
-<meta http-equiv="X-UA-Compatible" content="IE=9"/>
-<meta name="generator" content="Doxygen 1.8.13"/>
-<meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/script/printer -&gt; ir Relation</title>
-<link href="tabs.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="jquery.js"></script>
-<script type="text/javascript" src="dynsections.js"></script>
-<link href="search/search.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="search/searchdata.js"></script>
-<script type="text/javascript" src="search/search.js"></script>
-<link href="doxygen.css" rel="stylesheet" type="text/css" />
-</head>
-<body>
-<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
-<div id="titlearea">
-<table cellspacing="0" cellpadding="0">
- <tbody>
- <tr style="height: 56px;">
-  <td id="projectalign" style="padding-left: 0.5em;">
-   <div id="projectname">tvm
-   </div>
-  </td>
- </tr>
- </tbody>
-</table>
-</div>
-<!-- end header part -->
-<!-- Generated by Doxygen 1.8.13 -->
-<script type="text/javascript">
-var searchBox = new SearchBox("searchBox", "search",false,'Search');
-</script>
-<script type="text/javascript" src="menudata.js"></script>
-<script type="text/javascript" src="menu.js"></script>
-<script type="text/javascript">
-$(function() {
-  initMenu('',true,false,'search.php','Search');
-  $(document).ready(function() { init_search(); });
-});
-</script>
-<div id="main-nav"></div>
-<!-- window showing the filter options -->
-<div id="MSearchSelectWindow"
-     onmouseover="return searchBox.OnSearchSelectShow()"
-     onmouseout="return searchBox.OnSearchSelectHide()"
-     onkeydown="return searchBox.OnSearchSelectKey(event)">
-</div>
-
-<!-- iframe showing the search results (closed by default) -->
-<div id="MSearchResultsWindow">
-<iframe src="javascript:void(0)" frameborder="0" 
-        name="MSearchResults" id="MSearchResults">
-</iframe>
-</div>
-
-<div id="nav-path" class="navpath">
-  <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_84875704194fd544d29fe0c7fedd8939.html">script</a></li><li class="navelem"><a class="el" href="dir_a59a89c7dd2e4e6561fe59bf359ce2f3.html">printer</a></li>  </ul>
-</div>
-</div><!-- top -->
-<div class="contents">
-<h3>printer &rarr; ir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/script/printer</th><th class="dirtab">Includes file in include/tvm/ir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="doc_8h.html">doc.h</a></td><td class="dirtab"><a class="el" href="ir_2expr_8h.html">expr.h</a></td></tr></table></div><!-- contents -->
-<!-- start footer part -->
-<hr class="footer"/><address class="footer"><small>
-Generated by &#160;<a href="http://www.doxygen.org/index.html">
-<img class="footer" src="doxygen.png" alt="doxygen"/>
-</a> 1.8.13
-</small></address>
-</body>
-</html>
diff --git a/docs/reference/api/doxygen/dir_000030_000013.html b/docs/reference/api/doxygen/dir_000030_000013.html
deleted file mode 100644
index 505fd2d1a4..0000000000
--- a/docs/reference/api/doxygen/dir_000030_000013.html
+++ /dev/null
@@ -1,73 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head>
-<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
-<meta http-equiv="X-UA-Compatible" content="IE=9"/>
-<meta name="generator" content="Doxygen 1.8.13"/>
-<meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/script/printer -&gt; support Relation</title>
-<link href="tabs.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="jquery.js"></script>
-<script type="text/javascript" src="dynsections.js"></script>
-<link href="search/search.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="search/searchdata.js"></script>
-<script type="text/javascript" src="search/search.js"></script>
-<link href="doxygen.css" rel="stylesheet" type="text/css" />
-</head>
-<body>
-<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
-<div id="titlearea">
-<table cellspacing="0" cellpadding="0">
- <tbody>
- <tr style="height: 56px;">
-  <td id="projectalign" style="padding-left: 0.5em;">
-   <div id="projectname">tvm
-   </div>
-  </td>
- </tr>
- </tbody>
-</table>
-</div>
-<!-- end header part -->
-<!-- Generated by Doxygen 1.8.13 -->
-<script type="text/javascript">
-var searchBox = new SearchBox("searchBox", "search",false,'Search');
-</script>
-<script type="text/javascript" src="menudata.js"></script>
-<script type="text/javascript" src="menu.js"></script>
-<script type="text/javascript">
-$(function() {
-  initMenu('',true,false,'search.php','Search');
-  $(document).ready(function() { init_search(); });
-});
-</script>
-<div id="main-nav"></div>
-<!-- window showing the filter options -->
-<div id="MSearchSelectWindow"
-     onmouseover="return searchBox.OnSearchSelectShow()"
-     onmouseout="return searchBox.OnSearchSelectHide()"
-     onkeydown="return searchBox.OnSearchSelectKey(event)">
-</div>
-
-<!-- iframe showing the search results (closed by default) -->
-<div id="MSearchResultsWindow">
-<iframe src="javascript:void(0)" frameborder="0" 
-        name="MSearchResults" id="MSearchResults">
-</iframe>
-</div>
-
-<div id="nav-path" class="navpath">
-  <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_84875704194fd544d29fe0c7fedd8939.html">script</a></li><li class="navelem"><a class="el" href="dir_a59a89c7dd2e4e6561fe59bf359ce2f3.html">printer</a></li>  </ul>
-</div>
-</div><!-- top -->
-<div class="contents">
-<h3>printer &rarr; support Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/script/printer</th><th class="dirtab">Includes file in include/tvm/support</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="ir__docsifier_8h.html">ir_docsifier.h</a></td><td class="dirtab"><a class="el" href="with_8h.html">with.h</a></td></tr></table></div><!-- contents -->
-<!-- start footer part -->
-<hr class="footer"/><address class="footer"><small>
-Generated by &#160;<a href="http://www.doxygen.org/index.html">
-<img class="footer" src="doxygen.png" alt="doxygen"/>
-</a> 1.8.13
-</small></address>
-</body>
-</html>
diff --git a/docs/reference/api/doxygen/dir_000031_000007.html b/docs/reference/api/doxygen/dir_000031_000007.html
index 9c631dac41..9c4cd2bd32 100644
--- a/docs/reference/api/doxygen/dir_000031_000007.html
+++ b/docs/reference/api/doxygen/dir_000031_000007.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=9"/>
 <meta name="generator" content="Doxygen 1.8.13"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/target -&gt; ir Relation</title>
+<title>tvm: include/tvm/script/ir_builder/ir -&gt; ir Relation</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -58,11 +58,11 @@ $(function() {
 
 <div id="nav-path" class="navpath">
   <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_5da96592f3a7c442b838b075c58254c2.html">target</a></li>  </ul>
+<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_84875704194fd544d29fe0c7fedd8939.html">script</a></li><li class="navelem"><a class="el" href="dir_9e615ec4a59e46584bcc4e2226e148a2.html">ir_builder</a></li><li class="navelem"><a class="el" href="dir_e4a1a856a30057b9b1543256279fc7a1.html">ir</a></li>  </ul>
 </div>
 </div><!-- top -->
 <div class="contents">
-<h3>target &rarr; ir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/target</th><th class="dirtab">Includes file in include/tvm/ir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="codegen_8h.html">codegen.h</a></td><td class="dirtab"><a class="el" href="ir_2module_8h.html">module.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="target_8h.html">target.h</a></td><td class="dirtab"><a class="el" href="ir_ [...]
+<h3>ir &rarr; ir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/script/ir_builder/ir</th><th class="dirtab">Includes file in include/tvm/ir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="ir__builder_2ir_2frame_8h.html">frame.h</a></td><td class="dirtab"><a class="el" href="ir_2expr_8h.html">expr.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="ir__builder_2ir_2frame_8h.html">frame.h</a></td><td clas [...]
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
 Generated by &#160;<a href="http://www.doxygen.org/index.html">
diff --git a/docs/reference/api/doxygen/dir_000028_000017.html b/docs/reference/api/doxygen/dir_000031_000020.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000028_000017.html
rename to docs/reference/api/doxygen/dir_000031_000020.html
diff --git a/docs/reference/api/doxygen/dir_000032_000002.html b/docs/reference/api/doxygen/dir_000032_000002.html
deleted file mode 100644
index 25d0a0947c..0000000000
--- a/docs/reference/api/doxygen/dir_000032_000002.html
+++ /dev/null
@@ -1,73 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head>
-<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
-<meta http-equiv="X-UA-Compatible" content="IE=9"/>
-<meta name="generator" content="Doxygen 1.8.13"/>
-<meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/te -&gt; arith Relation</title>
-<link href="tabs.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="jquery.js"></script>
-<script type="text/javascript" src="dynsections.js"></script>
-<link href="search/search.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="search/searchdata.js"></script>
-<script type="text/javascript" src="search/search.js"></script>
-<link href="doxygen.css" rel="stylesheet" type="text/css" />
-</head>
-<body>
-<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
-<div id="titlearea">
-<table cellspacing="0" cellpadding="0">
- <tbody>
- <tr style="height: 56px;">
-  <td id="projectalign" style="padding-left: 0.5em;">
-   <div id="projectname">tvm
-   </div>
-  </td>
- </tr>
- </tbody>
-</table>
-</div>
-<!-- end header part -->
-<!-- Generated by Doxygen 1.8.13 -->
-<script type="text/javascript">
-var searchBox = new SearchBox("searchBox", "search",false,'Search');
-</script>
-<script type="text/javascript" src="menudata.js"></script>
-<script type="text/javascript" src="menu.js"></script>
-<script type="text/javascript">
-$(function() {
-  initMenu('',true,false,'search.php','Search');
-  $(document).ready(function() { init_search(); });
-});
-</script>
-<div id="main-nav"></div>
-<!-- window showing the filter options -->
-<div id="MSearchSelectWindow"
-     onmouseover="return searchBox.OnSearchSelectShow()"
-     onmouseout="return searchBox.OnSearchSelectHide()"
-     onkeydown="return searchBox.OnSearchSelectKey(event)">
-</div>
-
-<!-- iframe showing the search results (closed by default) -->
-<div id="MSearchResultsWindow">
-<iframe src="javascript:void(0)" frameborder="0" 
-        name="MSearchResults" id="MSearchResults">
-</iframe>
-</div>
-
-<div id="nav-path" class="navpath">
-  <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_f97d855a3173728370e632aa77170e34.html">te</a></li>  </ul>
-</div>
-</div><!-- top -->
-<div class="contents">
-<h3>te &rarr; arith Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/te</th><th class="dirtab">Includes file in include/tvm/arith</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="operation_8h.html">operation.h</a></td><td class="dirtab"><a class="el" href="analyzer_8h.html">analyzer.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="tensor_8h.html">tensor.h</a></td><td class="dirtab"><a class="el" href="b [...]
-<!-- start footer part -->
-<hr class="footer"/><address class="footer"><small>
-Generated by &#160;<a href="http://www.doxygen.org/index.html">
-<img class="footer" src="doxygen.png" alt="doxygen"/>
-</a> 1.8.13
-</small></address>
-</body>
-</html>
diff --git a/docs/reference/api/doxygen/dir_000032_000011.html b/docs/reference/api/doxygen/dir_000032_000011.html
index 38974afb8a..c5dd4b458c 100644
--- a/docs/reference/api/doxygen/dir_000032_000011.html
+++ b/docs/reference/api/doxygen/dir_000032_000011.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=9"/>
 <meta name="generator" content="Doxygen 1.8.13"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/te -&gt; tir Relation</title>
+<title>tvm: include/tvm/script/ir_builder/tir -&gt; tir Relation</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -58,11 +58,11 @@ $(function() {
 
 <div id="nav-path" class="navpath">
   <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_f97d855a3173728370e632aa77170e34.html">te</a></li>  </ul>
+<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_84875704194fd544d29fe0c7fedd8939.html">script</a></li><li class="navelem"><a class="el" href="dir_9e615ec4a59e46584bcc4e2226e148a2.html">ir_builder</a></li><li class="navelem"><a class="el" href="dir_67fdee7a5e0396034822418fa5baa4b4.html">tir</a></li>  </ul>
 </div>
 </div><!-- top -->
 <div class="contents">
-<h3>te &rarr; tir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/te</th><th class="dirtab">Includes file in include/tvm/tir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="autodiff_8h.html">autodiff.h</a></td><td class="dirtab"><a class="el" href="tir_2expr_8h.html">expr.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="operation_8h.html">operation.h</a></td><td class="dirtab"><a class="el" href="buff [...]
+<h3>tir &rarr; tir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/script/ir_builder/tir</th><th class="dirtab">Includes file in include/tvm/tir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="ir__builder_2tir_2frame_8h.html">frame.h</a></td><td class="dirtab"><a class="el" href="stmt_8h.html">stmt.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="tir_2ir_8h.html">ir.h</a></td><td class="dirtab"><a cla [...]
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
 Generated by &#160;<a href="http://www.doxygen.org/index.html">
diff --git a/docs/reference/api/doxygen/dir_000029_000028.html b/docs/reference/api/doxygen/dir_000032_000031.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000029_000028.html
rename to docs/reference/api/doxygen/dir_000032_000031.html
diff --git a/docs/reference/api/doxygen/dir_000033_000007.html b/docs/reference/api/doxygen/dir_000033_000007.html
index 91b24f88f7..77bf207f04 100644
--- a/docs/reference/api/doxygen/dir_000033_000007.html
+++ b/docs/reference/api/doxygen/dir_000033_000007.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=9"/>
 <meta name="generator" content="Doxygen 1.8.13"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/tir/schedule -&gt; ir Relation</title>
+<title>tvm: include/tvm/script/printer -&gt; ir Relation</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -58,11 +58,11 @@ $(function() {
 
 <div id="nav-path" class="navpath">
   <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_72c2f11201cd7636dc7624de0754daa5.html">tir</a></li><li class="navelem"><a class="el" href="dir_006b1f4ac353a18abb55f74cc4796db6.html">schedule</a></li>  </ul>
+<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_84875704194fd544d29fe0c7fedd8939.html">script</a></li><li class="navelem"><a class="el" href="dir_a59a89c7dd2e4e6561fe59bf359ce2f3.html">printer</a></li>  </ul>
 </div>
 </div><!-- top -->
 <div class="contents">
-<h3>schedule &rarr; ir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/tir/schedule</th><th class="dirtab">Includes file in include/tvm/ir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="state_8h.html">state.h</a></td><td class="dirtab"><a class="el" href="ir_2module_8h.html">module.h</a></td></tr></table></div><!-- contents -->
+<h3>printer &rarr; ir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/script/printer</th><th class="dirtab">Includes file in include/tvm/ir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="doc_8h.html">doc.h</a></td><td class="dirtab"><a class="el" href="ir_2expr_8h.html">expr.h</a></td></tr></table></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
 Generated by &#160;<a href="http://www.doxygen.org/index.html">
diff --git a/docs/reference/api/doxygen/dir_000030_000008.html b/docs/reference/api/doxygen/dir_000033_000008.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000030_000008.html
rename to docs/reference/api/doxygen/dir_000033_000008.html
diff --git a/docs/reference/api/doxygen/dir_000033_000013.html b/docs/reference/api/doxygen/dir_000033_000013.html
index 562658abec..505fd2d1a4 100644
--- a/docs/reference/api/doxygen/dir_000033_000013.html
+++ b/docs/reference/api/doxygen/dir_000033_000013.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=9"/>
 <meta name="generator" content="Doxygen 1.8.13"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/tir/schedule -&gt; support Relation</title>
+<title>tvm: include/tvm/script/printer -&gt; support Relation</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -58,11 +58,11 @@ $(function() {
 
 <div id="nav-path" class="navpath">
   <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_72c2f11201cd7636dc7624de0754daa5.html">tir</a></li><li class="navelem"><a class="el" href="dir_006b1f4ac353a18abb55f74cc4796db6.html">schedule</a></li>  </ul>
+<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_84875704194fd544d29fe0c7fedd8939.html">script</a></li><li class="navelem"><a class="el" href="dir_a59a89c7dd2e4e6561fe59bf359ce2f3.html">printer</a></li>  </ul>
 </div>
 </div><!-- top -->
 <div class="contents">
-<h3>schedule &rarr; support Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/tir/schedule</th><th class="dirtab">Includes file in include/tvm/support</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="tir_2schedule_2schedule_8h.html">schedule.h</a></td><td class="dirtab"><a class="el" href="random__engine_8h.html">random_engine.h</a></td></tr></table></div><!-- contents -->
+<h3>printer &rarr; support Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/script/printer</th><th class="dirtab">Includes file in include/tvm/support</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="ir__docsifier_8h.html">ir_docsifier.h</a></td><td class="dirtab"><a class="el" href="with_8h.html">with.h</a></td></tr></table></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
 Generated by &#160;<a href="http://www.doxygen.org/index.html">
diff --git a/docs/reference/api/doxygen/dir_000030_000017.html b/docs/reference/api/doxygen/dir_000033_000020.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000030_000017.html
rename to docs/reference/api/doxygen/dir_000033_000020.html
diff --git a/docs/reference/api/doxygen/dir_000031_000007.html b/docs/reference/api/doxygen/dir_000034_000007.html
similarity index 100%
copy from docs/reference/api/doxygen/dir_000031_000007.html
copy to docs/reference/api/doxygen/dir_000034_000007.html
diff --git a/docs/reference/api/doxygen/dir_000034_000008.html b/docs/reference/api/doxygen/dir_000034_000008.html
index 0dca21380f..b3cc2f095b 100644
--- a/docs/reference/api/doxygen/dir_000034_000008.html
+++ b/docs/reference/api/doxygen/dir_000034_000008.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=9"/>
 <meta name="generator" content="Doxygen 1.8.13"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/tir/usmp/algo -&gt; runtime Relation</title>
+<title>tvm: include/tvm/target -&gt; runtime Relation</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -58,11 +58,11 @@ $(function() {
 
 <div id="nav-path" class="navpath">
   <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_72c2f11201cd7636dc7624de0754daa5.html">tir</a></li><li class="navelem"><a class="el" href="dir_05ffda4d144d7985f926507abde48dbb.html">usmp</a></li><li class="navelem"><a class="el" href="dir_d4a54fa981698f72ef4cd62f8b9e1a8f.html">algo</a></li>  </ul>
+<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_5da96592f3a7c442b838b075c58254c2.html">target</a></li>  </ul>
 </div>
 </div><!-- top -->
 <div class="contents">
-<h3>algo &rarr; runtime Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/tir/usmp/algo</th><th class="dirtab">Includes file in include/tvm/runtime</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="greedy_8h.html">greedy.h</a></td><td class="dirtab"><a class="el" href="device__api_8h.html">device_api.h</a></td></tr></table></div><!-- contents -->
+<h3>target &rarr; runtime Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/target</th><th class="dirtab">Includes file in include/tvm/runtime</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="codegen_8h.html">codegen.h</a></td><td class="dirtab"><a class="el" href="packed__func_8h.html">packed_func.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="generic__func_8h.html">generic_func.h</a></td><td class="d [...]
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
 Generated by &#160;<a href="http://www.doxygen.org/index.html">
diff --git a/docs/reference/api/doxygen/dir_000031_000011.html b/docs/reference/api/doxygen/dir_000034_000011.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000031_000011.html
rename to docs/reference/api/doxygen/dir_000034_000011.html
diff --git a/docs/reference/api/doxygen/dir_000031_000013.html b/docs/reference/api/doxygen/dir_000034_000013.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000031_000013.html
rename to docs/reference/api/doxygen/dir_000034_000013.html
diff --git a/docs/reference/api/doxygen/dir_000031_000017.html b/docs/reference/api/doxygen/dir_000034_000020.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000031_000017.html
rename to docs/reference/api/doxygen/dir_000034_000020.html
diff --git a/docs/reference/api/doxygen/dir_000035_000002.html b/docs/reference/api/doxygen/dir_000035_000002.html
index d190cb5151..25d0a0947c 100644
--- a/docs/reference/api/doxygen/dir_000035_000002.html
+++ b/docs/reference/api/doxygen/dir_000035_000002.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=9"/>
 <meta name="generator" content="Doxygen 1.8.13"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/topi/detail -&gt; arith Relation</title>
+<title>tvm: include/tvm/te -&gt; arith Relation</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -58,11 +58,11 @@ $(function() {
 
 <div id="nav-path" class="navpath">
   <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_8e4e25e66b8623d88c5b5dd2040bca97.html">topi</a></li><li class="navelem"><a class="el" href="dir_1f1b12d204a071c9e67e47fcbb552b86.html">detail</a></li>  </ul>
+<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_f97d855a3173728370e632aa77170e34.html">te</a></li>  </ul>
 </div>
 </div><!-- top -->
 <div class="contents">
-<h3>detail &rarr; arith Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/topi/detail</th><th class="dirtab">Includes file in include/tvm/arith</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="constant__utils_8h.html">constant_utils.h</a></td><td class="dirtab"><a class="el" href="analyzer_8h.html">analyzer.h</a></td></tr></table></div><!-- contents -->
+<h3>te &rarr; arith Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/te</th><th class="dirtab">Includes file in include/tvm/arith</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="operation_8h.html">operation.h</a></td><td class="dirtab"><a class="el" href="analyzer_8h.html">analyzer.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="tensor_8h.html">tensor.h</a></td><td class="dirtab"><a class="el" href="b [...]
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
 Generated by &#160;<a href="http://www.doxygen.org/index.html">
diff --git a/docs/reference/api/doxygen/dir_000032_000008.html b/docs/reference/api/doxygen/dir_000035_000008.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000032_000008.html
rename to docs/reference/api/doxygen/dir_000035_000008.html
diff --git a/docs/reference/api/doxygen/dir_000035_000011.html b/docs/reference/api/doxygen/dir_000035_000011.html
index 710d4acefd..38974afb8a 100644
--- a/docs/reference/api/doxygen/dir_000035_000011.html
+++ b/docs/reference/api/doxygen/dir_000035_000011.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=9"/>
 <meta name="generator" content="Doxygen 1.8.13"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/topi/detail -&gt; tir Relation</title>
+<title>tvm: include/tvm/te -&gt; tir Relation</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -58,11 +58,11 @@ $(function() {
 
 <div id="nav-path" class="navpath">
   <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_8e4e25e66b8623d88c5b5dd2040bca97.html">topi</a></li><li class="navelem"><a class="el" href="dir_1f1b12d204a071c9e67e47fcbb552b86.html">detail</a></li>  </ul>
+<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_f97d855a3173728370e632aa77170e34.html">te</a></li>  </ul>
 </div>
 </div><!-- top -->
 <div class="contents">
-<h3>detail &rarr; tir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/topi/detail</th><th class="dirtab">Includes file in include/tvm/tir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="constant__utils_8h.html">constant_utils.h</a></td><td class="dirtab"><a class="el" href="tir_2analysis_8h.html">analysis.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="constant__utils_8h.html">constant_utils.h</a></t [...]
+<h3>te &rarr; tir Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/te</th><th class="dirtab">Includes file in include/tvm/tir</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="autodiff_8h.html">autodiff.h</a></td><td class="dirtab"><a class="el" href="tir_2expr_8h.html">expr.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="operation_8h.html">operation.h</a></td><td class="dirtab"><a class="el" href="buff [...]
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
 Generated by &#160;<a href="http://www.doxygen.org/index.html">
diff --git a/docs/reference/api/doxygen/dir_000032_000013.html b/docs/reference/api/doxygen/dir_000035_000013.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000032_000013.html
rename to docs/reference/api/doxygen/dir_000035_000013.html
diff --git a/docs/reference/api/doxygen/dir_000035_000032.html b/docs/reference/api/doxygen/dir_000035_000032.html
deleted file mode 100644
index 355d22fe42..0000000000
--- a/docs/reference/api/doxygen/dir_000035_000032.html
+++ /dev/null
@@ -1,73 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head>
-<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
-<meta http-equiv="X-UA-Compatible" content="IE=9"/>
-<meta name="generator" content="Doxygen 1.8.13"/>
-<meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>tvm: include/tvm/topi/detail -&gt; te Relation</title>
-<link href="tabs.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="jquery.js"></script>
-<script type="text/javascript" src="dynsections.js"></script>
-<link href="search/search.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="search/searchdata.js"></script>
-<script type="text/javascript" src="search/search.js"></script>
-<link href="doxygen.css" rel="stylesheet" type="text/css" />
-</head>
-<body>
-<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
-<div id="titlearea">
-<table cellspacing="0" cellpadding="0">
- <tbody>
- <tr style="height: 56px;">
-  <td id="projectalign" style="padding-left: 0.5em;">
-   <div id="projectname">tvm
-   </div>
-  </td>
- </tr>
- </tbody>
-</table>
-</div>
-<!-- end header part -->
-<!-- Generated by Doxygen 1.8.13 -->
-<script type="text/javascript">
-var searchBox = new SearchBox("searchBox", "search",false,'Search');
-</script>
-<script type="text/javascript" src="menudata.js"></script>
-<script type="text/javascript" src="menu.js"></script>
-<script type="text/javascript">
-$(function() {
-  initMenu('',true,false,'search.php','Search');
-  $(document).ready(function() { init_search(); });
-});
-</script>
-<div id="main-nav"></div>
-<!-- window showing the filter options -->
-<div id="MSearchSelectWindow"
-     onmouseover="return searchBox.OnSearchSelectShow()"
-     onmouseout="return searchBox.OnSearchSelectHide()"
-     onkeydown="return searchBox.OnSearchSelectKey(event)">
-</div>
-
-<!-- iframe showing the search results (closed by default) -->
-<div id="MSearchResultsWindow">
-<iframe src="javascript:void(0)" frameborder="0" 
-        name="MSearchResults" id="MSearchResults">
-</iframe>
-</div>
-
-<div id="nav-path" class="navpath">
-  <ul>
-<li class="navelem"><a class="el" href="dir_d44c64559bbebec7f509842c48db8b23.html">include</a></li><li class="navelem"><a class="el" href="dir_b4c7d8e826c599ba55146c099a14beb5.html">tvm</a></li><li class="navelem"><a class="el" href="dir_8e4e25e66b8623d88c5b5dd2040bca97.html">topi</a></li><li class="navelem"><a class="el" href="dir_1f1b12d204a071c9e67e47fcbb552b86.html">detail</a></li>  </ul>
-</div>
-</div><!-- top -->
-<div class="contents">
-<h3>detail &rarr; te Relation</h3><table class="dirtab"><tr class="dirtab"><th class="dirtab">File in include/tvm/topi/detail</th><th class="dirtab">Includes file in include/tvm/te</th></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="array__utils_8h.html">array_utils.h</a></td><td class="dirtab"><a class="el" href="operation_8h.html">operation.h</a></td></tr><tr class="dirtab"><td class="dirtab"><a class="el" href="detail_2broadcast_8h.html">broadcast.h</a></td><td class="d [...]
-<!-- start footer part -->
-<hr class="footer"/><address class="footer"><small>
-Generated by &#160;<a href="http://www.doxygen.org/index.html">
-<img class="footer" src="doxygen.png" alt="doxygen"/>
-</a> 1.8.13
-</small></address>
-</body>
-</html>
diff --git a/docs/reference/api/doxygen/dir_000033_000007.html b/docs/reference/api/doxygen/dir_000036_000007.html
similarity index 100%
copy from docs/reference/api/doxygen/dir_000033_000007.html
copy to docs/reference/api/doxygen/dir_000036_000007.html
diff --git a/docs/reference/api/doxygen/dir_000033_000013.html b/docs/reference/api/doxygen/dir_000036_000013.html
similarity index 100%
copy from docs/reference/api/doxygen/dir_000033_000013.html
copy to docs/reference/api/doxygen/dir_000036_000013.html
diff --git a/docs/reference/api/doxygen/dir_000033_000017.html b/docs/reference/api/doxygen/dir_000036_000020.html
similarity index 100%
rename from docs/reference/api/doxygen/dir_000033_000017.html
rename to docs/reference/api/doxygen/dir_000036_000020.html
diff --git a/docs/reference/api/doxygen/dir_000036_000032.html b/docs/reference/api/doxygen/dir_000036_000032.html
deleted file mode 100644
index 23a68bdc3d..0000000000
... 37895 lines suppressed ...