You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by lm...@apache.org on 2021/03/01 13:14:21 UTC

[tvm-site] branch asf-site updated: Docs build at Mon Mar 1 05:14:05 PST 2021

This is an automated email from the ASF dual-hosted git repository.

lmzheng pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/tvm-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
     new 90f0d5d  Docs build at Mon Mar  1 05:14:05 PST 2021
90f0d5d is described below

commit 90f0d5d35855b144a3866c1509704ad7e09b6588
Author: Lianmin Zheng <li...@gmail.com>
AuthorDate: Mon Mar 1 05:14:06 2021 -0800

    Docs build at Mon Mar  1 05:14:05 PST 2021
---
 .../from_tflite.py                                 |    2 +-
 .../micro_reference_vm.ipynb                       |    2 +-
 .../from_tflite.ipynb                              |    2 +-
 .../tune_network_mali.ipynb                        |    2 +-
 .../tune_network_cuda.py                           |    2 +-
 .../deploy_sparse.py                               |   20 +-
 .../tune_relay_vta.ipynb                           |    2 +-
 .../tune_relay_vta.py                              |    1 +
 .../schedule_primitives.ipynb                      |   12 +-
 .../micro_reference_vm.py                          |   27 +-
 .../tune_network_mali.py                           |    2 +-
 .../deploy_ssd_gluoncv.ipynb                       |    2 +-
 .../deploy_sparse.ipynb                            |    4 +-
 .../schedule_primitives.py                         |   14 +-
 .../low_level_custom_pass.py                       |    4 +-
 .../tune_network_x86.ipynb                         |    2 +-
 .../tune_network_x86.py                            |    2 +-
 .../tune_network_arm.ipynb                         |  161 +
 .../deploy_ssd_gluoncv.py                          |    4 +
 .../micro_tflite.ipynb                             |    4 +-
 .../tune_network_cuda.ipynb                        |    2 +-
 .../low_level_custom_pass.ipynb                    |    2 +-
 .../tune_network_arm.py}                           |  285 +-
 .../micro_tflite.py                                |   21 +-
 docs/_images/sphx_glr_tune_network_arm_thumb.png   |  Bin 0 -> 26786 bytes
 docs/_sources/deploy/vitis_ai.rst.txt              |   95 +-
 docs/_sources/langref/relay_pattern.rst.txt        |   29 +
 .../auto_scheduler/sg_execution_times.rst.txt      |   15 +-
 .../auto_scheduler/tune_conv2d_layer_cuda.rst.txt  | 1361 ++++-
 .../auto_scheduler/tune_matmul_x86.rst.txt         |    6 +-
 ...twork_mali.rst.txt => tune_network_arm.rst.txt} |  339 +-
 .../auto_scheduler/tune_network_cuda.rst.txt       |   52 +-
 .../auto_scheduler/tune_network_mali.rst.txt       |   48 +-
 .../auto_scheduler/tune_network_x86.rst.txt        |   62 +-
 .../tutorials/autotvm/sg_execution_times.rst.txt   |   12 +-
 .../tutorials/autotvm/tune_conv2d_cuda.rst.txt     |   44 +-
 .../tutorials/autotvm/tune_simple_template.rst.txt |   20 +-
 .../tutorials/dev/bring_your_own_datatypes.rst.txt |   10 +-
 .../tutorials/dev/low_level_custom_pass.rst.txt    |   12 +-
 .../tutorials/dev/sg_execution_times.rst.txt       |    8 +-
 .../frontend/deploy_model_on_android.rst.txt       |    2 +-
 .../deploy_object_detection_pytorch.rst.txt        |    2 +-
 .../tutorials/frontend/deploy_prequantized.rst.txt |    2 +-
 .../frontend/deploy_prequantized_tflite.rst.txt    |    4 +-
 .../tutorials/frontend/deploy_sparse.rst.txt       |   20 +-
 .../tutorials/frontend/deploy_ssd_gluoncv.rst.txt  |    6 +-
 docs/_sources/tutorials/frontend/from_onnx.rst.txt |    2 +-
 .../tutorials/frontend/from_pytorch.rst.txt        |    5 +-
 .../tutorials/frontend/from_tensorflow.rst.txt     |    7 +-
 .../tutorials/frontend/from_tflite.rst.txt         |    2 +-
 .../tutorials/frontend/sg_execution_times.rst.txt  |   40 +-
 .../get_started/cross_compilation_and_rpc.rst.txt  |    2 +-
 .../get_started/relay_quick_start.rst.txt          |    2 +-
 .../get_started/sg_execution_times.rst.txt         |   10 +-
 .../get_started/tensor_expr_get_started.rst.txt    |   16 +-
 docs/_sources/tutorials/index.rst.txt              |   20 +
 .../tutorials/language/intrin_math.rst.txt         |   56 +
 docs/_sources/tutorials/language/reduction.rst.txt |   14 +
 .../tutorials/language/schedule_primitives.rst.txt |   28 +-
 .../tutorials/language/sg_execution_times.rst.txt  |   18 +-
 docs/_sources/tutorials/language/tensorize.rst.txt |    8 +-
 .../tutorials/language/tuple_inputs.rst.txt        |   20 +-
 .../tutorials/micro/micro_reference_vm.rst.txt     |   27 +-
 docs/_sources/tutorials/micro/micro_tflite.rst.txt |   21 +-
 .../tutorials/micro/sg_execution_times.rst.txt     |    4 +-
 .../tutorials/optimize/opt_conv_cuda.rst.txt       |    2 +-
 .../tutorials/optimize/opt_conv_tensorcore.rst.txt |    2 +-
 docs/_sources/tutorials/optimize/opt_gemm.rst.txt  |   20 +-
 .../tutorials/optimize/sg_execution_times.rst.txt  |   10 +-
 docs/_sources/tutorials/topi/intro_topi.rst.txt    |    2 +-
 .../tutorials/topi/sg_execution_times.rst.txt      |    4 +-
 .../tutorials/autotvm/sg_execution_times.rst.txt   |    4 +-
 .../vta/tutorials/autotvm/tune_relay_vta.rst.txt   |    3 +-
 .../frontend/deploy_classification.rst.txt         |    4 +-
 .../tutorials/frontend/sg_execution_times.rst.txt  |    4 +-
 .../tutorials/optimize/matrix_multiply_opt.rst.txt |    4 +-
 .../tutorials/optimize/sg_execution_times.rst.txt  |    6 +-
 .../vta/tutorials/sg_execution_times.rst.txt       |    6 +-
 .../_sources/vta/tutorials/vta_get_started.rst.txt |    4 +-
 docs/api/doxygen/algorithm_8h.html                 |    2 +-
 docs/api/doxygen/algorithm_8h__incl.svg            | 1467 ++---
 docs/api/doxygen/analyzer_8h.html                  |    4 +-
 docs/api/doxygen/analyzer_8h__dep__incl.svg        | 1014 ++--
 docs/api/doxygen/analyzer_8h__incl.svg             | 1079 ++--
 docs/api/doxygen/annotated.html                    |  339 +-
 docs/api/doxygen/annotation_8h.html                |    2 +-
 docs/api/doxygen/annotation_8h__incl.svg           |  941 ++--
 docs/api/doxygen/attr__registry__map_8h.html       |    4 +-
 .../doxygen/attr__registry__map_8h__dep__incl.svg  |  649 +--
 docs/api/doxygen/attr__registry__map_8h__incl.svg  |  491 +-
 .../api/doxygen/attr__registry__map_8h_source.html |    2 +-
 docs/api/doxygen/auto__schedule_8h.html            |    2 +-
 docs/api/doxygen/auto__schedule_8h__incl.svg       | 1175 ++--
 docs/api/doxygen/autodiff_8h.html                  |    2 +-
 docs/api/doxygen/autodiff_8h__incl.svg             | 1283 ++---
 docs/api/doxygen/base_8h.html                      |    2 +-
 docs/api/doxygen/base_8h__incl.svg                 | 1033 ++--
 docs/api/doxygen/bias__add_8h.html                 |    2 +-
 docs/api/doxygen/bias__add_8h__incl.svg            | 1055 ++--
 docs/api/doxygen/bitserial_8h.html                 |    2 +-
 docs/api/doxygen/bitserial_8h__incl.svg            | 1125 ++--
 docs/api/doxygen/bound_8h.html                     |    4 +-
 docs/api/doxygen/bound_8h__dep__incl.svg           | 1006 ++--
 docs/api/doxygen/bound_8h__incl.svg                | 1091 ++--
 docs/api/doxygen/buffer_8h.html                    |    4 +-
 docs/api/doxygen/buffer_8h__dep__incl.svg          | 1071 ++--
 docs/api/doxygen/buffer_8h__incl.svg               |  905 +--
 docs/api/doxygen/builtin_8h.html                   |    2 +-
 docs/api/doxygen/builtin_8h__incl.svg              | 1299 ++---
 docs/api/doxygen/c__runtime__api_8h.html           |   89 +-
 docs/api/doxygen/c__runtime__api_8h__dep__incl.svg |  855 +--
 docs/api/doxygen/c__runtime__api_8h_source.html    |    5 +-
 docs/api/doxygen/classes.html                      |  337 +-
 docs/api/doxygen/classtvm_1_1BaseAttrsNode.html    |   12 +-
 docs/api/doxygen/classtvm_1_1Target-members.html   |    7 +-
 docs/api/doxygen/classtvm_1_1Target.html           |   55 +-
 .../classtvm_1_1TargetKindRegEntry-members.html    |   15 +-
 .../doxygen/classtvm_1_1TargetKindRegEntry.html    |   31 +-
 ...classtvm_1_1TargetKindRegEntry__coll__graph.svg |   31 +-
 .../doxygen/classtvm_1_1TargetNode-members.html    |   15 +-
 docs/api/doxygen/classtvm_1_1TargetNode.html       |   21 +-
 .../classtvm_1_1TargetNode__coll__graph.svg        |  333 +-
 .../classtvm_1_1TargetNode__inherit__graph.svg     |   33 +-
 .../doxygen/classtvm_1_1Target__coll__graph.svg    |   35 +-
 .../doxygen/classtvm_1_1Target__inherit__graph.svg |   35 +-
 ...m_1_1auto__scheduler_1_1ComputeDAG-members.html |   11 +-
 .../classtvm_1_1auto__scheduler_1_1ComputeDAG.html |   32 +-
 ...1auto__scheduler_1_1ComputeDAG__coll__graph.svg |   45 +-
 ...to__scheduler_1_1ComputeDAG__inherit__graph.svg |   45 +-
 ...sstvm_1_1auto__scheduler_1_1SearchTaskNode.html |    2 +-
 ...o__scheduler_1_1SearchTaskNode__coll__graph.svg |  234 +-
 .../doxygen/classtvm_1_1relay_1_1DFPattern.html    |    2 +-
 ...Pattern_01_6n_00_01Args_8_8_8_08_4-members.html |   23 +-
 ...nst_01DFPattern_01_6n_00_01Args_8_8_8_08_4.html |   74 +-
 ...ern_01_6n_00_01Args_8_8_8_08_4__coll__graph.svg |    2 +-
 .../classtvm_1_1relay_1_1DFPatternNode.html        |    2 +-
 ...m_1_1relay_1_1DFPatternNode__inherit__graph.svg |  217 +-
 ...sstvm_1_1relay_1_1DFPatternVisitor-members.html |   11 +-
 .../classtvm_1_1relay_1_1DFPatternVisitor.html     |   46 +-
 ...m_1_1relay_1_1DFPatternVisitor__coll__graph.svg |    2 +-
 ..._1relay_1_1DFPatternVisitor__inherit__graph.svg |    2 +-
 ...sstvm_1_1relay_1_1DFPattern__inherit__graph.svg |  209 +-
 ...> classtvm_1_1relay_1_1LetPattern-members.html} |   37 +-
 .../doxygen/classtvm_1_1relay_1_1LetPattern.html   |  248 +
 ...asstvm_1_1relay_1_1LetPatternNode-members.html} |   20 +-
 .../classtvm_1_1relay_1_1LetPatternNode.html       |  274 +
 ...tvm_1_1relay_1_1LetPatternNode__coll__graph.svg |  100 +
 ..._1_1relay_1_1LetPatternNode__inherit__graph.svg |   57 +
 ...lasstvm_1_1relay_1_1LetPattern__coll__graph.svg |   67 +
 ...stvm_1_1relay_1_1LetPattern__inherit__graph.svg |   67 +
 .../classtvm_1_1runtime_1_1DeviceAPI-members.html  |    6 +-
 .../doxygen/classtvm_1_1runtime_1_1DeviceAPI.html  |  138 +-
 ...asstvm_1_1runtime_1_1DeviceAPI__coll__graph.svg |   50 +-
 .../classtvm_1_1runtime_1_1NDArray-members.html    |    2 +-
 .../doxygen/classtvm_1_1runtime_1_1NDArray.html    |   19 +-
 ..._1TVMMovableArgValueWithContext__-members.html} |   10 +-
 ...untime_1_1TVMMovableArgValueWithContext__.html} |  103 +-
 ...VMMovableArgValueWithContext____coll__graph.svg |   24 +
 ...asstvm_1_1runtime_1_1TVMPODValue__-members.html |   13 +-
 .../classtvm_1_1runtime_1_1TVMPODValue__.html      |   22 +
 ...PackedFunc_3_01R_07Args_8_8_8_08_4-members.html |   34 +-
 ...1_1TypedPackedFunc_3_01R_07Args_8_8_8_08_4.html |  115 +-
 ...edFunc_3_01R_07Args_8_8_8_08_4__coll__graph.svg |   40 +-
 ..._1runtime_1_1micro__rpc_1_1Session-members.html |    4 +-
 ...asstvm_1_1runtime_1_1micro__rpc_1_1Session.html |   29 +-
 .../doxygen/classtvm_1_1tir_1_1For-members.html    |    2 +-
 docs/api/doxygen/classtvm_1_1tir_1_1For.html       |   24 +-
 .../classtvm_1_1tir_1_1ForNode-members.html        |   13 +-
 docs/api/doxygen/classtvm_1_1tir_1_1ForNode.html   |   52 +-
 .../classtvm_1_1tir_1_1ForNode__coll__graph.svg    |  285 +-
 .../classtvm_1_1tir_1_1ForNode__inherit__graph.svg |   73 +-
 ...classtvm_1_1tir_1_1StmtNode__inherit__graph.svg |   33 +-
 docs/api/doxygen/codegen_8h.html                   |    2 +-
 docs/api/doxygen/codegen_8h__incl.svg              | 1197 ++--
 docs/api/doxygen/codegen_8h_source.html            |    4 +-
 docs/api/doxygen/compute__dag_8h_source.html       |    2 +-
 docs/api/doxygen/constant__utils_8h.html           |    4 +-
 docs/api/doxygen/constant__utils_8h__dep__incl.svg |  143 +-
 docs/api/doxygen/constant__utils_8h__incl.svg      | 1533 ++---
 docs/api/doxygen/cost__model_8h.html               |    2 +-
 docs/api/doxygen/cost__model_8h__incl.svg          | 1135 ++--
 docs/api/doxygen/cuda_2dense_8h.html               |    2 +-
 docs/api/doxygen/cuda_2dense_8h__incl.svg          | 1313 ++---
 docs/api/doxygen/cuda_2dense_8h_source.html        |    4 +-
 docs/api/doxygen/cuda_2injective_8h.html           |    2 +-
 docs/api/doxygen/cuda_2injective_8h__incl.svg      | 1341 ++---
 docs/api/doxygen/cuda_2injective_8h_source.html    |    2 +-
 docs/api/doxygen/cuda_2normalization_8h.html       |    2 +-
 docs/api/doxygen/cuda_2normalization_8h__incl.svg  | 1333 ++---
 docs/api/doxygen/cuda_2pooling_8h.html             |    2 +-
 docs/api/doxygen/cuda_2pooling_8h__incl.svg        | 1351 ++---
 docs/api/doxygen/cuda_2pooling_8h_source.html      |    4 +-
 docs/api/doxygen/cuda_2reduction_8h.html           |    2 +-
 docs/api/doxygen/cuda_2reduction_8h__incl.svg      | 1341 ++---
 docs/api/doxygen/cuda_2reduction_8h_source.html    |    6 +-
 docs/api/doxygen/cuda_2softmax_8h.html             |    2 +-
 docs/api/doxygen/cuda_2softmax_8h__incl.svg        | 1341 ++---
 docs/api/doxygen/cuda_2softmax_8h_source.html      |    2 +-
 docs/api/doxygen/data__layout_8h.html              |    4 +-
 docs/api/doxygen/data__layout_8h__dep__incl.svg    |  185 +-
 docs/api/doxygen/data__layout_8h__incl.svg         | 1283 ++---
 docs/api/doxygen/data__layout_8h_source.html       |    3 +-
 docs/api/doxygen/dataflow__pattern_8h.html         |   11 +-
 docs/api/doxygen/dataflow__pattern_8h__incl.svg    | 1461 ++---
 docs/api/doxygen/dataflow__pattern_8h_source.html  |  104 +-
 .../api/doxygen/dataflow__pattern__functor_8h.html |    2 +-
 .../dataflow__pattern__functor_8h__incl.svg        | 1465 ++---
 .../dataflow__pattern__functor_8h_source.html      |   27 +-
 docs/api/doxygen/debug_8h.html                     |    2 +-
 docs/api/doxygen/debug_8h__incl.svg                |  969 ++--
 docs/api/doxygen/device__api_8h.html               |    2 +-
 docs/api/doxygen/device__api_8h__incl.svg          |  451 +-
 docs/api/doxygen/device__api_8h_source.html        |   23 +-
 docs/api/doxygen/device__copy_8h.html              |    2 +-
 docs/api/doxygen/device__copy_8h__incl.svg         |  941 ++--
 docs/api/doxygen/diagnostic_8h.html                |    4 +-
 docs/api/doxygen/diagnostic_8h__dep__incl.svg      | 1034 ++--
 docs/api/doxygen/diagnostic_8h__incl.svg           | 1155 ++--
 docs/api/doxygen/dir_000004_000020.html            |    2 +-
 docs/api/doxygen/dir_000012_000008.html            |    2 +-
 docs/api/doxygen/dir_000012_000021.html            |    2 +-
 .../dir_63946bee875c6d52bce55e72a67a86ad_dep.svg   |    4 +-
 .../dir_8e4e25e66b8623d88c5b5dd2040bca97.html      |    3 +
 .../dir_b4c7d8e826c599ba55146c099a14beb5_dep.svg   |   12 +-
 docs/api/doxygen/driver__api_8h.html               |    2 +-
 docs/api/doxygen/driver__api_8h__incl.svg          | 1179 ++--
 docs/api/doxygen/einsum_8h.html                    |  220 +
 docs/api/doxygen/einsum_8h__incl.svg               | 1329 +++++
 docs/api/doxygen/einsum_8h_source.html             |  140 +
 docs/api/doxygen/env__func_8h.html                 |    2 +-
 docs/api/doxygen/env__func_8h__incl.svg            |  657 +--
 docs/api/doxygen/env__func_8h_source.html          |    2 +-
 docs/api/doxygen/error_8h.html                     |    4 +-
 docs/api/doxygen/error_8h__dep__incl.svg           |  243 +-
 docs/api/doxygen/error_8h__incl.svg                | 1149 ++--
 docs/api/doxygen/executable_8h.html                |    2 +-
 docs/api/doxygen/executable_8h__incl.svg           |  509 +-
 docs/api/doxygen/files.html                        |   13 +-
 docs/api/doxygen/functions.html                    |    1 +
 docs/api/doxygen/functions_a.html                  |   12 +-
 docs/api/doxygen/functions_b.html                  |    3 +-
 docs/api/doxygen/functions_c.html                  |   13 +-
 docs/api/doxygen/functions_d.html                  |    4 +-
 docs/api/doxygen/functions_e.html                  |    5 +-
 docs/api/doxygen/functions_f.html                  |    7 +-
 docs/api/doxygen/functions_func_a.html             |    4 +-
 docs/api/doxygen/functions_func_c.html             |   11 +-
 docs/api/doxygen/functions_func_e.html             |    2 +-
 docs/api/doxygen/functions_func_f.html             |    2 +-
 docs/api/doxygen/functions_func_i.html             |    2 +-
 docs/api/doxygen/functions_func_l.html             |    6 +
 docs/api/doxygen/functions_func_o.html             |   29 +-
 docs/api/doxygen/functions_func_p.html             |    3 +
 docs/api/doxygen/functions_func_s.html             |    4 +-
 docs/api/doxygen/functions_func_t.html             |   15 +-
 docs/api/doxygen/functions_func_v.html             |   25 +-
 docs/api/doxygen/functions_h.html                  |    1 +
 docs/api/doxygen/functions_i.html                  |    2 +-
 docs/api/doxygen/functions_k.html                  |    1 +
 docs/api/doxygen/functions_l.html                  |    6 +
 docs/api/doxygen/functions_m.html                  |    1 +
 docs/api/doxygen/functions_o.html                  |   27 +-
 docs/api/doxygen/functions_p.html                  |    5 +-
 docs/api/doxygen/functions_r.html                  |    9 +-
 docs/api/doxygen/functions_rela.html               |    1 +
 docs/api/doxygen/functions_s.html                  |    9 +-
 docs/api/doxygen/functions_t.html                  |   19 +-
 docs/api/doxygen/functions_v.html                  |   35 +-
 docs/api/doxygen/functions_vars.html               |    1 +
 docs/api/doxygen/functions_vars_a.html             |    4 +
 docs/api/doxygen/functions_vars_b.html             |    1 +
 docs/api/doxygen/functions_vars_d.html             |    4 +-
 docs/api/doxygen/functions_vars_e.html             |    3 +
 docs/api/doxygen/functions_vars_f.html             |    3 -
 docs/api/doxygen/functions_vars_h.html             |    1 +
 docs/api/doxygen/functions_vars_k.html             |    1 +
 docs/api/doxygen/functions_vars_m.html             |    1 +
 docs/api/doxygen/functions_vars_r.html             |    3 +
 docs/api/doxygen/functions_vars_s.html             |    3 +
 docs/api/doxygen/functions_vars_t.html             |    3 +
 docs/api/doxygen/functions_vars_v.html             |    2 +
 docs/api/doxygen/generic_2default_8h.html          |    2 +-
 docs/api/doxygen/generic_2default_8h__incl.svg     | 1341 ++---
 docs/api/doxygen/generic_2default_8h_source.html   |    2 +-
 docs/api/doxygen/generic_2extern_8h.html           |    2 +-
 docs/api/doxygen/generic_2extern_8h__incl.svg      | 1373 ++---
 docs/api/doxygen/generic_2extern_8h_source.html    |    2 +-
 docs/api/doxygen/generic_2injective_8h.html        |    2 +-
 docs/api/doxygen/generic_2injective_8h__incl.svg   | 1341 ++---
 docs/api/doxygen/generic_2injective_8h_source.html |    2 +-
 docs/api/doxygen/generic__func_8h.html             |    2 +-
 docs/api/doxygen/generic__func_8h__incl.svg        |  919 +--
 docs/api/doxygen/generic__func_8h_source.html      |    6 +-
 docs/api/doxygen/globals.html                      |    2 +
 docs/api/doxygen/globals_defs.html                 |   19 +
 docs/api/doxygen/globals_e.html                    |    2 +
 docs/api/doxygen/globals_f.html                    |    2 +
 docs/api/doxygen/globals_func.html                 |    8 +-
 docs/api/doxygen/globals_g.html                    |    2 +
 docs/api/doxygen/globals_i.html                    |    2 +
 docs/api/doxygen/globals_k.html                    |    2 +
 .../api/doxygen/{globals_v.html => globals_l.html} |   10 +-
 docs/api/doxygen/globals_m.html                    |    2 +
 .../api/doxygen/{globals_g.html => globals_n.html} |   14 +-
 docs/api/doxygen/globals_p.html                    |    2 +
 docs/api/doxygen/globals_r.html                    |    2 +
 docs/api/doxygen/globals_s.html                    |    2 +
 docs/api/doxygen/globals_t.html                    |   10 +-
 docs/api/doxygen/globals_u.html                    |    2 +
 docs/api/doxygen/globals_v.html                    |    2 +
 docs/api/doxygen/hierarchy.html                    | 1505 ++---
 docs/api/doxygen/image_8h.html                     |    2 +-
 docs/api/doxygen/image_8h__incl.svg                | 1125 ++--
 docs/api/doxygen/inherit_graph_124.svg             |   55 +-
 docs/api/doxygen/inherit_graph_125.svg             |   58 +-
 docs/api/doxygen/inherit_graph_126.svg             |   19 +-
 docs/api/doxygen/inherit_graph_127.svg             |    4 +-
 docs/api/doxygen/inherit_graph_128.svg             |   19 +-
 docs/api/doxygen/inherit_graph_129.svg             |   21 +-
 docs/api/doxygen/inherit_graph_130.svg             |   18 +-
 docs/api/doxygen/inherit_graph_131.svg             |   15 +-
 docs/api/doxygen/inherit_graph_132.svg             |   12 +-
 docs/api/doxygen/inherit_graph_133.svg             |   12 +-
 docs/api/doxygen/inherit_graph_134.svg             |   12 +-
 docs/api/doxygen/inherit_graph_135.svg             |   15 +-
 docs/api/doxygen/inherit_graph_136.svg             |   15 +-
 docs/api/doxygen/inherit_graph_137.svg             |   12 +-
 docs/api/doxygen/inherit_graph_138.svg             |   15 +-
 docs/api/doxygen/inherit_graph_139.svg             |   15 +-
 docs/api/doxygen/inherit_graph_140.svg             |   15 +-
 docs/api/doxygen/inherit_graph_141.svg             |   15 +-
 docs/api/doxygen/inherit_graph_142.svg             |   12 +-
 docs/api/doxygen/inherit_graph_143.svg             |   12 +-
 docs/api/doxygen/inherit_graph_144.svg             |   12 +-
 docs/api/doxygen/inherit_graph_145.svg             |   12 +-
 docs/api/doxygen/inherit_graph_146.svg             |   15 +-
 docs/api/doxygen/inherit_graph_147.svg             |   17 +-
 docs/api/doxygen/inherit_graph_148.svg             |   16 +-
 docs/api/doxygen/inherit_graph_149.svg             |   15 +-
 docs/api/doxygen/inherit_graph_150.svg             |   14 +-
 docs/api/doxygen/inherit_graph_151.svg             |   12 +-
 docs/api/doxygen/inherit_graph_152.svg             |   69 +-
 docs/api/doxygen/inherit_graph_153.svg             |   54 +-
 docs/api/doxygen/inherit_graph_154.svg             |   72 +-
 docs/api/doxygen/inherit_graph_155.svg             |   19 +-
 docs/api/doxygen/inherit_graph_156.svg             |   15 +-
 docs/api/doxygen/inherit_graph_157.svg             |   15 +-
 docs/api/doxygen/inherit_graph_158.svg             |   27 +-
 docs/api/doxygen/inherit_graph_159.svg             |   24 +-
 docs/api/doxygen/inherit_graph_160.svg             |   28 +-
 docs/api/doxygen/inherit_graph_161.svg             |   12 +-
 docs/api/doxygen/inherit_graph_162.svg             |   12 +-
 docs/api/doxygen/inherit_graph_163.svg             |   12 +-
 docs/api/doxygen/inherit_graph_164.svg             |   12 +-
 docs/api/doxygen/inherit_graph_165.svg             |   12 +-
 docs/api/doxygen/inherit_graph_166.svg             |   12 +-
 docs/api/doxygen/inherit_graph_167.svg             |   12 +-
 docs/api/doxygen/inherit_graph_168.svg             |   12 +-
 docs/api/doxygen/inherit_graph_169.svg             |   12 +-
 docs/api/doxygen/inherit_graph_170.svg             |   12 +-
 ...inherit_graph_170.svg => inherit_graph_171.svg} |    0
 docs/api/doxygen/inherit_graph_62.svg              | 1019 ++--
 docs/api/doxygen/inherit_graph_87.svg              | 5871 ++++++++++----------
 docs/api/doxygen/inherits.html                     |   96 +-
 docs/api/doxygen/int__set_8h.html                  |    4 +-
 docs/api/doxygen/int__set_8h__dep__incl.svg        | 1066 ++--
 docs/api/doxygen/int__set_8h__incl.svg             | 1021 ++--
 docs/api/doxygen/ir_2adt_8h.html                   |    4 +-
 docs/api/doxygen/ir_2adt_8h__dep__incl.svg         | 1188 ++--
 docs/api/doxygen/ir_2adt_8h__incl.svg              |  893 +--
 docs/api/doxygen/ir_2attrs_8h.html                 |    2 +-
 docs/api/doxygen/ir_2attrs_8h__incl.svg            |  925 +--
 docs/api/doxygen/ir_2attrs_8h_source.html          |    8 +-
 docs/api/doxygen/ir_2expr_8h.html                  |    2 +-
 docs/api/doxygen/ir_2expr_8h__incl.svg             |  855 +--
 docs/api/doxygen/ir_2expr_8h_source.html           |    8 +-
 docs/api/doxygen/ir_2function_8h.html              |    2 +-
 docs/api/doxygen/ir_2function_8h__dep__incl.svg    |   11 +-
 docs/api/doxygen/ir_2function_8h__incl.svg         |  957 ++--
 docs/api/doxygen/ir_2module_8h.html                |    4 +-
 docs/api/doxygen/ir_2module_8h__dep__incl.svg      |  899 ++-
 docs/api/doxygen/ir_2module_8h__incl.svg           | 1145 ++--
 docs/api/doxygen/ir_2op_8h.html                    |    4 +-
 docs/api/doxygen/ir_2op_8h__dep__incl.svg          |  734 +--
 docs/api/doxygen/ir_2op_8h__incl.svg               | 1245 ++---
 docs/api/doxygen/ir_2op_8h_source.html             |    4 +-
 docs/api/doxygen/ir_2transform_8h.html             |    4 +-
 docs/api/doxygen/ir_2transform_8h__dep__incl.svg   |  197 +-
 docs/api/doxygen/ir_2transform_8h__incl.svg        | 1173 ++--
 docs/api/doxygen/ir_2transform_8h_source.html      |    6 +-
 docs/api/doxygen/ir_2type_8h.html                  |    2 +-
 docs/api/doxygen/ir_2type_8h__incl.svg             |  797 +--
 docs/api/doxygen/iter__affine__map_8h.html         |    2 +-
 docs/api/doxygen/iter__affine__map_8h__incl.svg    | 1101 ++--
 docs/api/doxygen/loop__state_8h.html               |    2 +-
 docs/api/doxygen/loop__state_8h__incl.svg          | 1179 ++--
 docs/api/doxygen/measure_8h.html                   |    2 +-
 docs/api/doxygen/measure_8h__incl.svg              | 1161 ++--
 docs/api/doxygen/measure__record_8h.html           |    2 +-
 docs/api/doxygen/measure__record_8h__incl.svg      | 1137 ++--
 docs/api/doxygen/memory__manager_8h.html           |    2 +-
 docs/api/doxygen/memory__manager_8h__incl.svg      |  516 +-
 docs/api/doxygen/memory__manager_8h_source.html    |    2 +-
 docs/api/doxygen/namespacemembers_c.html           |    3 +
 docs/api/doxygen/namespacemembers_d.html           |    9 +-
 docs/api/doxygen/namespacemembers_e.html           |   12 +-
 docs/api/doxygen/namespacemembers_enum.html        |    7 +-
 docs/api/doxygen/namespacemembers_f.html           |    4 +-
 docs/api/doxygen/namespacemembers_func_c.html      |    3 +
 docs/api/doxygen/namespacemembers_func_e.html      |   10 +-
 docs/api/doxygen/namespacemembers_func_g.html      |   12 +-
 docs/api/doxygen/namespacemembers_func_i.html      |    3 +
 docs/api/doxygen/namespacemembers_func_m.html      |    9 +-
 docs/api/doxygen/namespacemembers_func_n.html      |    5 +-
 docs/api/doxygen/namespacemembers_func_o.html      |    2 +-
 docs/api/doxygen/namespacemembers_func_p.html      |   14 +-
 docs/api/doxygen/namespacemembers_func_s.html      |   10 +-
 docs/api/doxygen/namespacemembers_g.html           |   10 +-
 docs/api/doxygen/namespacemembers_i.html           |    3 +
 docs/api/doxygen/namespacemembers_k.html           |    5 +-
 docs/api/doxygen/namespacemembers_m.html           |    9 +-
 docs/api/doxygen/namespacemembers_n.html           |    5 +-
 docs/api/doxygen/namespacemembers_o.html           |    2 +-
 docs/api/doxygen/namespacemembers_p.html           |   12 +
 docs/api/doxygen/namespacemembers_s.html           |   12 +-
 docs/api/doxygen/namespacemembers_vars.html        |    3 +
 docs/api/doxygen/namespacetvm_1_1relay.html        |   66 +
 .../namespacetvm_1_1relay_1_1transform.html        |   39 +
 docs/api/doxygen/namespacetvm_1_1runtime.html      |    3 +
 docs/api/doxygen/namespacetvm_1_1tir.html          |   91 +-
 docs/api/doxygen/namespacetvm_1_1topi.html         | 1275 +++--
 docs/api/doxygen/ndarray_8h.html                   |    3 +-
 docs/api/doxygen/ndarray_8h__incl.svg              |  436 +-
 docs/api/doxygen/ndarray_8h_source.html            |   63 +-
 docs/api/doxygen/nn_2pooling_8h.html               |    2 +-
 docs/api/doxygen/nn_2pooling_8h__incl.svg          | 1143 ++--
 docs/api/doxygen/nn_2softmax_8h.html               |    2 +-
 docs/api/doxygen/nn_2softmax_8h__incl.svg          | 1083 ++--
 docs/api/doxygen/node_2container_8h.html           |    6 +-
 docs/api/doxygen/node_2container_8h__dep__incl.svg |  863 +--
 docs/api/doxygen/node_2container_8h__incl.svg      |  465 +-
 docs/api/doxygen/node_2container_8h_source.html    |   20 +-
 docs/api/doxygen/node_8h.html                      |    4 +-
 docs/api/doxygen/node_8h__dep__incl.svg            | 1407 ++---
 docs/api/doxygen/node_8h__incl.svg                 |  739 +--
 docs/api/doxygen/object_8h.html                    |    2 +-
 docs/api/doxygen/object_8h__dep__incl.svg          | 1041 ++--
 docs/api/doxygen/object_8h_source.html             |   10 +-
 docs/api/doxygen/op__strategy_8h_source.html       |    2 +-
 docs/api/doxygen/operation_8h.html                 |    2 +-
 docs/api/doxygen/operation_8h__dep__incl.svg       |  900 +--
 docs/api/doxygen/packed__func_8h.html              |   19 +-
 docs/api/doxygen/packed__func_8h__incl.svg         |  431 +-
 docs/api/doxygen/packed__func_8h_source.html       |  209 +-
 docs/api/doxygen/parser_8h.html                    |    2 +-
 docs/api/doxygen/parser_8h__incl.svg               |  491 +-
 docs/api/doxygen/pattern_8h.html                   |    2 +-
 docs/api/doxygen/pattern_8h__incl.svg              | 1019 ++--
 docs/api/doxygen/platform_8h.html                  |   42 +
 docs/api/doxygen/platform_8h_source.html           |    3 +-
 docs/api/doxygen/random_8h.html                    |    2 +-
 docs/api/doxygen/random_8h__incl.svg               |  935 ++--
 docs/api/doxygen/ravel__unravel_8h.html            |    2 +-
 docs/api/doxygen/ravel__unravel_8h__dep__incl.svg  |  165 +-
 docs/api/doxygen/reduce_8h.html                    |    2 +-
 docs/api/doxygen/reduce_8h__incl.svg               |  941 ++--
 docs/api/doxygen/reduction_8h.html                 |    2 +-
 docs/api/doxygen/reduction_8h__incl.svg            | 1011 ++--
 docs/api/doxygen/reflection_8h.html                |    2 +-
 docs/api/doxygen/reflection_8h__incl.svg           |  603 +-
 docs/api/doxygen/reflection_8h_source.html         |    6 +-
 docs/api/doxygen/registry_8h.html                  |    2 +-
 docs/api/doxygen/registry_8h__incl.svg             |  459 +-
 docs/api/doxygen/registry_8h_source.html           |    2 +-
 docs/api/doxygen/relay_2adt_8h.html                |    2 +-
 docs/api/doxygen/relay_2adt_8h__incl.svg           | 1495 ++---
 docs/api/doxygen/relay_2attrs_2memory_8h.html      |    2 +-
 docs/api/doxygen/relay_2attrs_2memory_8h__incl.svg | 1437 ++---
 docs/api/doxygen/relay_2attrs_2nn_8h.html          |    2 +-
 docs/api/doxygen/relay_2attrs_2nn_8h__incl.svg     | 1129 ++--
 docs/api/doxygen/relay_2attrs_2transform_8h.html   |    8 +-
 .../doxygen/relay_2attrs_2transform_8h__incl.svg   | 1467 ++---
 .../doxygen/relay_2attrs_2transform_8h_source.html |   11 +-
 docs/api/doxygen/relay_2attrs_2vm_8h.html          |    2 +-
 docs/api/doxygen/relay_2attrs_2vm_8h__incl.svg     |  933 ++--
 docs/api/doxygen/relay_2expr_8h.html               |    2 +-
 docs/api/doxygen/relay_2expr_8h__incl.svg          | 1417 ++---
 docs/api/doxygen/relay_2expr_8h_source.html        |    2 +-
 docs/api/doxygen/relay_2expr__functor_8h.html      |    2 +
 .../doxygen/relay_2expr__functor_8h_source.html    |    3 +-
 docs/api/doxygen/relay_2feature_8h.html            |    2 +-
 docs/api/doxygen/relay_2feature_8h__incl.svg       | 1279 ++---
 docs/api/doxygen/relay_2function_8h.html           |    2 +-
 docs/api/doxygen/relay_2function_8h__incl.svg      | 1407 ++---
 docs/api/doxygen/relay_2op_8h.html                 |    2 +-
 docs/api/doxygen/relay_2op_8h__incl.svg            | 1451 ++---
 .../doxygen/relay_2op__attr__types_8h_source.html  |    2 +-
 docs/api/doxygen/relay_2qnn_2attrs_8h.html         |    2 +-
 docs/api/doxygen/relay_2qnn_2attrs_8h__incl.svg    |  941 ++--
 docs/api/doxygen/relay_2qnn_2transform_8h.html     |    2 +-
 .../api/doxygen/relay_2qnn_2transform_8h__incl.svg | 1949 +++----
 .../doxygen/relay_2qnn_2transform_8h_source.html   |    2 +-
 docs/api/doxygen/relay_2transform_8h.html          |    6 +-
 docs/api/doxygen/relay_2transform_8h__incl.svg     | 1863 ++++---
 docs/api/doxygen/relay_2transform_8h_source.html   |   19 +-
 docs/api/doxygen/relay_2type_8h.html               |    2 +-
 docs/api/doxygen/relay_2type_8h__incl.svg          | 1385 ++---
 docs/api/doxygen/reorg_8h.html                     |    2 +-
 docs/api/doxygen/reorg_8h__incl.svg                | 1045 ++--
 docs/api/doxygen/repr__printer_8h.html             |    2 +-
 docs/api/doxygen/repr__printer_8h__dep__incl.svg   | 1028 ++--
 docs/api/doxygen/rocm_2dense_8h.html               |    2 +-
 docs/api/doxygen/rocm_2dense_8h__incl.svg          | 1341 ++---
 docs/api/doxygen/rocm_2dense_8h_source.html        |    2 +-
 docs/api/doxygen/rocm_2injective_8h.html           |    2 +-
 docs/api/doxygen/rocm_2injective_8h__incl.svg      | 1341 ++---
 docs/api/doxygen/rocm_2injective_8h_source.html    |    2 +-
 docs/api/doxygen/rocm_2normalization_8h.html       |    2 +-
 docs/api/doxygen/rocm_2normalization_8h__incl.svg  | 1295 ++---
 docs/api/doxygen/rocm_2pooling_8h.html             |    2 +-
 docs/api/doxygen/rocm_2pooling_8h__incl.svg        | 1353 ++---
 docs/api/doxygen/rocm_2pooling_8h_source.html      |    2 +-
 docs/api/doxygen/rocm_2reduction_8h.html           |    2 +-
 docs/api/doxygen/rocm_2reduction_8h__incl.svg      | 1341 ++---
 docs/api/doxygen/rocm_2reduction_8h_source.html    |    2 +-
 docs/api/doxygen/rocm_2softmax_8h.html             |    2 +-
 docs/api/doxygen/rocm_2softmax_8h__incl.svg        | 1341 ++---
 docs/api/doxygen/rocm_2softmax_8h_source.html      |    2 +-
 docs/api/doxygen/runtime_2container_8h.html        |    2 +-
 .../doxygen/runtime_2container_8h__dep__incl.svg   |  967 ++--
 docs/api/doxygen/runtime_2container_8h_source.html |    2 +-
 docs/api/doxygen/runtime_2memory_8h.html           |    2 +-
 docs/api/doxygen/runtime_2memory_8h__dep__incl.svg | 1327 ++---
 docs/api/doxygen/runtime_2module_8h.html           |    2 +-
 docs/api/doxygen/runtime_2module_8h__incl.svg      |  433 +-
 docs/api/doxygen/runtime_2module_8h_source.html    |    2 +-
 docs/api/doxygen/runtime_2vm_2vm_8h.html           |    2 +-
 docs/api/doxygen/runtime_2vm_2vm_8h__incl.svg      |  675 +--
 docs/api/doxygen/schedule_8h.html                  |    4 +-
 docs/api/doxygen/schedule_8h__dep__incl.svg        |  841 +--
 docs/api/doxygen/schedule_8h__incl.svg             | 1251 ++---
 docs/api/doxygen/schedule_8h_source.html           |    3 +-
 docs/api/doxygen/schedule__pass_8h.html            |    2 +-
 docs/api/doxygen/schedule__pass_8h__incl.svg       | 1355 ++---
 docs/api/doxygen/search/all_0.js                   |    2 +-
 docs/api/doxygen/search/all_1.js                   |   21 +-
 docs/api/doxygen/search/all_10.js                  |   20 +-
 docs/api/doxygen/search/all_12.js                  |   11 +-
 docs/api/doxygen/search/all_13.js                  |   23 +-
 docs/api/doxygen/search/all_14.js                  |   47 +-
 docs/api/doxygen/search/all_15.js                  |    6 +-
 docs/api/doxygen/search/all_16.js                  |   17 +-
 docs/api/doxygen/search/all_2.js                   |    2 +-
 docs/api/doxygen/search/all_3.js                   |   11 +-
 docs/api/doxygen/search/all_4.js                   |    8 +-
 docs/api/doxygen/search/all_5.js                   |   18 +-
 docs/api/doxygen/search/all_6.js                   |   19 +-
 docs/api/doxygen/search/all_7.js                   |    4 +-
 docs/api/doxygen/search/all_8.js                   |    4 +-
 docs/api/doxygen/search/all_9.js                   |   11 +-
 docs/api/doxygen/search/all_b.js                   |   11 +-
 docs/api/doxygen/search/all_c.js                   |   17 +-
 docs/api/doxygen/search/all_d.js                   |   11 +-
 docs/api/doxygen/search/all_e.js                   |    6 +-
 docs/api/doxygen/search/all_f.js                   |    4 +-
 docs/api/doxygen/search/classes_0.js               |    2 +
 docs/api/doxygen/search/classes_10.js              |    2 +-
 docs/api/doxygen/search/classes_11.js              |    3 +-
 docs/api/doxygen/search/classes_12.js              |    1 +
 docs/api/doxygen/search/classes_13.js              |    2 +-
 docs/api/doxygen/search/classes_2.js               |    3 +-
 docs/api/doxygen/search/classes_4.js               |    6 +-
 docs/api/doxygen/search/classes_7.js               |    2 +-
 docs/api/doxygen/search/classes_9.js               |    2 +
 docs/api/doxygen/search/classes_f.js               |    2 +-
 docs/api/doxygen/search/defines_3.js               |    2 +-
 docs/api/doxygen/search/defines_4.js               |    9 +-
 docs/api/doxygen/search/defines_5.js               |    2 +-
 docs/api/doxygen/search/defines_6.js               |   74 +-
 docs/api/doxygen/search/defines_7.js               |    2 +-
 .../search/{enumvalues_b.html => defines_8.html}   |    2 +-
 .../doxygen/search/{defines_6.js => defines_8.js}  |    0
 .../search/{enumvalues_c.html => defines_9.html}   |    2 +-
 .../doxygen/search/{defines_7.js => defines_9.js}  |    0
 docs/api/doxygen/search/enums_3.js                 |    1 -
 docs/api/doxygen/search/enums_5.js                 |    2 +-
 docs/api/doxygen/search/enumvalues_5.js            |    8 +-
 docs/api/doxygen/search/enumvalues_8.js            |    1 -
 docs/api/doxygen/search/enumvalues_9.js            |    4 +-
 docs/api/doxygen/search/enumvalues_a.js            |    4 +-
 docs/api/doxygen/search/enumvalues_b.js            |    5 -
 docs/api/doxygen/search/enumvalues_c.js            |    4 -
 docs/api/doxygen/search/enumvalues_d.html          |   26 -
 docs/api/doxygen/search/enumvalues_d.js            |    4 -
 docs/api/doxygen/search/files_4.js                 |    1 +
 docs/api/doxygen/search/functions_1.js             |    4 +-
 docs/api/doxygen/search/functions_10.js            |    6 +-
 docs/api/doxygen/search/functions_13.js            |    5 +-
 docs/api/doxygen/search/functions_14.js            |   17 +-
 docs/api/doxygen/search/functions_16.js            |    4 +-
 docs/api/doxygen/search/functions_3.js             |    6 +-
 docs/api/doxygen/search/functions_5.js             |    4 +-
 docs/api/doxygen/search/functions_6.js             |    2 +-
 docs/api/doxygen/search/functions_7.js             |    2 +
 docs/api/doxygen/search/functions_9.js             |    3 +-
 docs/api/doxygen/search/functions_c.js             |    2 +
 docs/api/doxygen/search/functions_d.js             |    3 +-
 docs/api/doxygen/search/functions_e.js             |    3 +-
 docs/api/doxygen/search/functions_f.js             |    4 +-
 docs/api/doxygen/search/related_b.js               |    2 +-
 docs/api/doxygen/search/searchdata.js              |    4 +-
 docs/api/doxygen/search/variables_0.js             |    2 +-
 docs/api/doxygen/search/variables_1.js             |    3 +-
 docs/api/doxygen/search/variables_10.js            |    1 +
 docs/api/doxygen/search/variables_11.js            |    1 +
 docs/api/doxygen/search/variables_12.js            |    1 +
 docs/api/doxygen/search/variables_14.js            |    4 +-
 docs/api/doxygen/search/variables_2.js             |    2 +-
 docs/api/doxygen/search/variables_4.js             |    3 +-
 docs/api/doxygen/search/variables_5.js             |    1 +
 docs/api/doxygen/search/variables_6.js             |    1 -
 docs/api/doxygen/search/variables_8.js             |    2 +-
 docs/api/doxygen/search/variables_a.js             |    3 +-
 docs/api/doxygen/search/variables_c.js             |    2 +-
 docs/api/doxygen/search__policy_8h.html            |    2 +-
 docs/api/doxygen/search__policy_8h__incl.svg       | 1191 ++--
 docs/api/doxygen/search__task_8h.html              |    2 +-
 docs/api/doxygen/search__task_8h__incl.svg         | 1239 +++--
 docs/api/doxygen/search__task_8h_source.html       |    2 +-
 docs/api/doxygen/serializer_8h.html                |    2 +-
 docs/api/doxygen/serializer_8h__incl.svg           |  400 +-
 docs/api/doxygen/session_8h_source.html            |    8 +-
 docs/api/doxygen/source__map_8h.html               |    4 +-
 docs/api/doxygen/source__map_8h__dep__incl.svg     | 1180 ++--
 docs/api/doxygen/source__map_8h__incl.svg          |  827 +--
 docs/api/doxygen/span_8h.html                      |    2 +-
 docs/api/doxygen/span_8h__incl.svg                 |  759 +--
 docs/api/doxygen/stmt_8h.html                      |   27 +-
 docs/api/doxygen/stmt_8h__dep__incl.svg            |  963 ++--
 docs/api/doxygen/stmt_8h__incl.svg                 | 1029 ++--
 docs/api/doxygen/stmt_8h_source.html               |  130 +-
 docs/api/doxygen/stmt__functor_8h.html             |    2 +-
 docs/api/doxygen/stmt__functor_8h__incl.svg        | 1107 ++--
 docs/api/doxygen/stmt__functor_8h_source.html      |    4 +-
 ...structtvm_1_1relay_1_1CumsumAttrs-members.html} |   39 +-
 ...html => structtvm_1_1relay_1_1CumsumAttrs.html} |   80 +-
 ...ucttvm_1_1relay_1_1CumsumAttrs__coll__graph.svg |  222 +
 ...tvm_1_1relay_1_1CumsumAttrs__inherit__graph.svg |   88 +
 ...tructtvm_1_1relay_1_1ROIAlignAttrs-members.html |   27 +-
 .../structtvm_1_1relay_1_1ROIAlignAttrs.html       |   18 +-
 ...ttvm_1_1relay_1_1ROIAlignAttrs__coll__graph.svg |  121 +-
 ...m_1_1relay_1_1ROIAlignAttrs__inherit__graph.svg |  105 +-
 ...structtvm_1_1relay_1_1UniqueAttrs-members.html} |   26 +-
 ...html => structtvm_1_1relay_1_1UniqueAttrs.html} |   86 +-
 ...ucttvm_1_1relay_1_1UniqueAttrs__coll__graph.svg |   87 +
 ...tvm_1_1relay_1_1UniqueAttrs__inherit__graph.svg |   87 +
 ...vm_1_1runtime_1_1ObjectTypeChecker-members.html |    3 +-
 .../structtvm_1_1runtime_1_1ObjectTypeChecker.html |   54 +-
 ...eChecker_3_01Array_3_01T_01_4_01_4-members.html |    3 +-
 ...bjectTypeChecker_3_01Array_3_01T_01_4_01_4.html |   34 +-
 ...cker_3_01Array_3_01T_01_4_01_4__coll__graph.svg |   21 +-
 ...ker_3_01Map_3_01K_00_01V_01_4_01_4-members.html |    3 +-
 ...TypeChecker_3_01Map_3_01K_00_01V_01_4_01_4.html |   30 +-
 ...3_01Map_3_01K_00_01V_01_4_01_4__coll__graph.svg |   21 +-
 ..._1runtime_1_1ObjectTypeChecker__coll__graph.svg |   21 +-
 docs/api/doxygen/structural__equal_8h.html         |    4 +-
 .../doxygen/structural__equal_8h__dep__incl.svg    |  759 +--
 docs/api/doxygen/structural__equal_8h__incl.svg    |  531 +-
 docs/api/doxygen/structural__hash_8h.html          |    4 +-
 .../api/doxygen/structural__hash_8h__dep__incl.svg |  759 +--
 docs/api/doxygen/structural__hash_8h__incl.svg     |  537 +-
 docs/api/doxygen/tag_8h.html                       |    2 +-
 docs/api/doxygen/tag_8h__incl.svg                  |  887 +--
 docs/api/doxygen/tag_8h_source.html                |    2 +-
 docs/api/doxygen/tags_8h.html                      |    4 +-
 docs/api/doxygen/tags_8h__dep__incl.svg            |  529 +-
 docs/api/doxygen/tags_8h_source.html               |    9 +-
 docs/api/doxygen/target_8h.html                    |    4 +-
 docs/api/doxygen/target_8h__dep__incl.svg          |  473 +-
 docs/api/doxygen/target_8h__incl.svg               |  863 +--
 docs/api/doxygen/target_8h_source.html             |   21 +-
 docs/api/doxygen/target__info_8h.html              |    2 +-
 docs/api/doxygen/target__info_8h__incl.svg         |  871 +--
 docs/api/doxygen/target__kind_8h.html              |   14 +-
 docs/api/doxygen/target__kind_8h__dep__incl.svg    |  483 +-
 docs/api/doxygen/target__kind_8h__incl.svg         |  803 +--
 docs/api/doxygen/target__kind_8h_source.html       |   22 +-
 docs/api/doxygen/tensor_8h.html                    |    4 +-
 docs/api/doxygen/tensor_8h__dep__incl.svg          |  988 ++--
 docs/api/doxygen/tensor_8h__incl.svg               | 1289 ++---
 docs/api/doxygen/tensor__intrin_8h.html            |    4 +-
 docs/api/doxygen/tensor__intrin_8h__dep__incl.svg  |  789 ++-
 docs/api/doxygen/tensor__intrin_8h__incl.svg       | 1283 ++---
 docs/api/doxygen/tensor__type_8h.html              |    2 +-
 docs/api/doxygen/tensor__type_8h__incl.svg         |  871 +--
 docs/api/doxygen/tensor__utils_8h.html             |    3 +-
 docs/api/doxygen/tensor__utils_8h__dep__incl.svg   |   93 +-
 docs/api/doxygen/tensor__utils_8h__incl.svg        | 1061 ++--
 docs/api/doxygen/tensor__utils_8h_source.html      |    3 +-
 docs/api/doxygen/tir_2analysis_8h.html             |    4 +-
 docs/api/doxygen/tir_2analysis_8h__dep__incl.svg   |  151 +-
 docs/api/doxygen/tir_2analysis_8h__incl.svg        | 1379 ++---
 docs/api/doxygen/tir_2analysis_8h_source.html      |    4 +-
 docs/api/doxygen/tir_2expr_8h.html                 |    2 +-
 docs/api/doxygen/tir_2expr_8h__incl.svg            |  997 ++--
 docs/api/doxygen/tir_2expr__functor_8h.html        |    2 +-
 docs/api/doxygen/tir_2expr__functor_8h__incl.svg   | 1007 ++--
 docs/api/doxygen/tir_2function_8h.html             |    4 +-
 docs/api/doxygen/tir_2function_8h__dep__incl.svg   |  407 +-
 docs/api/doxygen/tir_2function_8h__incl.svg        | 1173 ++--
 docs/api/doxygen/tir_2function_8h_source.html      |    2 +-
 docs/api/doxygen/tir_2op_8h.html                   |    2 +-
 docs/api/doxygen/tir_2op_8h__dep__incl.svg         |  966 ++--
 docs/api/doxygen/tir_2op__attr__types_8h.html      |    2 +-
 .../doxygen/tir_2op__attr__types_8h__dep__incl.svg |  161 +-
 docs/api/doxygen/tir_2transform_8h.html            |    2 +-
 docs/api/doxygen/tir_2transform_8h__incl.svg       | 1331 ++---
 docs/api/doxygen/tir_2transform_8h_source.html     |   16 +-
 docs/api/doxygen/topi_2nn_8h.html                  |    2 +-
 docs/api/doxygen/topi_2nn_8h__incl.svg             | 1111 ++--
 docs/api/doxygen/topi_2nn_8h_source.html           |    2 +-
 docs/api/doxygen/topi_2transform_8h.html           |    2 +-
 docs/api/doxygen/topi_2transform_8h__incl.svg      |  989 ++--
 docs/api/doxygen/transform__step_8h.html           |    2 +-
 docs/api/doxygen/transform__step_8h__incl.svg      | 1163 ++--
 docs/api/doxygen/type__relation_8h.html            |    2 +-
 docs/api/doxygen/type__relation_8h__incl.svg       | 1183 ++--
 docs/api/doxygen/utils_8h.html                     |    2 +-
 docs/api/doxygen/utils_8h__incl.svg                |  869 +--
 docs/api/doxygen/utils_8h_source.html              |    4 +-
 docs/api/doxygen/var_8h.html                       |   10 +-
 docs/api/doxygen/var_8h__dep__incl.svg             |  919 +--
 docs/api/doxygen/var_8h__incl.svg                  |  851 +--
 docs/api/doxygen/vision_8h.html                    |    2 +-
 docs/api/doxygen/vision_8h__incl.svg               | 1125 ++--
 docs/api/doxygen/vision_8h_source.html             |   41 +-
 docs/api/doxygen/with_8h.html                      |    2 +-
 docs/api/doxygen/with_8h__dep__incl.svg            |  785 +--
 docs/api/doxygen/x86_2bnn_8h.html                  |    2 +-
 docs/api/doxygen/x86_2bnn_8h__incl.svg             | 1303 ++---
 docs/api/doxygen/x86_2bnn_8h_source.html           |    4 +-
 docs/api/doxygen/x86_2default_8h.html              |    2 +-
 docs/api/doxygen/x86_2default_8h__incl.svg         | 1341 ++---
 docs/api/doxygen/x86_2default_8h_source.html       |    2 +-
 docs/api/doxygen/x86_2injective_8h.html            |    2 +-
 docs/api/doxygen/x86_2injective_8h__incl.svg       | 1303 ++---
 docs/api/doxygen/x86_2injective_8h_source.html     |    2 +-
 docs/api/python/auto_scheduler.html                |  198 +-
 docs/api/python/autotvm.html                       |    6 +-
 docs/api/python/graph_runtime.html                 |    4 +-
 docs/api/python/micro.html                         |  112 +-
 docs/api/python/ndarray.html                       |   11 +-
 docs/api/python/relay/dataflow_pattern.html        |   76 +-
 docs/api/python/relay/frontend.html                |    4 +-
 docs/api/python/relay/image.html                   |   16 +-
 docs/api/python/relay/index.html                   |  328 +-
 docs/api/python/relay/nn.html                      |  144 +-
 docs/api/python/relay/testing.html                 |    2 +-
 docs/api/python/relay/transform.html               |   77 +-
 docs/api/python/relay/vision.html                  |    3 +-
 docs/api/python/target.html                        |   13 +-
 docs/api/python/tir.html                           |   94 +-
 docs/api/python/topi.html                          |  496 +-
 docs/api/rust/compiler_ext/fn.tvm_export.html      |    2 +-
 docs/api/rust/search-index.js                      |    6 +-
 docs/api/rust/settings.html                        |    4 +-
 docs/api/rust/tvm/context/enum.DeviceType.html     |    4 +-
 docs/api/rust/tvm/context/struct.Context.html      |   12 +-
 docs/api/rust/tvm/enum.DeviceType.html             |    4 +-
 docs/api/rust/tvm/enum.Error.html                  |    6 +-
 docs/api/rust/tvm/enum.NDArrayError.html           |    6 +-
 docs/api/rust/tvm/errors/enum.Error.html           |    6 +-
 docs/api/rust/tvm/errors/enum.NDArrayError.html    |    6 +-
 docs/api/rust/tvm/function/enum.ArgValue.html      |  156 +-
 docs/api/rust/tvm/function/enum.RetValue.html      |  100 +-
 .../rust/tvm/function/ffi/struct.DLContext.html    |   26 +-
 .../rust/tvm/function/ffi/struct.DLDataType.html   |   24 +-
 .../api/rust/tvm/function/ffi/struct.DLTensor.html |   12 +-
 .../rust/tvm/function/ffi/struct.TVMByteArray.html |    4 +-
 docs/api/rust/tvm/function/ffi/union.TVMValue.html |   32 +-
 docs/api/rust/tvm/function/struct.Function.html    |    8 +-
 docs/api/rust/tvm/ir/relay/struct.DataType.html    |    6 +-
 docs/api/rust/tvm/module/struct.Module.html        |   12 +-
 docs/api/rust/tvm/ndarray/struct.NDArray.html      |   16 +-
 .../rust/tvm/ndarray/struct.NDArrayContainer.html  |    4 +-
 docs/api/rust/tvm/runtime/array/struct.Array.html  |    8 +-
 .../rust/tvm/runtime/context/enum.DeviceType.html  |    4 +-
 .../rust/tvm/runtime/context/struct.Context.html   |   12 +-
 docs/api/rust/tvm/runtime/enum.ArgValue.html       |  156 +-
 docs/api/rust/tvm/runtime/enum.DeviceType.html     |    4 +-
 docs/api/rust/tvm/runtime/enum.Error.html          |    6 +-
 docs/api/rust/tvm/runtime/enum.NDArrayError.html   |    6 +-
 docs/api/rust/tvm/runtime/enum.RetValue.html       |  100 +-
 docs/api/rust/tvm/runtime/errors/enum.Error.html   |    6 +-
 .../rust/tvm/runtime/errors/enum.NDArrayError.html |    6 +-
 .../rust/tvm/runtime/function/enum.ArgValue.html   |  156 +-
 .../rust/tvm/runtime/function/enum.RetValue.html   |  100 +-
 .../tvm/runtime/function/ffi/struct.DLContext.html |   26 +-
 .../runtime/function/ffi/struct.DLDataType.html    |   24 +-
 .../tvm/runtime/function/ffi/struct.DLTensor.html  |   12 +-
 .../runtime/function/ffi/struct.TVMByteArray.html  |    4 +-
 .../tvm/runtime/function/ffi/union.TVMValue.html   |   32 +-
 .../rust/tvm/runtime/function/struct.Function.html |    8 +-
 .../api/rust/tvm/runtime/module/struct.Module.html |   12 +-
 .../rust/tvm/runtime/ndarray/struct.NDArray.html   |   16 +-
 .../runtime/ndarray/struct.NDArrayContainer.html   |    4 +-
 .../rust/tvm/runtime/object/struct.ObjectPtr.html  |    4 +-
 .../rust/tvm/runtime/object/struct.ObjectRef.html  |   10 +-
 .../api/rust/tvm/runtime/string/struct.String.html |   20 +-
 .../rust/tvm/runtime/string/struct.StringObj.html  |    4 +-
 docs/api/rust/tvm/runtime/struct.ByteArray.html    |   12 +-
 docs/api/rust/tvm/runtime/struct.Context.html      |   12 +-
 docs/api/rust/tvm/runtime/struct.DataType.html     |    6 +-
 docs/api/rust/tvm/runtime/struct.Function.html     |    8 +-
 docs/api/rust/tvm/runtime/struct.Module.html       |   12 +-
 docs/api/rust/tvm/runtime/struct.NDArray.html      |   16 +-
 docs/api/rust/tvm/runtime/struct.ObjectPtr.html    |    4 +-
 docs/api/rust/tvm/runtime/struct.ObjectRef.html    |   10 +-
 docs/api/rust/tvm/runtime/struct.String.html       |   20 +-
 docs/api/rust/tvm/runtime/struct.StringObj.html    |    4 +-
 docs/api/rust/tvm/struct.Context.html              |   12 +-
 docs/api/rust/tvm/struct.DataType.html             |    6 +-
 docs/api/rust/tvm/struct.Function.html             |    8 +-
 docs/api/rust/tvm/struct.Module.html               |   12 +-
 docs/api/rust/tvm/struct.NDArray.html              |   16 +-
 docs/api/rust/tvm_graph_rt/enum.ArgValue.html      |   92 +-
 docs/api/rust/tvm_graph_rt/enum.RetValue.html      |   52 +-
 .../rust/tvm_graph_rt/ffi/struct.DLContext.html    |   26 +-
 .../rust/tvm_graph_rt/ffi/struct.DLDataType.html   |   22 +-
 .../api/rust/tvm_graph_rt/ffi/struct.DLTensor.html |   12 +-
 .../rust/tvm_graph_rt/ffi/struct.TVMByteArray.html |    4 +-
 docs/api/rust/tvm_graph_rt/ffi/union.TVMValue.html |   32 +-
 .../tvm_graph_rt/packed_func/enum.ArgValue.html    |   92 +-
 .../tvm_graph_rt/packed_func/enum.RetValue.html    |   52 +-
 .../tvm_graph_rt/packed_func/union.TVMValue.html   |   32 +-
 docs/api/rust/tvm_graph_rt/struct.DLTensor.html    |   12 +-
 docs/api/rust/tvm_graph_rt/struct.Entry.html       |    6 +-
 docs/api/rust/tvm_graph_rt/struct.Graph.html       |    6 +-
 docs/api/rust/tvm_graph_rt/struct.Node.html        |    6 +-
 docs/api/rust/tvm_graph_rt/union.TVMValue.html     |   32 +-
 docs/api/rust/tvm_rt/context/enum.DeviceType.html  |    4 +-
 docs/api/rust/tvm_rt/context/struct.Context.html   |   12 +-
 docs/api/rust/tvm_rt/enum.ArgValue.html            |   92 +-
 docs/api/rust/tvm_rt/enum.DeviceType.html          |    4 +-
 docs/api/rust/tvm_rt/enum.RetValue.html            |   68 +-
 docs/api/rust/tvm_rt/function/enum.ArgValue.html   |   92 +-
 docs/api/rust/tvm_rt/function/enum.RetValue.html   |   68 +-
 .../rust/tvm_rt/function/ffi/struct.DLContext.html |   26 +-
 .../tvm_rt/function/ffi/struct.DLDataType.html     |   24 +-
 .../rust/tvm_rt/function/ffi/struct.DLTensor.html  |   12 +-
 .../tvm_rt/function/ffi/struct.TVMByteArray.html   |    4 +-
 .../rust/tvm_rt/function/ffi/union.TVMValue.html   |   32 +-
 docs/api/rust/tvm_rt/macro.external.html           |    2 +-
 docs/api/rust/tvm_rt/struct.ByteArray.html         |   12 +-
 docs/api/rust/tvm_rt/struct.Context.html           |   12 +-
 docs/api/rust/tvm_rt/struct.DataType.html          |    6 +-
 docs/api/typedoc/assets/js/search.json             |    2 +-
 docs/api/typedoc/classes/bytestreamreader.html     |   12 +-
 docs/api/typedoc/classes/cachedcallstack.html      |   34 +-
 docs/api/typedoc/classes/dlcontext.html            |   10 +-
 docs/api/typedoc/classes/dldatatype.html           |   12 +-
 docs/api/typedoc/classes/environment.html          |   12 +-
 docs/api/typedoc/classes/ffilibrary.html           |   20 +-
 docs/api/typedoc/classes/graphruntime.html         |   16 +-
 docs/api/typedoc/classes/instance.html             |   40 +-
 docs/api/typedoc/classes/memory.html               |   34 +-
 docs/api/typedoc/classes/module.html               |   10 +-
 docs/api/typedoc/classes/ndarray.html              |   22 +-
 docs/api/typedoc/classes/packedfunccell.html       |    6 +-
 docs/api/typedoc/classes/rpcserver.html            |   14 +-
 docs/api/typedoc/classes/scalar.html               |    6 +-
 docs/api/typedoc/classes/webgpucontext.html        |   12 +-
 docs/api/typedoc/enums/argtypecode.html            |   30 +-
 docs/api/typedoc/enums/aynccallbackcode.html       |    4 +-
 docs/api/typedoc/enums/dldatatypecode.html         |    8 +-
 docs/api/typedoc/enums/rpcserverstate.html         |   12 +-
 docs/api/typedoc/enums/sizeof.html                 |   18 +-
 docs/api/typedoc/index.html                        |  114 +-
 docs/api/typedoc/interfaces/disposable.html        |    2 +-
 docs/api/typedoc/interfaces/functioninfo.html      |    6 +-
 docs/api/typedoc/interfaces/libraryprovider.html   |    4 +-
 docs/deploy/vitis_ai.html                          |   98 +-
 docs/genindex.html                                 |  110 +-
 docs/langref/relay_op.html                         |    2 +-
 docs/langref/relay_pattern.html                    |   24 +
 docs/objects.inv                                   |  Bin 17902 -> 18131 bytes
 docs/searchindex.js                                |    2 +-
 .../auto_scheduler/sg_execution_times.html         |   13 +-
 .../auto_scheduler/tune_conv2d_layer_cuda.html     | 1368 ++++-
 docs/tutorials/auto_scheduler/tune_matmul_x86.html |   12 +-
 ...une_network_mali.html => tune_network_arm.html} |  370 +-
 .../auto_scheduler/tune_network_cuda.html          |   57 +-
 .../auto_scheduler/tune_network_mali.html          |  129 +-
 .../tutorials/auto_scheduler/tune_network_x86.html |   63 +-
 docs/tutorials/autotvm/sg_execution_times.html     |   12 +-
 docs/tutorials/autotvm/tune_conv2d_cuda.html       |   48 +-
 docs/tutorials/autotvm/tune_simple_template.html   |   24 +-
 docs/tutorials/dev/bring_your_own_datatypes.html   |   10 +-
 docs/tutorials/dev/low_level_custom_pass.html      |   12 +-
 docs/tutorials/dev/sg_execution_times.html         |    8 +-
 .../frontend/deploy_model_on_android.html          |    2 +-
 .../frontend/deploy_object_detection_pytorch.html  |    2 +-
 docs/tutorials/frontend/deploy_prequantized.html   |    2 +-
 .../frontend/deploy_prequantized_tflite.html       |    4 +-
 docs/tutorials/frontend/deploy_sparse.html         |   20 +-
 docs/tutorials/frontend/deploy_ssd_gluoncv.html    |    9 +-
 docs/tutorials/frontend/from_onnx.html             |    6 +-
 docs/tutorials/frontend/from_pytorch.html          |    9 +-
 docs/tutorials/frontend/from_tensorflow.html       |    7 +-
 docs/tutorials/frontend/from_tflite.html           |    2 +-
 docs/tutorials/frontend/sg_execution_times.html    |   40 +-
 .../get_started/cross_compilation_and_rpc.html     |    2 +-
 docs/tutorials/get_started/relay_quick_start.html  |  120 +-
 docs/tutorials/get_started/sg_execution_times.html |   10 +-
 .../get_started/tensor_expr_get_started.html       |   16 +-
 docs/tutorials/index.html                          |   33 +-
 docs/tutorials/language/intrin_math.html           |   60 +-
 docs/tutorials/language/reduction.html             |   15 +-
 docs/tutorials/language/schedule_primitives.html   |   28 +-
 docs/tutorials/language/sg_execution_times.html    |   18 +-
 docs/tutorials/language/tensorize.html             |    8 +-
 docs/tutorials/language/tuple_inputs.html          |   20 +-
 docs/tutorials/micro/micro_reference_vm.html       |   23 +-
 docs/tutorials/micro/micro_tflite.html             |   21 +-
 docs/tutorials/micro/sg_execution_times.html       |    4 +-
 docs/tutorials/optimize/opt_conv_cuda.html         |    2 +-
 docs/tutorials/optimize/opt_conv_tensorcore.html   |    2 +-
 docs/tutorials/optimize/opt_gemm.html              |   20 +-
 .../optimize/opt_matmul_auto_tensorcore.html       |   44 +-
 docs/tutorials/optimize/sg_execution_times.html    |   10 +-
 docs/tutorials/topi/intro_topi.html                |    2 +-
 docs/tutorials/topi/sg_execution_times.html        |    4 +-
 docs/vta/tutorials/autotvm/sg_execution_times.html |    4 +-
 docs/vta/tutorials/autotvm/tune_relay_vta.html     |  187 +-
 .../tutorials/frontend/deploy_classification.html  |   18 +-
 .../vta/tutorials/frontend/sg_execution_times.html |    4 +-
 .../tutorials/optimize/matrix_multiply_opt.html    |    4 +-
 .../vta/tutorials/optimize/sg_execution_times.html |    6 +-
 docs/vta/tutorials/sg_execution_times.html         |    6 +-
 docs/vta/tutorials/vta_get_started.html            |    4 +-
 940 files changed, 100067 insertions(+), 89902 deletions(-)

diff --git a/docs/_downloads/02fc8627299fa0b05eb017773b471bfa/from_tflite.py b/docs/_downloads/02fc8627299fa0b05eb017773b471bfa/from_tflite.py
index a3014f9..f7e8422 100644
--- a/docs/_downloads/02fc8627299fa0b05eb017773b471bfa/from_tflite.py
+++ b/docs/_downloads/02fc8627299fa0b05eb017773b471bfa/from_tflite.py
@@ -26,7 +26,7 @@ To get started, TFLite package needs to be installed as prerequisite.
 .. code-block:: bash
 
     # install tflite
-    pip install tflite=2.1.0 --user
+    pip install tflite==2.1.0 --user
 
 
 or you could generate TFLite package yourself. The steps are the following:
diff --git a/docs/_downloads/08e39628455fe618afd9eb5b958a433e/micro_reference_vm.ipynb b/docs/_downloads/08e39628455fe618afd9eb5b958a433e/micro_reference_vm.ipynb
index 078d3ec..c0d921a 100644
--- a/docs/_downloads/08e39628455fe618afd9eb5b958a433e/micro_reference_vm.ipynb
+++ b/docs/_downloads/08e39628455fe618afd9eb5b958a433e/micro_reference_vm.ipynb
@@ -15,7 +15,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n\n# microTVM Reference Virtual Machines\n\n**Author**: `Andrew Reusch <ar...@octoml.ai>`_\n\nThis tutorial explains how to launch microTVM Reference Virtual Machines. You can use these to\ndevelop on real physical hardware without needing to individually install the microTVM\ndependencies. These are also particularly useful when trying to reproduce behavior with\nmicroTVM, such as when filing bug reports.\n\nmicroTVM is the effort to allow TVM to build and execute models on  [...]
+        "\n\n# microTVM Reference Virtual Machines\n\n**Author**: `Andrew Reusch <ar...@octoml.ai>`_\n\nThis tutorial explains how to launch microTVM Reference Virtual Machines. You can use these to\ndevelop on real physical hardware without needing to individually install the microTVM\ndependencies. These are also particularly useful when trying to reproduce behavior with\nmicroTVM, such as when filing bug reports.\n\nmicroTVM is the effort to allow TVM to build and execute models on  [...]
       ]
     }
   ],
diff --git a/docs/_downloads/0c30ce88b67b0e8d46494348ab36c9fb/from_tflite.ipynb b/docs/_downloads/0c30ce88b67b0e8d46494348ab36c9fb/from_tflite.ipynb
index e0ed6f8..a5e08f1 100644
--- a/docs/_downloads/0c30ce88b67b0e8d46494348ab36c9fb/from_tflite.ipynb
+++ b/docs/_downloads/0c30ce88b67b0e8d46494348ab36c9fb/from_tflite.ipynb
@@ -15,7 +15,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\nCompile TFLite Models\n=====================\n**Author**: `Zhao Wu <https://github.com/FrozenGene>`_\n\nThis article is an introductory tutorial to deploy TFLite models with Relay.\n\nTo get started, TFLite package needs to be installed as prerequisite.\n\n.. code-block:: bash\n\n    # install tflite\n    pip install tflite=2.1.0 --user\n\n\nor you could generate TFLite package yourself. The steps are the following:\n\n.. code-block:: bash\n\n    # Get the flatc compiler.\n    [...]
+        "\nCompile TFLite Models\n=====================\n**Author**: `Zhao Wu <https://github.com/FrozenGene>`_\n\nThis article is an introductory tutorial to deploy TFLite models with Relay.\n\nTo get started, TFLite package needs to be installed as prerequisite.\n\n.. code-block:: bash\n\n    # install tflite\n    pip install tflite==2.1.0 --user\n\n\nor you could generate TFLite package yourself. The steps are the following:\n\n.. code-block:: bash\n\n    # Get the flatc compiler.\n   [...]
       ]
     },
     {
diff --git a/docs/_downloads/0c8b1cb0bb1d1dff7899c341215a0f35/tune_network_mali.ipynb b/docs/_downloads/0c8b1cb0bb1d1dff7899c341215a0f35/tune_network_mali.ipynb
index 4254721..b0a4054 100644
--- a/docs/_downloads/0c8b1cb0bb1d1dff7899c341215a0f35/tune_network_mali.ipynb
+++ b/docs/_downloads/0c8b1cb0bb1d1dff7899c341215a0f35/tune_network_mali.ipynb
@@ -126,7 +126,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Other Tips\n----------\n1. During the tuning, the auto-scheduler needs to compile many programs and\n   extract feature from them. This part is CPU-intensive,\n   so a high-performance CPU with many cores is recommended for faster search.\n2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json`\n   to distill the large log file and only save the best useful records.\n3. You can resume a search from the previous log file. You just need to\n [...]
+        "Other Tips\n----------\n1. During the tuning, the auto-scheduler needs to compile many programs and\n   extract feature from them. This part is CPU-intensive,\n   so a high-performance CPU with many cores is recommended for faster search.\n2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`\n   to distill the large log file and only save the best useful records.\n3. You can resume a search from the previous log file. You just need to\n  [...]
       ]
     }
   ],
diff --git a/docs/_downloads/2771a7fc8bf8eeb7788823ff349aacc0/tune_network_cuda.py b/docs/_downloads/2771a7fc8bf8eeb7788823ff349aacc0/tune_network_cuda.py
index b098869..5ed3cee 100644
--- a/docs/_downloads/2771a7fc8bf8eeb7788823ff349aacc0/tune_network_cuda.py
+++ b/docs/_downloads/2771a7fc8bf8eeb7788823ff349aacc0/tune_network_cuda.py
@@ -299,7 +299,7 @@ print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), n
 # 1. During the tuning, the auto-scheduler needs to compile many programs and
 #    extract feature from them. This part is CPU-intensive,
 #    so a high-performance CPU with many cores is recommended for faster search.
-# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json`
+# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`
 #    to distill the large log file and only save the best useful records.
 # 3. You can resume a search from the previous log file. You just need to
 #    add a new argument :code:`load_log_file` when creating the task scheduler
diff --git a/docs/_downloads/33a19782c8aaf9fc62e565c57df5caca/deploy_sparse.py b/docs/_downloads/33a19782c8aaf9fc62e565c57df5caca/deploy_sparse.py
index dcf2fc4..9641fb8 100644
--- a/docs/_downloads/33a19782c8aaf9fc62e565c57df5caca/deploy_sparse.py
+++ b/docs/_downloads/33a19782c8aaf9fc62e565c57df5caca/deploy_sparse.py
@@ -102,10 +102,8 @@ name = "huggingface/prunebert-base-uncased-6-finepruned-w-distil-squad"
 batch_size = 1
 # The length of each input sequence.
 seq_len = 128
-# TVM platform identifier. Although cuda is also supported, it requires
-# tuning that is outside the scope of this tutorial. Note that best
-# cpu performance can be achieved by setting -mcpu appropriately for
-# your specific machine.
+# TVM platform identifier. Note that best cpu performance can be achieved by setting -mcpu
+# appropriately for your specific machine. CUDA and ROCm are also supported.
 target = "llvm"
 # Which device to run on. Should be one of tvm.cpu() or tvm.gpu().
 ctx = tvm.cpu()
@@ -339,3 +337,17 @@ def benchmark():
 # Runtime:             165.26 ms           (12.83 ms)
 # Block Sparse Model with 1x1 blocks:
 # Runtime:             67.75 ms            (8.83 ms)
+
+# Here is the output of this script on a GPU (GTX 1070) with the target "cuda -libs=cublas".
+#
+# Dense Model Benchmark:
+# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (2, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
+# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (768, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
+# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (128, 3072), 'float32'), ('TENSOR', (768, 3072), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
+# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (128, 768), 'float32'), ('TENSOR', (3072, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
+# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (128, 768), 'float32'), ('TENSOR', (768, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
+# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('batch_matmul_cublas.cuda', ('TENSOR', (12, 128, 128), 'float32'), ('TENSOR', (12, 64, 128), 'float32'), (12, 128, 64)). A fallback configuration is used, which may bring great performance regression.
+# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('batch_matmul_cublas.cuda', ('TENSOR', (12, 128, 64), 'float32'), ('TENSOR', (12, 128, 64), 'float32'), (12, 128, 128)). A fallback configuration is used, which may bring great performance regression.
+# Runtime:             10.64 ms            (0.29 ms)
+# Block Sparse Model with 1x1 blocks:
+# Runtime:             6.46 ms             (0.05 ms)
diff --git a/docs/_downloads/48bd751ebaae08fce134e559f86a25cc/tune_relay_vta.ipynb b/docs/_downloads/48bd751ebaae08fce134e559f86a25cc/tune_relay_vta.ipynb
index 92757f6..a14424b 100644
--- a/docs/_downloads/48bd751ebaae08fce134e559f86a25cc/tune_relay_vta.ipynb
+++ b/docs/_downloads/48bd751ebaae08fce134e559f86a25cc/tune_relay_vta.ipynb
@@ -83,7 +83,7 @@
       },
       "outputs": [],
       "source": [
-        "# Tracker host and port can be set by your environment\ntracker_host = os.environ.get(\"TVM_TRACKER_HOST\", \"0.0.0.0\")\ntracker_port = int(os.environ.get(\"TVM_TRACKER_PORT\", 9190))\n\n# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file\nenv = vta.get_env()\n\n# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.\n# Set ``device=arm_cpu`` to run inference on the CPU\n# or ``device=vta`` to run inference on the F [...]
+        "# Tracker host and port can be set by your environment\ntracker_host = os.environ.get(\"TVM_TRACKER_HOST\", \"0.0.0.0\")\ntracker_port = int(os.environ.get(\"TVM_TRACKER_PORT\", 9190))\n\n# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file\nenv = vta.get_env()\n\n# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.\n# Set ``device=arm_cpu`` to run inference on the CPU\n# or ``device=vta`` to run inference on the F [...]
       ]
     },
     {
diff --git a/docs/_downloads/612f9e42b0247df5c8ab277534e2af65/tune_relay_vta.py b/docs/_downloads/612f9e42b0247df5c8ab277534e2af65/tune_relay_vta.py
index c5885b6..ed2671c 100644
--- a/docs/_downloads/612f9e42b0247df5c8ab277534e2af65/tune_relay_vta.py
+++ b/docs/_downloads/612f9e42b0247df5c8ab277534e2af65/tune_relay_vta.py
@@ -215,6 +215,7 @@ tuning_option = {
             port=tracker_port,
             number=5,
             timeout=60,
+            module_loader=vta.module_loader(),
             # check_correctness=True, # TODO: re-enable when check_correctness works again.
         ),
     ),
diff --git a/docs/_downloads/64a7765a4ac55f228cf82b8462944a61/schedule_primitives.ipynb b/docs/_downloads/64a7765a4ac55f228cf82b8462944a61/schedule_primitives.ipynb
index 0b6fc25..054dad8 100644
--- a/docs/_downloads/64a7765a4ac55f228cf82b8462944a61/schedule_primitives.ipynb
+++ b/docs/_downloads/64a7765a4ac55f228cf82b8462944a61/schedule_primitives.ipynb
@@ -76,7 +76,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "split\n-----\n:code:`split` can split a specified axis into two axises by\n:code:`factor`.\n\n"
+        "split\n-----\n:code:`split` can split a specified axis into two axes by\n:code:`factor`.\n\n"
       ]
     },
     {
@@ -112,7 +112,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "tile\n----\n:code:`tile` help you execute the computation tile by tile over two\naxises.\n\n"
+        "tile\n----\n:code:`tile` help you execute the computation tile by tile over two\naxes.\n\n"
       ]
     },
     {
@@ -130,7 +130,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "fuse\n----\n:code:`fuse` can fuse two consecutive axises of one computation.\n\n"
+        "fuse\n----\n:code:`fuse` can fuse two consecutive axes of one computation.\n\n"
       ]
     },
     {
@@ -141,14 +141,14 @@
       },
       "outputs": [],
       "source": [
-        "A = te.placeholder((m, n), name=\"A\")\nB = te.compute((m, n), lambda i, j: A[i, j], name=\"B\")\n\ns = te.create_schedule(B.op)\n# tile to four axises first: (i.outer, j.outer, i.inner, j.inner)\nxo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)\n# then fuse (i.inner, j.inner) into one axis: (i.inner.j.inner.fused)\nfused = s[B].fuse(xi, yi)\nprint(tvm.lower(s, [A, B], simple_mode=True))"
+        "A = te.placeholder((m, n), name=\"A\")\nB = te.compute((m, n), lambda i, j: A[i, j], name=\"B\")\n\ns = te.create_schedule(B.op)\n# tile to four axes first: (i.outer, j.outer, i.inner, j.inner)\nxo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)\n# then fuse (i.inner, j.inner) into one axis: (i.inner.j.inner.fused)\nfused = s[B].fuse(xi, yi)\nprint(tvm.lower(s, [A, B], simple_mode=True))"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "reorder\n-------\n:code:`reorder` can reorder the axises in the specified order.\n\n"
+        "reorder\n-------\n:code:`reorder` can reorder the axes in the specified order.\n\n"
       ]
     },
     {
@@ -159,7 +159,7 @@
       },
       "outputs": [],
       "source": [
-        "A = te.placeholder((m, n), name=\"A\")\nB = te.compute((m, n), lambda i, j: A[i, j], name=\"B\")\n\ns = te.create_schedule(B.op)\n# tile to four axises first: (i.outer, j.outer, i.inner, j.inner)\nxo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)\n# then reorder the axises: (i.inner, j.outer, i.outer, j.inner)\ns[B].reorder(xi, yo, xo, yi)\nprint(tvm.lower(s, [A, B], simple_mode=True))"
+        "A = te.placeholder((m, n), name=\"A\")\nB = te.compute((m, n), lambda i, j: A[i, j], name=\"B\")\n\ns = te.create_schedule(B.op)\n# tile to four axes first: (i.outer, j.outer, i.inner, j.inner)\nxo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)\n# then reorder the axes: (i.inner, j.outer, i.outer, j.inner)\ns[B].reorder(xi, yo, xo, yi)\nprint(tvm.lower(s, [A, B], simple_mode=True))"
       ]
     },
     {
diff --git a/docs/_downloads/77322ea21ff00abad461e549895ef1d8/micro_reference_vm.py b/docs/_downloads/77322ea21ff00abad461e549895ef1d8/micro_reference_vm.py
index bcef6a0..93395a4 100644
--- a/docs/_downloads/77322ea21ff00abad461e549895ef1d8/micro_reference_vm.py
+++ b/docs/_downloads/77322ea21ff00abad461e549895ef1d8/micro_reference_vm.py
@@ -59,15 +59,17 @@ Installing prerequisites
 
 A minimal set of prerequisites are needed:
 
-
 1. `Vagrant <https://vagrantup.com>`__
-2. A supported Virtual Machine hypervisor.
-   `VirtualBox <https://www.virtualbox.org>`__ is one suggested free hypervisor, but please note
+2. A supported Virtual Machine hypervisor (**VirtualBox**, **Parallels**, or **VMWare Fusion/Workstation**).
+   `VirtualBox <https://www.virtualbox.org>`__ is a suggested free hypervisor, but please note
    that the `VirtualBox Extension Pack`_ is required for proper USB forwarding. If using VirtualBox,
    also consider installing the `vbguest <https://github.com/dotless-de/vagrant-vbguest>`_ plugin.
 
 .. _VirtualBox Extension Pack: https://www.virtualbox.org/wiki/Downloads#VirtualBox6.1.16OracleVMVirtualBoxExtensionPack
 
+3. If required for your hypervisor, the
+   `Vagrant provider plugin <https://github.com/hashicorp/vagrant/wiki/Available-Vagrant-Plugins#providers>`__ (or see `here <https://www.vagrantup.com/vmware>`__ for VMWare).
+
 First boot
 ----------
 
@@ -75,9 +77,9 @@ The first time you use a reference VM, you need to create the box locally and th
 
 .. code-block:: bash
 
-    # Replace zepyhr with the name of a different platform, if you are not using Zephyr.
+    # Replace zephyr with the name of a different platform, if you are not using Zephyr.
     ~/.../tvm $ cd apps/microtvm/reference-vm/zephyr
-    # Replace <provider_name> with the name of the hypervisor you wish to use (i.e. virtualbox).
+    # Replace <provider_name> with the name of the hypervisor you wish to use (e.g. virtualbox, parallels, vmware_desktop).
     ~/.../tvm/apps/microtvm/reference-vm/zephyr $ vagrant up --provider=<provider_name>
 
 
@@ -140,6 +142,19 @@ Once the VM has been provisioned, tests can executed using ``poetry``:
 
 .. code-block:: bash
 
-    $ poetry run python3 tests/micro/qemu/test_zephyr.py --microtvm-platforms=stm32f746xx
+    $ cd apps/microtvm/reference-vm/zephyr
+    $ poetry run python3 ../../../../tests/micro/qemu/test_zephyr.py --microtvm-platforms=stm32f746xx
+
+If you do not have physical hardware attached, but wish to run the tests using the
+local QEMU emulator running within the VM, run the following commands instead:
+
+.. code-block:: bash
+
+    $ cd /Users/yourusername/path/to/tvm
+    $ sudo ./docker/install/ubuntu_install_qemu.sh
+    $ cd apps/microtvm/reference-vm/zephyr/
+    $ poetry run pytest ../../../../tests/micro/qemu/test_zephyr.py --microtvm-platforms=host
+
+
 
 """
diff --git a/docs/_downloads/78bebde8ea0f8558ac1a6fe12999f99f/tune_network_mali.py b/docs/_downloads/78bebde8ea0f8558ac1a6fe12999f99f/tune_network_mali.py
index d3fefa7..ca1067b 100644
--- a/docs/_downloads/78bebde8ea0f8558ac1a6fe12999f99f/tune_network_mali.py
+++ b/docs/_downloads/78bebde8ea0f8558ac1a6fe12999f99f/tune_network_mali.py
@@ -349,7 +349,7 @@ def tune_and_evaluate():
 # 1. During the tuning, the auto-scheduler needs to compile many programs and
 #    extract feature from them. This part is CPU-intensive,
 #    so a high-performance CPU with many cores is recommended for faster search.
-# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json`
+# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`
 #    to distill the large log file and only save the best useful records.
 # 3. You can resume a search from the previous log file. You just need to
 #    add a new argument :code:`load_log_file` when creating the task scheduler
diff --git a/docs/_downloads/8246644805c8dfcb0b33ca356cc1fafc/deploy_ssd_gluoncv.ipynb b/docs/_downloads/8246644805c8dfcb0b33ca356cc1fafc/deploy_ssd_gluoncv.ipynb
index fdeded2..337ba47 100644
--- a/docs/_downloads/8246644805c8dfcb0b33ca356cc1fafc/deploy_ssd_gluoncv.ipynb
+++ b/docs/_downloads/8246644805c8dfcb0b33ca356cc1fafc/deploy_ssd_gluoncv.ipynb
@@ -87,7 +87,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Create TVM runtime and do inference\n\n"
+        "Create TVM runtime and do inference\n<div class=\"alert alert-info\"><h4>Note</h4><p>Use target = \"cuda -libs=thrust\" to enable thrust based sort, if you\n  enabled thrust during cmake by -DUSE_THRUST=ON.</p></div>\n\n"
       ]
     },
     {
diff --git a/docs/_downloads/87b9e8307245d848689e4cdc3e6fa9bf/deploy_sparse.ipynb b/docs/_downloads/87b9e8307245d848689e4cdc3e6fa9bf/deploy_sparse.ipynb
index c3dc776..85aa401 100644
--- a/docs/_downloads/87b9e8307245d848689e4cdc3e6fa9bf/deploy_sparse.ipynb
+++ b/docs/_downloads/87b9e8307245d848689e4cdc3e6fa9bf/deploy_sparse.ipynb
@@ -51,7 +51,7 @@
       },
       "outputs": [],
       "source": [
-        "# The name of the transformer model to download and run.\nname = \"huggingface/prunebert-base-uncased-6-finepruned-w-distil-squad\"\n# The number of batches in an input.\nbatch_size = 1\n# The length of each input sequence.\nseq_len = 128\n# TVM platform identifier. Although cuda is also supported, it requires\n# tuning that is outside the scope of this tutorial. Note that best\n# cpu performance can be achieved by setting -mcpu appropriately for\n# your specific machine.\ntarge [...]
+        "# The name of the transformer model to download and run.\nname = \"huggingface/prunebert-base-uncased-6-finepruned-w-distil-squad\"\n# The number of batches in an input.\nbatch_size = 1\n# The length of each input sequence.\nseq_len = 128\n# TVM platform identifier. Note that best cpu performance can be achieved by setting -mcpu\n# appropriately for your specific machine. CUDA and ROCm are also supported.\ntarget = \"llvm\"\n# Which device to run on. Should be one of tvm.cpu() o [...]
       ]
     },
     {
@@ -159,7 +159,7 @@
       },
       "outputs": [],
       "source": [
-        "# Dense Model Benchmark:\n# Cannot find config for target=llvm, workload=('dense_nopack.x86', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (2, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.\n# Cannot find config for target=llvm, workload=('dense_nopack.x86', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (768, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance re [...]
+        "# Dense Model Benchmark:\n# Cannot find config for target=llvm, workload=('dense_nopack.x86', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (2, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.\n# Cannot find config for target=llvm, workload=('dense_nopack.x86', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (768, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance re [...]
       ]
     }
   ],
diff --git a/docs/_downloads/92b34b8e701291844895f4566f6dc366/schedule_primitives.py b/docs/_downloads/92b34b8e701291844895f4566f6dc366/schedule_primitives.py
index eb48dc2..ade79f6 100644
--- a/docs/_downloads/92b34b8e701291844895f4566f6dc366/schedule_primitives.py
+++ b/docs/_downloads/92b34b8e701291844895f4566f6dc366/schedule_primitives.py
@@ -69,7 +69,7 @@ print(tvm.lower(s, [A, B, C], simple_mode=True))
 ######################################################################
 # split
 # -----
-# :code:`split` can split a specified axis into two axises by
+# :code:`split` can split a specified axis into two axes by
 # :code:`factor`.
 A = te.placeholder((m,), name="A")
 B = te.compute((m,), lambda i: A[i] * 2, name="B")
@@ -92,7 +92,7 @@ print(tvm.lower(s, [A, B], simple_mode=True))
 # tile
 # ----
 # :code:`tile` help you execute the computation tile by tile over two
-# axises.
+# axes.
 A = te.placeholder((m, n), name="A")
 B = te.compute((m, n), lambda i, j: A[i, j], name="B")
 
@@ -103,12 +103,12 @@ print(tvm.lower(s, [A, B], simple_mode=True))
 ######################################################################
 # fuse
 # ----
-# :code:`fuse` can fuse two consecutive axises of one computation.
+# :code:`fuse` can fuse two consecutive axes of one computation.
 A = te.placeholder((m, n), name="A")
 B = te.compute((m, n), lambda i, j: A[i, j], name="B")
 
 s = te.create_schedule(B.op)
-# tile to four axises first: (i.outer, j.outer, i.inner, j.inner)
+# tile to four axes first: (i.outer, j.outer, i.inner, j.inner)
 xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)
 # then fuse (i.inner, j.inner) into one axis: (i.inner.j.inner.fused)
 fused = s[B].fuse(xi, yi)
@@ -117,14 +117,14 @@ print(tvm.lower(s, [A, B], simple_mode=True))
 ######################################################################
 # reorder
 # -------
-# :code:`reorder` can reorder the axises in the specified order.
+# :code:`reorder` can reorder the axes in the specified order.
 A = te.placeholder((m, n), name="A")
 B = te.compute((m, n), lambda i, j: A[i, j], name="B")
 
 s = te.create_schedule(B.op)
-# tile to four axises first: (i.outer, j.outer, i.inner, j.inner)
+# tile to four axes first: (i.outer, j.outer, i.inner, j.inner)
 xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)
-# then reorder the axises: (i.inner, j.outer, i.outer, j.inner)
+# then reorder the axes: (i.inner, j.outer, i.outer, j.inner)
 s[B].reorder(xi, yo, xo, yi)
 print(tvm.lower(s, [A, B], simple_mode=True))
 
diff --git a/docs/_downloads/9cf0213876be0a9cc4aaa52a1ebd9586/low_level_custom_pass.py b/docs/_downloads/9cf0213876be0a9cc4aaa52a1ebd9586/low_level_custom_pass.py
index 44fe59f..0bd656d 100644
--- a/docs/_downloads/9cf0213876be0a9cc4aaa52a1ebd9586/low_level_custom_pass.py
+++ b/docs/_downloads/9cf0213876be0a9cc4aaa52a1ebd9586/low_level_custom_pass.py
@@ -116,8 +116,8 @@ def vectorize8(op):
         name = op.loop_var.name
         lo, li = te.var(name + ".outer"), te.var(name + ".inner")
         body = tvm.tir.stmt_functor.substitute(op.body, {op.loop_var: lo * 8 + li})
-        body = tvm.tir.For(li, 0, 8, tvm.tir.For.Vectorized, 0, body)
-        body = tvm.tir.For(lo, 0, extent // 8, tvm.tir.For.Serial, 0, body)
+        body = tvm.tir.For(li, 0, 8, tvm.tir.ForKind.VECTORIZED, body)
+        body = tvm.tir.For(lo, 0, extent // 8, tvm.tir.ForKind.SERIAL, body)
         return body
     return None
 
diff --git a/docs/_downloads/afa7f0ecb19178546f310a1dfa66281f/tune_network_x86.ipynb b/docs/_downloads/afa7f0ecb19178546f310a1dfa66281f/tune_network_x86.ipynb
index 99b970b..c5b9393 100644
--- a/docs/_downloads/afa7f0ecb19178546f310a1dfa66281f/tune_network_x86.ipynb
+++ b/docs/_downloads/afa7f0ecb19178546f310a1dfa66281f/tune_network_x86.ipynb
@@ -119,7 +119,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Other Tips\n----------\n1. During the tuning, the auto-scheduler needs to compile many programs and\n   extract feature from them. This part is CPU-intensive,\n   so a high-performance CPU with many cores is recommended for faster search.\n2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json`\n   to distill the large log file and only save the best useful records.\n3. You can resume a search from the previous log file. You just need to\n [...]
+        "Other Tips\n----------\n1. During the tuning, the auto-scheduler needs to compile many programs and\n   extract feature from them. This part is CPU-intensive,\n   so a high-performance CPU with many cores is recommended for faster search.\n2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`\n   to distill the large log file and only save the best useful records.\n3. You can resume a search from the previous log file. You just need to\n  [...]
       ]
     }
   ],
diff --git a/docs/_downloads/b3eb5454a38ef6a663c9e4a7a3e61896/tune_network_x86.py b/docs/_downloads/b3eb5454a38ef6a663c9e4a7a3e61896/tune_network_x86.py
index 7f96254..8526abb 100644
--- a/docs/_downloads/b3eb5454a38ef6a663c9e4a7a3e61896/tune_network_x86.py
+++ b/docs/_downloads/b3eb5454a38ef6a663c9e4a7a3e61896/tune_network_x86.py
@@ -298,7 +298,7 @@ print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), n
 # 1. During the tuning, the auto-scheduler needs to compile many programs and
 #    extract feature from them. This part is CPU-intensive,
 #    so a high-performance CPU with many cores is recommended for faster search.
-# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json`
+# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`
 #    to distill the large log file and only save the best useful records.
 # 3. You can resume a search from the previous log file. You just need to
 #    add a new argument :code:`load_log_file` when creating the task scheduler
diff --git a/docs/_downloads/b78890bb249aab574c50f16eb0be62a9/tune_network_arm.ipynb b/docs/_downloads/b78890bb249aab574c50f16eb0be62a9/tune_network_arm.ipynb
new file mode 100644
index 0000000..8ce3f8b
--- /dev/null
+++ b/docs/_downloads/b78890bb249aab574c50f16eb0be62a9/tune_network_arm.ipynb
@@ -0,0 +1,161 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\nAuto-scheduling a Neural Network for ARM CPU\n=============================================\n**Author**: `Thierry Moreau <https://github.com/tmoreau89>`_, `Lianmin Zheng <https://github.com/merrymercy>`_\n\nAuto-tuning for specific devices and workloads is critical for getting the\nbest performance. This is a tutorial on how to tune a whole neural\nnetwork for ARM CPU with the auto-scheduler via RPC.\n\nTo auto-tune a neural network, we partition the network into small subgraphs  [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import numpy as np\n\nimport tvm\nfrom tvm import relay, auto_scheduler\nimport tvm.relay.testing\nfrom tvm.contrib import graph_runtime\nfrom tvm.contrib.utils import tempdir"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Define a Network\n----------------\nFirst, we need to define the network with relay frontend API.\nWe can load some pre-defined network from :code:`tvm.relay.testing`.\nWe can also load models from MXNet, ONNX, PyTorch, and TensorFlow\n(see `front end tutorials<tutorial-frontend>`).\n\nFor convolutional neural networks, although auto-scheduler can work correctly\nwith any layout, we found the best performance is typically achieved with NHWC layout.\nWe also implemented more opti [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "def get_network(name, batch_size, layout=\"NHWC\", dtype=\"float32\"):\n    \"\"\"Get the symbol definition and random weight of a network\"\"\"\n\n    # auto-scheduler prefers NHWC layout\n    if layout == \"NHWC\":\n        image_shape = (224, 224, 3)\n    elif layout == \"NCHW\":\n        image_shape = (3, 224, 224)\n    else:\n        raise ValueError(\"Invalid layout: \" + layout)\n\n    input_shape = (batch_size,) + image_shape\n    output_shape = (batch_size, 1000)\n\n    [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Start RPC Tracker\n-----------------\nTVM uses RPC session to communicate with ARM boards.\nDuring tuning, the tuner will send the generated code to the board and\nmeasure the speed of code on the board.\n\nTo scale up the tuning, TVM uses RPC Tracker to manage distributed devices.\nThe RPC Tracker is a centralized controller node. We can register all devices to\nthe tracker. For example, if we have 10 phones, we can register all of them\nto the tracker, and run 10 measurements  [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Register Devices to RPC Tracker\n-----------------------------------\nNow we can register our devices to the tracker. The first step is to\nbuild the TVM runtime for the ARM devices.\n\n* For Linux:\n  Follow this section `build-tvm-runtime-on-device` to build\n  the TVM runtime on the device. Then register the device to tracker by\n\n  .. code-block:: bash\n\n    python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=rasp4b-64\n\n  (replace :code:`[HOST_IP]` with the IP a [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Set Tuning Options\n------------------\nBefore tuning, we should apply some configurations. Here I use a Raspberry Pi 4b 4GB board\nas an example with a 64bit OS (Ubuntu 20.04). In your setting, you should modify the target\nand device_key accordingly.\nSet :code:`use_ndk` to True if you use an Android phone.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "#### DEVICE CONFIG ####\n\n# Replace \"aarch64-linux-gnu\" with the correct target of your board.\n# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.\n# FIXME(tmoreau89, merrymercy): We leave '-device=arm_cpu' out of the target string\n#                               because we're sharing x86 op strategy.\ntarget = tvm.target.Target(\"llvm -mtriple=aarch64-linux-gnu -mattr=+neon\")\n\n# Also replace this with the device key in your tr [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Extract Search Tasks\n--------------------\nNext, we extract the search tasks and their weights from a network.\nThe weight of a task is the number of appearances of the task's subgraph\nin the whole network.\nBy using the weight, we can approximate the end-to-end latency of the network\nas :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the\nlatency of a task and :code:`weight[t]` is the weight of the task.\nThe task scheduler will just optimize this objective.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Extract tasks from the network\nprint(\"Extract tasks...\")\nmod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype)\ntasks, task_weights = auto_scheduler.extract_tasks(mod[\"main\"], params, target)\n\nfor idx, task in enumerate(tasks):\n    print(\"========== Task %d  (workload key: %s) ==========\" % (idx, task.workload_key))\n    print(task.compute_dag)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Tuning and Evaluation\n---------------------\nNow, we set some options for tuning and launch the search tasks\n\n* :code:`num_measure_trials` is the number of measurement trials we can use during the tuning.\n  You can set it to a small number (e.g., 200) for a fast demonstrative run.\n  In practice, we recommend setting it around :code:`800 * len(tasks)`,\n  which is typically enough for the search to converge.\n  For example, there are 29 tasks in resnet-50, so we can set it a [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "def tune_and_evaluate():\n    print(\"Begin tuning...\")\n    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)\n    tune_option = auto_scheduler.TuningOptions(\n        num_measure_trials=200,  # change this to 20000 to achieve the best performance\n        runner=auto_scheduler.RPCRunner(\n            device_key,\n            host=\"0.0.0.0\",\n            port=9191,\n            timeout=30,\n            repeat=1,\n            min_repeat_ms=200,\n            enable_cpu [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "<div class=\"alert alert-info\"><h4>Note</h4><p>Explaining the printed information during tuning\n\n  During the tuning, a lot of information will be printed on the console.\n  They are used for debugging purposes. The most important info is the output\n  of the task scheduler. The following table is a sample output.\n\n  .. code-block:: c\n\n   ----------------------------------------------------------------------\n   ------------------------------  [ Task Scheduler ]\n   ----- [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "<div class=\"alert alert-info\"><h4>Note</h4><p>Terminate the tuning earlier\n\n  You can terminate the tuning earlier by forcibly killing this process.\n  As long as you get at least one valid schedule for each task in the log file,\n  you should be able to do the compilation (the section below).</p></div>\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Other Tips\n----------\n1. During the tuning, the auto-scheduler needs to compile many programs and\n   extract feature from them. This part is CPU-intensive,\n   so a high-performance CPU with many cores is recommended for faster search.\n2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`\n   to distill the large log file and only save the best useful records.\n3. You can resume a search from the previous log file. You just need to\n  [...]
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.12"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/ca08de6c440df207921d807474d26f06/deploy_ssd_gluoncv.py b/docs/_downloads/ca08de6c440df207921d807474d26f06/deploy_ssd_gluoncv.py
index f1f1bbb..478aff2 100644
--- a/docs/_downloads/ca08de6c440df207921d807474d26f06/deploy_ssd_gluoncv.py
+++ b/docs/_downloads/ca08de6c440df207921d807474d26f06/deploy_ssd_gluoncv.py
@@ -94,6 +94,10 @@ def build(target):
 
 ######################################################################
 # Create TVM runtime and do inference
+# .. note::
+#
+#   Use target = "cuda -libs=thrust" to enable thrust based sort, if you
+#   enabled thrust during cmake by -DUSE_THRUST=ON.
 
 
 def run(lib, ctx):
diff --git a/docs/_downloads/cd8ac9c09164cc04dd9ecd131c536680/micro_tflite.ipynb b/docs/_downloads/cd8ac9c09164cc04dd9ecd131c536680/micro_tflite.ipynb
index 6014258..372f7a2 100644
--- a/docs/_downloads/cd8ac9c09164cc04dd9ecd131c536680/micro_tflite.ipynb
+++ b/docs/_downloads/cd8ac9c09164cc04dd9ecd131c536680/micro_tflite.ipynb
@@ -33,7 +33,7 @@
       },
       "outputs": [],
       "source": [
-        "import os\nimport numpy as np\nimport tvm\nimport tvm.micro as micro\nfrom tvm.contrib.download import download_testdata\nfrom tvm.contrib import graph_runtime, utils\nfrom tvm import relay\n\nmodel_url = \"https://people.linaro.org/~tom.gall/sine_model.tflite\"\nmodel_file = \"sine_model.tflite\"\nmodel_path = download_testdata(model_url, model_file, module=\"data\")\n\ntflite_model_buf = open(model_path, \"rb\").read()"
+        "import os\nimport numpy as np\nimport logging\n\nimport tvm\nimport tvm.micro as micro\nfrom tvm.contrib.download import download_testdata\nfrom tvm.contrib import graph_runtime, utils\nfrom tvm import relay\n\nmodel_url = \"https://people.linaro.org/~tom.gall/sine_model.tflite\"\nmodel_file = \"sine_model.tflite\"\nmodel_path = download_testdata(model_url, model_file, module=\"data\")\n\ntflite_model_buf = open(model_path, \"rb\").read()"
       ]
     },
     {
@@ -123,7 +123,7 @@
       },
       "outputs": [],
       "source": [
-        "with tvm.transform.PassContext(\n    opt_level=3, config={\"tir.disable_vectorize\": True}, disabled_pass=[\"FuseOps\"]\n):\n    graph, c_mod, c_params = relay.build(mod, target=TARGET, params=params)\n\n\n# %%\n# Compiling for a simulated device\n# --------------------------------\n#\n# First, compile a static microTVM runtime for the targeted device. In this case, the host simulated\n# device is used.\ncompiler = tvm.micro.DefaultCompiler(target=TARGET)\nopts = tvm.micro.defau [...]
+        "with tvm.transform.PassContext(\n    opt_level=3, config={\"tir.disable_vectorize\": True}, disabled_pass=[\"FuseOps\", \"AlterOpLayout\"]\n):\n    graph, c_mod, c_params = relay.build(mod, target=TARGET, params=params)\n\n\n# %%\n# Compiling for a simulated device\n# --------------------------------\n#\n# First, compile a static microTVM runtime for the targeted device. In this case, the host simulated\n# device is used.\ncompiler = tvm.micro.DefaultCompiler(target=TARGET)\nopt [...]
       ]
     },
     {
diff --git a/docs/_downloads/dad91669fd0ea707f1374fe331b0dffe/tune_network_cuda.ipynb b/docs/_downloads/dad91669fd0ea707f1374fe331b0dffe/tune_network_cuda.ipynb
index 3161dc9..9119377 100644
--- a/docs/_downloads/dad91669fd0ea707f1374fe331b0dffe/tune_network_cuda.ipynb
+++ b/docs/_downloads/dad91669fd0ea707f1374fe331b0dffe/tune_network_cuda.ipynb
@@ -119,7 +119,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Other Tips\n----------\n1. During the tuning, the auto-scheduler needs to compile many programs and\n   extract feature from them. This part is CPU-intensive,\n   so a high-performance CPU with many cores is recommended for faster search.\n2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json`\n   to distill the large log file and only save the best useful records.\n3. You can resume a search from the previous log file. You just need to\n [...]
+        "Other Tips\n----------\n1. During the tuning, the auto-scheduler needs to compile many programs and\n   extract feature from them. This part is CPU-intensive,\n   so a high-performance CPU with many cores is recommended for faster search.\n2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`\n   to distill the large log file and only save the best useful records.\n3. You can resume a search from the previous log file. You just need to\n  [...]
       ]
     }
   ],
diff --git a/docs/_downloads/e87c21d127b0b825efcf978b9f8e2cd7/low_level_custom_pass.ipynb b/docs/_downloads/e87c21d127b0b825efcf978b9f8e2cd7/low_level_custom_pass.ipynb
index 8238239..9a96126 100644
--- a/docs/_downloads/e87c21d127b0b825efcf978b9f8e2cd7/low_level_custom_pass.ipynb
+++ b/docs/_downloads/e87c21d127b0b825efcf978b9f8e2cd7/low_level_custom_pass.ipynb
@@ -87,7 +87,7 @@
       },
       "outputs": [],
       "source": [
-        "def vectorize8(op):\n    \"\"\" Split can vectorize the loops found in `find_width8`. \"\"\"\n    if op in loops:\n        extent = op.extent.value\n        name = op.loop_var.name\n        lo, li = te.var(name + \".outer\"), te.var(name + \".inner\")\n        body = tvm.tir.stmt_functor.substitute(op.body, {op.loop_var: lo * 8 + li})\n        body = tvm.tir.For(li, 0, 8, tvm.tir.For.Vectorized, 0, body)\n        body = tvm.tir.For(lo, 0, extent // 8, tvm.tir.For.Serial, 0, body [...]
+        "def vectorize8(op):\n    \"\"\" Split can vectorize the loops found in `find_width8`. \"\"\"\n    if op in loops:\n        extent = op.extent.value\n        name = op.loop_var.name\n        lo, li = te.var(name + \".outer\"), te.var(name + \".inner\")\n        body = tvm.tir.stmt_functor.substitute(op.body, {op.loop_var: lo * 8 + li})\n        body = tvm.tir.For(li, 0, 8, tvm.tir.ForKind.VECTORIZED, body)\n        body = tvm.tir.For(lo, 0, extent // 8, tvm.tir.ForKind.SERIAL, bo [...]
       ]
     },
     {
diff --git a/docs/_downloads/78bebde8ea0f8558ac1a6fe12999f99f/tune_network_mali.py b/docs/_downloads/fb88afbf9be39a834109b9b842f12fd0/tune_network_arm.py
similarity index 60%
copy from docs/_downloads/78bebde8ea0f8558ac1a6fe12999f99f/tune_network_mali.py
copy to docs/_downloads/fb88afbf9be39a834109b9b842f12fd0/tune_network_arm.py
index d3fefa7..c4add79 100644
--- a/docs/_downloads/78bebde8ea0f8558ac1a6fe12999f99f/tune_network_mali.py
+++ b/docs/_downloads/fb88afbf9be39a834109b9b842f12fd0/tune_network_arm.py
@@ -15,13 +15,13 @@
 # specific language governing permissions and limitations
 # under the License.
 """
-Auto-scheduling a Neural Network for mali GPU
+Auto-scheduling a Neural Network for ARM CPU
 =============================================
-**Author**: `Zhao Wu <https://github.com/FrozenGene>`_
+**Author**: `Thierry Moreau <https://github.com/tmoreau89>`_, `Lianmin Zheng <https://github.com/merrymercy>`_
 
 Auto-tuning for specific devices and workloads is critical for getting the
 best performance. This is a tutorial on how to tune a whole neural
-network for mali GPU with the auto-scheduler.
+network for ARM CPU with the auto-scheduler via RPC.
 
 To auto-tune a neural network, we partition the network into small subgraphs and 
 tune them independently. Each subgraph is treated as one search task.
@@ -50,7 +50,7 @@ import tvm
 from tvm import relay, auto_scheduler
 import tvm.relay.testing
 from tvm.contrib import graph_runtime
-import os
+from tvm.contrib.utils import tempdir
 
 #################################################################
 # Define a Network
@@ -131,31 +131,107 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"):
     return mod, params, input_shape, output_shape
 
 
-# Define the neural network and compilation target.
+#################################################################
+# Start RPC Tracker
+# -----------------
+# TVM uses RPC session to communicate with ARM boards.
+# During tuning, the tuner will send the generated code to the board and
+# measure the speed of code on the board.
+#
+# To scale up the tuning, TVM uses RPC Tracker to manage distributed devices.
+# The RPC Tracker is a centralized controller node. We can register all devices to
+# the tracker. For example, if we have 10 phones, we can register all of them
+# to the tracker, and run 10 measurements in parallel, accelerating the tuning process.
+#
+# To start an RPC tracker, run this command on the host machine. The tracker is
+# required during the whole tuning process, so we need to open a new terminal for
+# this command:
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
+#
+# The expected output is
+#
+# .. code-block:: bash
+#
+#   INFO:RPCTracker:bind to 0.0.0.0:9190
+
+#################################################################
+# Register Devices to RPC Tracker
+# -----------------------------------
+# Now we can register our devices to the tracker. The first step is to
+# build the TVM runtime for the ARM devices.
+#
+# * For Linux:
+#   Follow this section :ref:`build-tvm-runtime-on-device` to build
+#   the TVM runtime on the device. Then register the device to tracker by
+#
+#   .. code-block:: bash
+#
+#     python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=rasp4b-64
+#
+#   (replace :code:`[HOST_IP]` with the IP address of your host machine)
+#
+# * For Android:
+#   Follow this `readme page <https://github.com/apache/tvm/tree/main/apps/android_rpc>`_ to
+#   install the TVM RPC APK on the android device. Make sure you can pass the android rpc test.
+#   Then you have already registered your device. During tuning, you have to go to developer option
+#   and enable "Keep screen awake during charging" and charge your phone to make it stable.
+#
+# After registering devices, we can confirm it by querying rpc_tracker
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190
+#
+# For example, if we have 2 Huawei mate10 pro, 11 Raspberry Pi 4B with 64bit OS, and 2 rk3399,
+# the output can be
+#
+# .. code-block:: bash
+#
+#    Queue Status
+#    ----------------------------------
+#    key          total  free  pending
+#    ----------------------------------
+#    mate10pro    2      2     0
+#    rk3399       2      2     0
+#    rasp4b-64    11     11    0
+#    ----------------------------------
+#
+# You can register multiple devices to the tracker to accelerate the measurement in tuning.
+
+###########################################
+# Set Tuning Options
+# ------------------
+# Before tuning, we should apply some configurations. Here we use a Raspberry Pi 4B 4GB board
+# with a 64-bit OS (Ubuntu 20.04) as an example. In your setting, you should modify the target
+# and device_key accordingly.
+# Set :code:`use_ndk` to True if you use an Android phone.
+
+#### DEVICE CONFIG ####
+
+# Replace "aarch64-linux-gnu" with the correct target of your board.
+# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.
+# FIXME(tmoreau89, merrymercy): We leave '-device=arm_cpu' out of the target string
+#                               because we're sharing x86 op strategy.
+target = tvm.target.Target("llvm -mtriple=aarch64-linux-gnu -mattr=+neon")
+
+# Also replace this with the device key in your tracker
+device_key = "rasp4b-64"
+
+# Set this to True if you use ndk tools for cross compiling
+# And also set the environment variable below to point to the cross compiler
+use_ndk = False
+# os.environ["TVM_NDK_CC"] = "/usr/bin/aarch64-linux-gnu-g++"
+
+#### TUNING OPTION ####
 network = "mobilenet"
 batch_size = 1
 layout = "NHWC"
-# Set this to True if you use ndk tools for cross compiling
-use_ndk = True
-# Path to cross compiler
-os.environ["TVM_NDK_CC"] = "/usr/bin/aarch64-linux-gnu-g++"
-target_host = tvm.target.Target("llvm -mtriple=aarch64-linux-gnu")
-target = tvm.target.Target("opencl -device=mali")
 dtype = "float32"
 log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name)
 
-
-#################################################################
-# Start an RPC Tracker and Register Devices to the Tracker
-# --------------------------------------------------------
-# Please refer to the "Start RPC Tracker" and "Register Devices to RPC Tracker" setions
-# in this :ref:`tutorial <tutorials-autotvm-start-rpc-tracker>` to start an RPC tracker
-# and register devices to the tracker.
-
-# Replace this with the device key in your tracker
-device_key = "rk3399"
-
-
 #################################################################
 # Extract Search Tasks
 # --------------------
@@ -170,41 +246,17 @@ device_key = "rk3399"
 # Extract tasks from the network
 print("Extract tasks...")
 mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype)
-tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target, target_host)
+tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
 
 for idx, task in enumerate(tasks):
     print("========== Task %d  (workload key: %s) ==========" % (idx, task.workload_key))
     print(task.compute_dag)
-######################################################################
-# .. note:: How to get the hardware parameters from remote device
-#
-#   .. code-block:: python
-#
-#     from tvm.auto_scheduler.utils import request_remote
-#     remote = request_remote(device_key, "0.0.0.0", 9190)
-#     ctx = remote.cl()
-#     max_shared_memory_per_block = ctx.max_shared_memory_per_block
-#     # There is no explicit local memory limition
-#     # so we can use INT32_MAX to disalbe the check on local_memory.
-#     max_local_memory_per_block = 2147483647 # INT32_MAX
-#     max_threads_per_block = ctx.max_threads_per_block
-#     max_vthread_extent = int(ctx.warp_size / 4) if int(ctx.warp_size / 4) > 1 else ctx.warp_size
-#     warp_size = ctx.warp_size
-#     hardware_params = auto_scheduler.HardwareParams(-1, 16, 64,
-#                                                     max_shared_memory_per_block, max_local_memory_per_block,
-#                                                     max_threads_per_block, max_vthread_extent, warp_size)
-#
-#  Now you could pass it to search task and tune
-#
-#   .. code-block:: python
-#
-#     tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target, target_host, hardware_params)
-#
+
 
 #################################################################
-# Tuning and Evaluate
-# -------------------
-# Now, we set some options for tuning, launch the search tasks and evaluate the end-to-end performance
+# Tuning and Evaluation
+# ---------------------
+# Now, we set some options for tuning and launch the search tasks
 #
 # * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning.
 #   You can set it to a small number (e.g., 200) for a fast demonstrative run.
@@ -218,6 +270,9 @@ for idx, task in enumerate(tasks):
 # * see :any:`auto_scheduler.TuningOptions`,
 #   :any:`auto_scheduler.LocalRunner` for more parameters.
 #
+# After auto-tuning, we can compile the network with the best schedules we found.
+# All measurement records are dumped into the log file during auto-tuning,
+# so we can read the log file and load the best schedules.
 
 
 def tune_and_evaluate():
@@ -225,40 +280,49 @@ def tune_and_evaluate():
     tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
     tune_option = auto_scheduler.TuningOptions(
         num_measure_trials=200,  # change this to 20000 to achieve the best performance
-        builder=auto_scheduler.LocalBuilder(build_func="ndk" if use_ndk else "default"),
         runner=auto_scheduler.RPCRunner(
-            device_key, host="0.0.0.0", port=9190, repeat=3, timeout=50
+            device_key,
+            host="0.0.0.0",
+            port=9191,
+            timeout=30,
+            repeat=1,
+            min_repeat_ms=200,
+            enable_cpu_cache_flush=True,
         ),
         measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
     )
 
     tuner.tune(tune_option)
 
-    # Compile the whole network
+    # Compile with the history best
     print("Compile...")
     with auto_scheduler.ApplyHistoryBest(log_file):
         with tvm.transform.PassContext(
             opt_level=3, config={"relay.backend.use_auto_scheduler": True}
         ):
-            lib = relay.build(mod, target=target, target_host=target_host, params=params)
+            lib = relay.build(mod, target=target, params=params)
+
+    # Export library
+    tmp = tempdir()
+    if use_ndk:
+        from tvm.contrib import ndk
+
+        filename = "net.so"
+        lib.export_library(tmp.relpath(filename), ndk.create_shared)
+    else:
+        filename = "net.tar"
+        lib.export_library(tmp.relpath(filename))
+
+    # Upload module to device
+    print("Upload...")
+    remote = auto_scheduler.utils.request_remote(device_key, "0.0.0.0", 9191, timeout=10000)
+    remote.upload(tmp.relpath(filename))
+    rlib = remote.load_module(filename)
 
     # Create graph runtime
-    print("=============== Request Remote ===============")
-    from tvm.auto_scheduler.utils import request_remote
-
-    remote = request_remote(device_key, "0.0.0.0", 9190)
-    ctx = remote.cl()
-    from tvm.contrib import utils, ndk
-
-    temp = utils.tempdir()
-    filename = "deploy_lib.so"
-    path_lib = temp.relpath(filename)
-    lib.export_library(path_lib, ndk.create_shared)
-    remote.upload(path_lib)
-    loaded_lib = remote.load_module(filename)
-    module = graph_runtime.GraphModule(loaded_lib["default"](ctx))
-    data = (np.random.uniform(size=input_shape)).astype(dtype)
-    data_tvm = tvm.nd.array(data)
+    ctx = remote.cpu()
+    module = graph_runtime.GraphModule(rlib["default"](ctx))
+    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
     module.set_input("data", data_tvm)
 
     # Evaluate
@@ -270,13 +334,15 @@ def tune_and_evaluate():
     )
 
 
-# We do not run the tuning in our webpage server since server doesn't have mali gpu.
+# We do not run the tuning in our webpage server since the server doesn't have a Raspberry Pi,
+# or device tracker running.
 # Uncomment the following line to run it by yourself.
 
 # tune_and_evaluate()
 
+
 ######################################################################
-# .. note:: Explain the printed information during tuning
+# .. note:: Explaining the printed information during tuning
 #
 #   During the tuning, a lot of information will be printed on the console.
 #   They are used for debugging purposes. The most important info is the output
@@ -284,42 +350,35 @@ def tune_and_evaluate():
 #
 #   .. code-block:: c
 #
-#     ----------------------------------------------------------------------
-#     ------------------------------  [ Task Scheduler ]
-#     ----------------------------------------------------------------------
-#     |  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-#     -------------------------------------------------
-#     |    0 |        0.010 |           0.40 |     64 |
-#     |    1 |        0.087 |          47.19 |     64 |
-#     |    2 |        0.008 |          -0.00 |     64 |
-#     |    3 |        0.177 |         582.07 |     64 |
-#     |    4 |        0.268 |         862.37 |    256 |
-#     |    5 |        0.166 |         621.13 |    128 |
-#     |    6 |        0.170 |         605.10 |    128 |
-#     |    7 |        0.128 |         403.20 |     64 |
-#     |    8 |        0.189 |         545.71 |     64 |
-#     |    9 |        0.231 |        1001.01 |    448 |
-#     |   10 |        0.155 |         664.80 |    256 |
-#     |   11 |        0.155 |         662.86 |    256 |
-#     |   12 |        0.119 |         434.08 |     64 |
-#     |   13 |        0.199 |         522.13 |     64 |
-#     |   14 |        0.235 |         986.56 |    320 |
-#     |   15 |        0.149 |         689.13 |    128 |
-#     |   16 |        0.155 |         664.80 |    192 |
-#     |   17 |        0.151 |         340.64 |     64 |
-#     |   18 |        0.176 |         597.55 |    128 |
-#     |   19 |        0.220 |        1054.37 |    192 |
-#     |   20 |        0.150 |         686.01 |    128 |
-#     |   21 |        0.159 |         650.88 |    128 |
-#     |   22 |        0.073 |         358.19 |     64 |
-#     |   23 |        0.031 |          70.63 |     64 |
-#     |   24 |        0.251 |         947.73 |    128 |
-#     |   25 |        0.157 |         652.47 |    128 |
-#     |   26 |        0.215 |         954.84 |    128 |
-#     |   27 |        0.237 |         868.92 |    128 |
-#     |   28 |        0.266 |         774.06 |    128 |
-#     -------------------------------------------------
-#     Estimated total latency: 10.016 ms      Trials: 3992    Used time : 1131 s      Next ID: 15
+#    ----------------------------------------------------------------------
+#    ------------------------------  [ Task Scheduler ]
+#    ----------------------------------------------------------------------
+#    |  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
+#    -------------------------------------------------
+#    |    0 |        0.013 |           0.31 |     64 |
+#    |    1 |        0.845 |           2.43 |    448 |
+#    |    2 |        0.046 |          -0.00 |     64 |
+#    |    3 |        4.194 |          24.53 |   2112 |
+#    |    4 |        0.109 |           9.21 |     64 |
+#    |    5 |        1.759 |          29.27 |    896 |
+#    |    6 |        0.083 |           6.01 |     64 |
+#    |    7 |        3.084 |          33.38 |   7680 |
+#    |    8 |        0.136 |          14.78 |    384 |
+#    |    9 |        1.349 |          38.23 |    768 |
+#    |   10 |        0.133 |           7.55 |    128 |
+#    |   11 |        2.747 |          37.56 |   1536 |
+#    |   12 |        0.338 |          11.87 |    192 |
+#    |   13 |        1.295 |          40.00 |    704 |
+#    |   14 |        0.482 |           4.16 |    256 |
+#    |   15 |        2.686 |          38.56 |   1344 |
+#    |   16 |        0.884 |           9.08 |    448 |
+#    |   17 |        1.332 |          39.18 |    704 |
+#    |   18 |        1.045 |           3.84 |    576 |
+#    |   19 |        1.391 |          38.09 |    704 |
+#    |   20 |        0.777 |          10.34 |    448 |
+#    |   21 |        0.739 |          30.97 |    448 |
+#    -------------------------------------------------
+#     Estimated total latency: 38.347 ms      Trials: 19992   Used time : 19260 s     Next ID: 3
 #
 #   This table lists the latency and (estimated) speed of all tasks.
 #   It also lists the allocation of measurement trials for all tasks.
@@ -349,14 +408,14 @@ def tune_and_evaluate():
 # 1. During the tuning, the auto-scheduler needs to compile many programs and
 #    extract feature from them. This part is CPU-intensive,
 #    so a high-performance CPU with many cores is recommended for faster search.
-# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json`
+# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`
 #    to distill the large log file and only save the best useful records.
 # 3. You can resume a search from the previous log file. You just need to
 #    add a new argument :code:`load_log_file` when creating the task scheduler
 #    in function :code:`run_tuning`. Say,
 #    :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)`
-# 4. If you have multiple target GPUs, you can use all of them for measurements to
-#    parallelize the measurements. Check this :ref:`section <tutorials-autotvm-scale-up-rpc-tracker>`
+# 4. If you have multiple target CPUs, you can use all of them for measurements to
+#    parallelize the measurements. Check this :ref:`section <tutorials-autotvm-rpc-tracker>`
 #    to learn how to use the RPC Tracker and RPC Server.
 #    To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions`
 #    with :any:`auto_scheduler.RPCRunner`.
diff --git a/docs/_downloads/fd012fa7b67f4e333acce1d25a8e62bc/micro_tflite.py b/docs/_downloads/fd012fa7b67f4e333acce1d25a8e62bc/micro_tflite.py
index c289183..6ad0da5 100644
--- a/docs/_downloads/fd012fa7b67f4e333acce1d25a8e62bc/micro_tflite.py
+++ b/docs/_downloads/fd012fa7b67f4e333acce1d25a8e62bc/micro_tflite.py
@@ -122,6 +122,8 @@ model with Relay.
 
 import os
 import numpy as np
+import logging
+
 import tvm
 import tvm.micro as micro
 from tvm.contrib.download import download_testdata
@@ -195,7 +197,7 @@ TARGET = tvm.target.target.micro("host")
 # Now, compile the model for the target:
 
 with tvm.transform.PassContext(
-    opt_level=3, config={"tir.disable_vectorize": True}, disabled_pass=["FuseOps"]
+    opt_level=3, config={"tir.disable_vectorize": True}, disabled_pass=["FuseOps", "AlterOpLayout"]
 ):
     graph, c_mod, c_params = relay.build(mod, target=TARGET, params=params)
 
@@ -207,7 +209,9 @@ with tvm.transform.PassContext(
 # First, compile a static microTVM runtime for the targeted device. In this case, the host simulated
 # device is used.
 compiler = tvm.micro.DefaultCompiler(target=TARGET)
-opts = tvm.micro.default_options(os.path.join(tvm.micro.CRT_ROOT_DIR, "host"))
+opts = tvm.micro.default_options(
+    os.path.join(tvm.micro.get_standalone_crt_dir(), "template", "host")
+)
 
 # %%
 # Compiling for physical hardware
@@ -227,21 +231,20 @@ opts = tvm.micro.default_options(os.path.join(tvm.micro.CRT_ROOT_DIR, "host"))
 #     )
 #
 #     opts = tvm.micro.default_options(f"{project_dir}/crt")
+#
+# enable printing memory usage statistics of the runtime image
+# generated by Zephyr compiler for the physical hardware
+# logging.basicConfig(level="INFO")
 
 workspace = tvm.micro.Workspace()
 micro_binary = tvm.micro.build_static_runtime(
-    # the x86 compiler *expects* you to give the exact same dictionary for both
-    # lib_opts and bin_opts. so the library compiler is mutating lib_opts and
-    # the binary compiler is expecting those mutations to be in bin_opts.
-    # TODO(weberlo) fix this very bizarre behavior
     workspace,
     compiler,
     c_mod,
-    lib_opts=opts["lib_opts"],
-    bin_opts=opts["bin_opts"],
+    opts,
     # Use the microTVM memory manager. If, in your main.cc, you change TVMPlatformMemoryAllocate and
     # TVMPlatformMemoryFree to use e.g. malloc() and free(), you can omit this extra library.
-    extra_libs=[os.path.join(tvm.micro.build.CRT_ROOT_DIR, "memory")],
+    extra_libs=[tvm.micro.get_standalone_crt_lib("memory")],
 )
 
 
diff --git a/docs/_images/sphx_glr_tune_network_arm_thumb.png b/docs/_images/sphx_glr_tune_network_arm_thumb.png
new file mode 100644
index 0000000..233f8e6
Binary files /dev/null and b/docs/_images/sphx_glr_tune_network_arm_thumb.png differ
diff --git a/docs/_sources/deploy/vitis_ai.rst.txt b/docs/_sources/deploy/vitis_ai.rst.txt
index df29f16..7de8f58 100644
--- a/docs/_sources/deploy/vitis_ai.rst.txt
+++ b/docs/_sources/deploy/vitis_ai.rst.txt
@@ -304,15 +304,22 @@ Edge hardware setup
   This section provides instructions for setting up with the `Pynq <http://www.pynq.io/>`__ platform but
   Petalinux based flows are also supported.
 
-1. Download the Pynq v2.5 image for your target (use Z1 or Z2 for
+1. Download the Pynq v2.6 image for your target (use Z1 or Z2 for
    Ultra96 target depending on board version) Link to image:
-   https://github.com/Xilinx/PYNQ/releases/tag/v2.5
+   https://github.com/Xilinx/PYNQ/releases/tag/v2.6.0
 2. Follow Pynq instructions for setting up the board: `pynq
    setup <https://pynq.readthedocs.io/en/latest/getting_started.html>`__
-3. After connecting to the board, make sure to run as root. Execute
+3. After connecting to the board, make sure to run as root. **Execute**
    ``su``
-4. Set up DPU on Pynq by following the steps here: `DPU Pynq
-   setup <https://github.com/Xilinx/DPU-PYNQ>`__
+4. Set up DPU on Pynq:
+
+    .. code:: bash
+
+     git clone --branch v1.2.0 --recursive --shallow-submodules https://github.com/Xilinx/DPU-PYNQ.git
+     cd DPU-PYNQ/upgrade
+     make
+     pip3 install pynq-dpu==1.2.0
+
 5. Run the following command to download the DPU bitstream:
 
    .. code:: bash
@@ -343,7 +350,7 @@ interface between TVM and Vitis-AI tools.
    .. code:: bash
 
       apt-get install libhdf5-dev
-      pip3 install pydot h5py
+      pip3 install pydot==1.4.1 h5py==2.8.0
 
 2. Install PyXIR
 
@@ -362,16 +369,17 @@ interface between TVM and Vitis-AI tools.
       mkdir build
       cp cmake/config.cmake build
       cd build
+      echo set\(USE_LLVM OFF\) >> config.cmake
       echo set\(USE_VITIS_AI ON\) >> config.cmake
       cmake ..
-      make
+      make tvm_runtime -j$(nproc)
 
 4. Install TVM
 
    .. code:: bash
 
       cd tvm/python
-      pip3 install -e . --user
+      pip3 install -e .
 
 5. Check whether the setup was successful in the Python shell:
 
@@ -441,7 +449,7 @@ TVM.
    import tvm
    import tvm.relay as relay
    from tvm.contrib.target import vitis_ai
-   from tvm.contrib import util, graph_runtime
+   from tvm.contrib import utils, graph_runtime
    from tvm.relay.build_module import bind_params_by_name
    from tvm.relay.op.contrib.vitis_ai import annotation
 
@@ -524,6 +532,8 @@ model in TVM with Vitis-AI at the edge. The first couple of steps will
 have to be run on the host machine and take care of quantization and
 compilation for deployment at the edge.
 
+A complete ResNet 18 example can be found `here <https://github.com/Xilinx/pyxir/tree/master/examples/tvm>`__.
+
 Host steps
 ^^^^^^^^^^
 
@@ -541,7 +551,7 @@ TVM.
    import tvm
    import tvm.relay as relay
    from tvm.contrib.target import vitis_ai
-   from tvm.contrib import util, graph_runtime
+   from tvm.contrib import utils, graph_runtime
    from tvm.relay.build_module import bind_params_by_name
    from tvm.relay.op.contrib.vitis_ai import annotation
 
@@ -549,12 +559,47 @@ After importing a convolutional neural network model using the usual
 Relay API's, annotate the Relay expression for the given Vitis-AI DPU
 target and partition the graph.
 
+.. note::
+
+    We recommend converting DPU convolutions' data layouts to NHWC and CPU convolutions'
+    data layouts to NCHW for best DPU and out of the box CPU performance. You can use the
+    ConvertLayout transformation pass two times to achieve this as demonstrated in the code
+    block underneath. You can also leave the CPU convolution layouts in NHWC and tune ARM CPU
+    performance for this data layout to avoid the layout transformation overheads introduced by
+    executing DPU convolutions in NHWC and CPU convolutions in NCHW
+    (check out the `AutoScheduling <https://tvm.apache.org/docs/tutorials/index.html#autoscheduler-template-free-auto-scheduling>`__
+    and `AutoTuning <https://tvm.apache.org/docs/tutorials/autotvm/tune_relay_arm.html>`__
+    tutorials for this).
+
 .. code:: python
 
    mod["main"] = bind_params_by_name(mod["main"], params)
+   
+   # For edge DPU we recommend converting the convolutions' data layout
+   #    to NHWC for best performance. Therefore, we first convert the layouts
+   #    of all convolutions to NHWC before partitioning. Afterwards, we can
+   #    convert any remaining convolutions (to be executed on CPU) back to NCHW.
+   desired_layouts = {'nn.conv2d': ['NHWC', 'default']}
+   seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(),
+                                   relay.transform.ConvertLayout(desired_layouts),
+                                   relay.transform.FoldConstant()])
+   with tvm.transform.PassContext(opt_level=3):
+       mod = seq(mod)
+            
+   # Annotate and partition the Relay expression for the given target
    mod = annotation(mod, params, target)
    mod = relay.transform.MergeCompilerRegions()(mod)
    mod = relay.transform.PartitionGraph()(mod)
+   
+   # After partitioning we recommend transforming the remaining convolutions
+   #    (that will be executed on CPU, if any) back to NCHW data layout
+   #    for best CPU performance
+   desired_layouts = {'nn.conv2d': ['NCHW', 'default']}
+   seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(),
+                                   relay.transform.ConvertLayout(desired_layouts),
+                                   relay.transform.FoldConstant()])
+   with tvm.transform.PassContext(opt_level=3):
+       mod = seq(mod)
 
 Now, we can build the TVM runtime library for executing the model. The
 TVM target is 'llvm' as the operations that can't be handled by the DPU
@@ -572,13 +617,9 @@ can be included.
 
 .. code:: python
 
-   from tvm.contrib import util
-
-   temp = util.tempdir()
-
    tvm_target = 'llvm'
    target='DPUCZDX8G-zcu104'
-   export_rt_mod_file = temp.relpath("vitis_ai.rtmod")
+   export_rt_mod_file = "vitis_ai.rtmod"
 
    with tvm.transform.PassContext(opt_level=3, config= {'relay.ext.vitis_ai.options.target': target,
    						        'relay.ext.vitis_ai.options.export_runtime_module': export_rt_mod_file}):
@@ -604,9 +645,9 @@ Save the TVM lib module so that the Vitis-AI runtime module will also be exporte
 
 .. code:: python
 
-   from tvm.contrib import util
+   from tvm.contrib import utils
 
-   temp = util.tempdir()
+   temp = utils.tempdir()
    lib.export_library(temp.relpath("tvm_lib.so"))
 
 After quantizing and compiling the model for Vitis-AI acceleration using the
@@ -638,15 +679,31 @@ Edge steps
 ^^^^^^^^^^
 
 After setting up TVM with Vitis-AI on the edge device, you can now load
-the TVM runtime module into memory and feed inputs for inference.
+the TVM runtime module into memory and feed inputs for inference. A nearly
+complete runtime script can be found below. Make sure to run the script
+as root (execute ``su`` in terminal to log into root).
+
+
+.. note::
+
+    You will see a warning about the 'cpu-tf' runtime not being found. This warning is
+    expected on the board and can be ignored. Note also that you **shouldn't** import the
+    PyXIR targets in the run script (``import pyxir.contrib.target.DPUCZDX8G``).
 
 .. code:: python
 
+   import pyxir
+   import tvm
+   from tvm.contrib import graph_runtime
+
    ctx = tvm.cpu()
+   
+   # input_name = ...
+   # input_data = ...
 
    # load the module into memory
    lib = tvm.runtime.load_module("tvm_dpu_arm.so")
 
    module = graph_runtime.GraphModule(lib["default"](tvm.cpu()))
-   module.set_input(name, data)
+   module.set_input(input_name, input_data)
    module.run()
diff --git a/docs/_sources/langref/relay_pattern.rst.txt b/docs/_sources/langref/relay_pattern.rst.txt
index 992954c..d77a519 100644
--- a/docs/_sources/langref/relay_pattern.rst.txt
+++ b/docs/_sources/langref/relay_pattern.rst.txt
@@ -246,6 +246,24 @@ are matched:
 
         assert pat.match(relay.expr.If(cond, x, y))
 
+
+A Relay ``Let`` expression can be matched if all of its variable, value, and body
+are matched:
+
+.. code-block:: python
+
+  def test_match_let():
+      x = is_var("x")
+      y = is_var("y")
+      let_var = is_var("let")
+      pat = is_let(let_var, is_op("less")(x, y), let_var)
+
+      x = relay.var("x")
+      y = relay.var("y")
+      lv = relay.var("let")
+      cond = x < y
+      assert pat.match(relay.expr.Let(lv, cond, lv))
+
 Matching Diamonds and Post-Dominator Graphs
 *******************************************
 
@@ -310,6 +328,7 @@ The high level design is to introduce a language of patterns for now we propose
             | is_tuple()
             | is_tuple_get_item(pattern, index = None)
             | is_if(cond, tru, fls)
+            | is_let(var, value, body)
             | pattern1 `|` pattern2
             | dominates(parent_pattern, path_pattern, child_pattern)
             | FunctionPattern(params, body)
@@ -367,6 +386,16 @@ Function Pattern
 
 Match a Function with a body and parameters
 
+If Pattern
+**********
+
+Match an If with condition, true branch, and false branch
+
+Let Pattern
+***********
+
+Match a Let with a variable, value, and body
+
 Applications
 ============
 
diff --git a/docs/_sources/tutorials/auto_scheduler/sg_execution_times.rst.txt b/docs/_sources/tutorials/auto_scheduler/sg_execution_times.rst.txt
index 0fd1a2d..3bba675 100644
--- a/docs/_sources/tutorials/auto_scheduler/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorials/auto_scheduler/sg_execution_times.rst.txt
@@ -5,10 +5,11 @@
 
 Computation times
 =================
-**03:26.805** total execution time for **tutorials_auto_scheduler** files:
-
-- **01:33.895**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``)
-- **00:55.111**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_matmul_x86.py` (``tune_matmul_x86.py``)
-- **00:34.695**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_network_x86.py` (``tune_network_x86.py``)
-- **00:20.024**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_network_cuda.py` (``tune_network_cuda.py``)
-- **00:03.080**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_network_mali.py` (``tune_network_mali.py``)
+**03:30.902** total execution time for **tutorials_auto_scheduler** files:
+
+- **01:37.345**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``)
+- **00:53.101**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_matmul_x86.py` (``tune_matmul_x86.py``)
+- **00:35.941**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_network_x86.py` (``tune_network_x86.py``)
+- **00:20.223**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_network_cuda.py` (``tune_network_cuda.py``)
+- **00:02.922**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_network_arm.py` (``tune_network_arm.py``)
+- **00:01.370**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_network_mali.py` (``tune_network_mali.py``)
diff --git a/docs/_sources/tutorials/auto_scheduler/tune_conv2d_layer_cuda.rst.txt b/docs/_sources/tutorials/auto_scheduler/tune_conv2d_layer_cuda.rst.txt
index ae1976c..8bbc502 100644
--- a/docs/_sources/tutorials/auto_scheduler/tune_conv2d_layer_cuda.rst.txt
+++ b/docs/_sources/tutorials/auto_scheduler/tune_conv2d_layer_cuda.rst.txt
@@ -216,83 +216,650 @@ cooperative fetching, unrolling and operator fusion.
     primfn(data_1: handle, kernel_1: handle, bias_1: handle, compute_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
       buffers = {compute: Buffer(compute_2: Pointer(float32), float32, [1, 512, 7, 7], []),
-                 bias: Buffer(bias_2: Pointer(float32), float32, [1, 512, 1, 1], []),
                  kernel: Buffer(kernel_2: Pointer(float32), float32, [512, 512, 3, 3], []),
+                 bias: Buffer(bias_2: Pointer(float32), float32, [1, 512, 1, 1], []),
                  data: Buffer(data_2: Pointer(float32), float32, [1, 512, 7, 7], [])}
       buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute} {
-      attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 224;
+      attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 16;
       attr [compute_3: Pointer(float32)] "storage_scope" = "local";
-      allocate(compute_3, float32, [7]);
+      allocate(compute_3, float32, [14]);
       attr [pad_temp.shared: Pointer(float32)] "storage_scope" = "shared";
-      allocate(pad_temp.shared, float32, [216]);
+      allocate(pad_temp.shared, float32, [1296]);
       attr [kernel.shared: Pointer(float32)] "storage_scope" = "shared";
-      allocate(kernel.shared, float32, [1152]);
-      attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 16 {
+      allocate(kernel.shared, float32, [4608]);
+      attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112 {
         compute_3[0] = 0f32
+        compute_3[7] = 0f32
         compute_3[1] = 0f32
+        compute_3[8] = 0f32
         compute_3[2] = 0f32
+        compute_3[9] = 0f32
         compute_3[3] = 0f32
+        compute_3[10] = 0f32
         compute_3[4] = 0f32
+        compute_3[11] = 0f32
         compute_3[5] = 0f32
+        compute_3[12] = 0f32
         compute_3[6] = 0f32
-        for (rc.outer.outer: int32, 0, 64) {
-          attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 16;
-          pad_temp.shared[threadIdx.x_1] = @tir.if_then_else((((1 <= (floordiv(threadIdx.x_1, 9) + floormod(blockIdx.x, 7))) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), (float32*)data_2[(((((rc.outer.outer*392) + (floordiv(threadIdx.x_1, 9)*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(threadIdx.x_1, 9)) - 8)], 0f32, dtype=float32)
-          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 16;
-          pad_temp.shared[(threadIdx.x_1 + 16)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 16), 27), 9) + floormod(blockIdx.x, 7))) && ((floordiv(floormod((threadIdx.x_1 + 16), 27), 9) + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 7), 9))) && (floormod((threadIdx.x_1 + 7), 9) < 8)), (float32*)data_2[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1 + 16), 27)*49)) + (floordiv(floormod((threadIdx.x_1 + 16), 27), 9)*7)) + (floormod(blockIdx.x, [...]
-          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 16;
-          pad_temp.shared[(threadIdx.x_1 + 32)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 5), 27), 9) + floormod(blockIdx.x, 7))) && ((floordiv(floormod((threadIdx.x_1 + 5), 27), 9) + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 5), 9))) && (floormod((threadIdx.x_1 + 5), 9) < 8)), (float32*)data_2[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1 + 32), 27)*49)) + (floordiv(floormod((threadIdx.x_1 + 5), 27), 9)*7)) + (floormod(blockIdx.x, 7) [...]
-          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 16;
-          pad_temp.shared[(threadIdx.x_1 + 48)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 21), 27), 9) + floormod(blockIdx.x, 7))) && ((floordiv(floormod((threadIdx.x_1 + 21), 27), 9) + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 3), 9))) && (floormod((threadIdx.x_1 + 3), 9) < 8)), (float32*)data_2[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1 + 48), 27)*49)) + (floordiv(floormod((threadIdx.x_1 + 21), 27), 9)*7)) + (floormod(blockIdx.x, [...]
-          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 16;
-          pad_temp.shared[(threadIdx.x_1 + 64)] = @tir.if_then_else(((((floordiv(floormod((threadIdx.x_1 + 10), 27), 9) + floormod(blockIdx.x, 7)) < 8) && (1 <= floormod((threadIdx.x_1 + 1), 9))) && (floormod((threadIdx.x_1 + 1), 9) < 8)), (float32*)data_2[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1 + 64), 27)*49)) + (floordiv(floormod((threadIdx.x_1 + 10), 27), 9)*7)) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1 + 1), 9)) - 8)], 0f32, dtype=float32)
-          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 16;
-          pad_temp.shared[(threadIdx.x_1 + 80)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 26), 27), 9) + floormod(blockIdx.x, 7))) && ((floordiv(floormod((threadIdx.x_1 + 26), 27), 9) + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 8), 9))) && (floormod((threadIdx.x_1 + 8), 9) < 8)), (float32*)data_2[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1 + 80), 27)*49)) + (floordiv(floormod((threadIdx.x_1 + 26), 27), 9)*7)) + (floormod(blockIdx.x, [...]
-          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 16;
-          pad_temp.shared[(threadIdx.x_1 + 96)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 15), 27), 9) + floormod(blockIdx.x, 7))) && ((floordiv(floormod((threadIdx.x_1 + 15), 27), 9) + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 6), 9))) && (floormod((threadIdx.x_1 + 6), 9) < 8)), (float32*)data_2[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1 + 96), 27)*49)) + (floordiv(floormod((threadIdx.x_1 + 15), 27), 9)*7)) + (floormod(blockIdx.x, [...]
-          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 16;
-          pad_temp.shared[(threadIdx.x_1 + 112)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 4), 27), 9) + floormod(blockIdx.x, 7))) && ((floordiv(floormod((threadIdx.x_1 + 4), 27), 9) + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 4), 9))) && (floormod((threadIdx.x_1 + 4), 9) < 8)), (float32*)data_2[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1 + 112), 27)*49)) + (floordiv(floormod((threadIdx.x_1 + 4), 27), 9)*7)) + (floormod(blockIdx.x,  [...]
-          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 16;
-          pad_temp.shared[(threadIdx.x_1 + 128)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 20), 27), 9) + floormod(blockIdx.x, 7))) && ((floordiv(floormod((threadIdx.x_1 + 20), 27), 9) + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 2), 9))) && (floormod((threadIdx.x_1 + 2), 9) < 8)), (float32*)data_2[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1 + 128), 27)*49)) + (floordiv(floormod((threadIdx.x_1 + 20), 27), 9)*7)) + (floormod(blockIdx. [...]
-          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 16;
-          pad_temp.shared[(threadIdx.x_1 + 144)] = @tir.if_then_else(((((floordiv(floormod((threadIdx.x_1 + 9), 27), 9) + floormod(blockIdx.x, 7)) < 8) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), (float32*)data_2[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1 + 144), 27)*49)) + (floordiv(floormod((threadIdx.x_1 + 9), 27), 9)*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(threadIdx.x_1, 9)) - 8)], 0f32, dtype=float32)
-          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 16;
-          pad_temp.shared[(threadIdx.x_1 + 160)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 25), 27), 9) + floormod(blockIdx.x, 7))) && ((floordiv(floormod((threadIdx.x_1 + 25), 27), 9) + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 7), 9))) && (floormod((threadIdx.x_1 + 7), 9) < 8)), (float32*)data_2[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1 + 160), 27)*49)) + (floordiv(floormod((threadIdx.x_1 + 25), 27), 9)*7)) + (floormod(blockIdx. [...]
-          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 16;
-          pad_temp.shared[(threadIdx.x_1 + 176)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 14), 27), 9) + floormod(blockIdx.x, 7))) && ((floordiv(floormod((threadIdx.x_1 + 14), 27), 9) + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 5), 9))) && (floormod((threadIdx.x_1 + 5), 9) < 8)), (float32*)data_2[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1 + 176), 27)*49)) + (floordiv(floormod((threadIdx.x_1 + 14), 27), 9)*7)) + (floormod(blockIdx. [...]
-          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 16;
-          pad_temp.shared[(threadIdx.x_1 + 192)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1 + 3), 27), 9) + floormod(blockIdx.x, 7))) && ((floordiv(floormod((threadIdx.x_1 + 3), 27), 9) + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1 + 3), 9))) && (floormod((threadIdx.x_1 + 3), 9) < 8)), (float32*)data_2[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1 + 192), 27)*49)) + (floordiv(floormod((threadIdx.x_1 + 3), 27), 9)*7)) + (floormod(blockIdx.x,  [...]
-          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 16;
-          if @tir.likely((threadIdx.x_1 < 8), dtype=bool) {
-            pad_temp.shared[(threadIdx.x_1 + 208)] = @tir.if_then_else((((floordiv(floormod((threadIdx.x_1 + 19), 27), 9) + floormod(blockIdx.x, 7)) < 8) && (floormod((threadIdx.x_1 + 1), 9) < 8)), (float32*)data_2[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1 + 208), 27)*49)) + (floordiv(floormod((threadIdx.x_1 + 19), 27), 9)*7)) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1 + 1), 9)) - 8)], 0f32, dtype=float32)
+        compute_3[13] = 0f32
+        for (rc.outer.outer: int32, 0, 32) {
+          attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+          pad_temp.shared[threadIdx.x_1] = @tir.if_then_else(((((9 <= floormod(threadIdx.x_1, 81)) && (floormod(threadIdx.x_1, 81) < 72)) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), (float32*)data_2[(((((rc.outer.outer*784) + (floordiv(threadIdx.x_1, 81)*49)) + (floordiv(floormod(threadIdx.x_1, 81), 9)*7)) + floormod(threadIdx.x_1, 9)) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+          pad_temp.shared[(threadIdx.x_1 + 112)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 31), 81)) && (floormod((threadIdx.x_1 + 31), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 4), 9))) && (floormod((threadIdx.x_1 + 4), 9) < 8)), (float32*)data_2[(((((rc.outer.outer*784) + (floordiv((threadIdx.x_1 + 112), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 31), 81), 9)*7)) + floormod((threadIdx.x_1 + 4), 9)) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+          pad_temp.shared[(threadIdx.x_1 + 224)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 62), 81)) && (floormod((threadIdx.x_1 + 62), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 8), 9))) && (floormod((threadIdx.x_1 + 8), 9) < 8)), (float32*)data_2[(((((rc.outer.outer*784) + (floordiv((threadIdx.x_1 + 224), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 62), 81), 9)*7)) + floormod((threadIdx.x_1 + 8), 9)) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+          pad_temp.shared[(threadIdx.x_1 + 336)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 12), 81)) && (floormod((threadIdx.x_1 + 12), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 3), 9))) && (floormod((threadIdx.x_1 + 3), 9) < 8)), (float32*)data_2[(((((rc.outer.outer*784) + (floordiv((threadIdx.x_1 + 336), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 12), 81), 9)*7)) + floormod((threadIdx.x_1 + 3), 9)) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+          pad_temp.shared[(threadIdx.x_1 + 448)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 43), 81)) && (floormod((threadIdx.x_1 + 43), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 7), 9))) && (floormod((threadIdx.x_1 + 7), 9) < 8)), (float32*)data_2[(((((rc.outer.outer*784) + (floordiv((threadIdx.x_1 + 448), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 43), 81), 9)*7)) + floormod((threadIdx.x_1 + 7), 9)) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+          pad_temp.shared[(threadIdx.x_1 + 560)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 74), 81)) && (floormod((threadIdx.x_1 + 74), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 2), 9))) && (floormod((threadIdx.x_1 + 2), 9) < 8)), (float32*)data_2[(((((rc.outer.outer*784) + (floordiv((threadIdx.x_1 + 560), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 74), 81), 9)*7)) + floormod((threadIdx.x_1 + 2), 9)) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+          pad_temp.shared[(threadIdx.x_1 + 672)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 24), 81)) && (floormod((threadIdx.x_1 + 24), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 6), 9))) && (floormod((threadIdx.x_1 + 6), 9) < 8)), (float32*)data_2[(((((rc.outer.outer*784) + (floordiv((threadIdx.x_1 + 672), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 24), 81), 9)*7)) + floormod((threadIdx.x_1 + 6), 9)) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+          pad_temp.shared[(threadIdx.x_1 + 784)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 55), 81)) && (floormod((threadIdx.x_1 + 55), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 1), 9))) && (floormod((threadIdx.x_1 + 1), 9) < 8)), (float32*)data_2[(((((rc.outer.outer*784) + (floordiv((threadIdx.x_1 + 784), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 55), 81), 9)*7)) + floormod((threadIdx.x_1 + 1), 9)) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+          pad_temp.shared[(threadIdx.x_1 + 896)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 5), 81)) && (floormod((threadIdx.x_1 + 5), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 5), 9))) && (floormod((threadIdx.x_1 + 5), 9) < 8)), (float32*)data_2[(((((rc.outer.outer*784) + (floordiv((threadIdx.x_1 + 896), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 5), 81), 9)*7)) + floormod((threadIdx.x_1 + 5), 9)) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+          pad_temp.shared[(threadIdx.x_1 + 1008)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 36), 81)) && (floormod((threadIdx.x_1 + 36), 81) < 72)) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), (float32*)data_2[(((((rc.outer.outer*784) + (floordiv((threadIdx.x_1 + 1008), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 36), 81), 9)*7)) + floormod(threadIdx.x_1, 9)) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+          pad_temp.shared[(threadIdx.x_1 + 1120)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 67), 81)) && (floormod((threadIdx.x_1 + 67), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 4), 9))) && (floormod((threadIdx.x_1 + 4), 9) < 8)), (float32*)data_2[(((((rc.outer.outer*784) + (floordiv((threadIdx.x_1 + 1120), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 67), 81), 9)*7)) + floormod((threadIdx.x_1 + 4), 9)) - 8)], 0f32, dtype=float32)
+          attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112;
+          if @tir.likely((threadIdx.x_1 < 64), dtype=bool) {
+            pad_temp.shared[(threadIdx.x_1 + 1232)] = @tir.if_then_else((((floormod((threadIdx.x_1 + 17), 81) < 72) && (1 <= floormod((threadIdx.x_1 + 8), 9))) && (floormod((threadIdx.x_1 + 8), 9) < 8)), (float32*)data_2[(((((rc.outer.outer*784) + (floordiv((threadIdx.x_1 + 1232), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 17), 81), 9)*7)) + floormod((threadIdx.x_1 + 8), 9)) - 8)], 0f32, dtype=float32)
+          }
+          attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112 {
+            kernel.shared[(threadIdx.x_2*4)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(threadIdx.x_2, 36)*4608)) + (rc.outer.outer*144)) + (floormod(threadIdx.x_2, 36)*4))]
+            kernel.shared[((threadIdx.x_2*4) + 1)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 1), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 1), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 2)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 2), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 2), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 3)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 3), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 3), 144))]
+          }
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112 {
+            kernel.shared[((threadIdx.x_2*4) + 448)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 448), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 16), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 449)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 449), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 17), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 450)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 450), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 18), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 451)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 451), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 19), 144))]
+          }
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112 {
+            kernel.shared[((threadIdx.x_2*4) + 896)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 896), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 32), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 897)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 897), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 33), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 898)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 898), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 34), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 899)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 899), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 35), 144))]
+          }
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112 {
+            kernel.shared[((threadIdx.x_2*4) + 1344)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 1344), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 48), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 1345)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 1345), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 49), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 1346)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 1346), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 50), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 1347)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 1347), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 51), 144))]
+          }
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112 {
+            kernel.shared[((threadIdx.x_2*4) + 1792)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 1792), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 64), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 1793)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 1793), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 65), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 1794)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 1794), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 66), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 1795)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 1795), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 67), 144))]
           }
-          for (ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer: int32, 0, 72) {
-            attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 16;
-            kernel.shared[((ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer*16) + threadIdx.x_2)] = (float32*)kernel_2[((((floordiv(blockIdx.x, 7)*73728) + (floordiv(((ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer*16) + threadIdx.x_2), 72)*4608)) + (rc.outer.outer*72)) + floormod(((ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer*16) + threadIdx.x_2), 72))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112 {
+            kernel.shared[((threadIdx.x_2*4) + 2240)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 2240), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 80), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 2241)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 2241), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 81), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 2242)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 2242), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 82), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 2243)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 2243), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 83), 144))]
           }
-          for (rc.outer.inner: int32, 0, 2) {
-            for (rx.outer.inner: int32, 0, 3) {
-              for (rc.inner: int32, 0, 4) {
-                for (ry.inner: int32, 0, 3) {
-                  compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[((((rc.outer.inner*108) + (rc.inner*27)) + (ry.inner*9)) + rx.outer.inner)]*(float32*)kernel.shared[(((((threadIdx.x*72) + (rc.outer.inner*36)) + (rc.inner*9)) + (ry.inner*3)) + rx.outer.inner)]))
-                  compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((((rc.outer.inner*108) + (rc.inner*27)) + (ry.inner*9)) + rx.outer.inner) + 1)]*(float32*)kernel.shared[(((((threadIdx.x*72) + (rc.outer.inner*36)) + (rc.inner*9)) + (ry.inner*3)) + rx.outer.inner)]))
-                  compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((((rc.outer.inner*108) + (rc.inner*27)) + (ry.inner*9)) + rx.outer.inner) + 2)]*(float32*)kernel.shared[(((((threadIdx.x*72) + (rc.outer.inner*36)) + (rc.inner*9)) + (ry.inner*3)) + rx.outer.inner)]))
-                  compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((((rc.outer.inner*108) + (rc.inner*27)) + (ry.inner*9)) + rx.outer.inner) + 3)]*(float32*)kernel.shared[(((((threadIdx.x*72) + (rc.outer.inner*36)) + (rc.inner*9)) + (ry.inner*3)) + rx.outer.inner)]))
-                  compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((((rc.outer.inner*108) + (rc.inner*27)) + (ry.inner*9)) + rx.outer.inner) + 4)]*(float32*)kernel.shared[(((((threadIdx.x*72) + (rc.outer.inner*36)) + (rc.inner*9)) + (ry.inner*3)) + rx.outer.inner)]))
-                  compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((((rc.outer.inner*108) + (rc.inner*27)) + (ry.inner*9)) + rx.outer.inner) + 5)]*(float32*)kernel.shared[(((((threadIdx.x*72) + (rc.outer.inner*36)) + (rc.inner*9)) + (ry.inner*3)) + rx.outer.inner)]))
-                  compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((((rc.outer.inner*108) + (rc.inner*27)) + (ry.inner*9)) + rx.outer.inner) + 6)]*(float32*)kernel.shared[(((((threadIdx.x*72) + (rc.outer.inner*36)) + (rc.inner*9)) + (ry.inner*3)) + rx.outer.inner)]))
-                }
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112 {
+            kernel.shared[((threadIdx.x_2*4) + 2688)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 2688), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 96), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 2689)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 2689), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 97), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 2690)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 2690), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 98), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 2691)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 2691), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 99), 144))]
+          }
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112 {
+            kernel.shared[((threadIdx.x_2*4) + 3136)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 3136), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 112), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 3137)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 3137), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 113), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 3138)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 3138), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 114), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 3139)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 3139), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 115), 144))]
+          }
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112 {
+            kernel.shared[((threadIdx.x_2*4) + 3584)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 3584), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 128), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 3585)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 3585), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 129), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 3586)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 3586), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 130), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 3587)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 3587), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 131), 144))]
+          }
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112 {
+            kernel.shared[((threadIdx.x_2*4) + 4032)] = (float32*)kernel_2[(((((blockIdx.x*147456) + (floordiv((threadIdx.x_2*4), 144)*4608)) + (rc.outer.outer*144)) + (floormod(threadIdx.x_2, 36)*4)) + 129024)]
+            kernel.shared[((threadIdx.x_2*4) + 4033)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 4033), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 1), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 4034)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 4034), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 2), 144))]
+            kernel.shared[((threadIdx.x_2*4) + 4035)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 4035), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 3), 144))]
+          }
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 112 {
+            if @tir.likely((threadIdx.x_2 < 32), dtype=bool) {
+              kernel.shared[((threadIdx.x_2*4) + 4480)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 4480), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 16), 144))]
+            }
+            if @tir.likely(((threadIdx.x_2*4) < 127), dtype=bool) {
+              if @tir.likely((threadIdx.x_2 < 32), dtype=bool) {
+                kernel.shared[((threadIdx.x_2*4) + 4481)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 4481), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 17), 144))]
+              }
+            }
+            if @tir.likely(((threadIdx.x_2*4) < 126), dtype=bool) {
+              if @tir.likely((threadIdx.x_2 < 32), dtype=bool) {
+                kernel.shared[((threadIdx.x_2*4) + 4482)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 4482), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 18), 144))]
+              }
+            }
+            if @tir.likely(((threadIdx.x_2*4) < 125), dtype=bool) {
+              if @tir.likely((threadIdx.x_2 < 32), dtype=bool) {
+                kernel.shared[((threadIdx.x_2*4) + 4483)] = (float32*)kernel_2[((((blockIdx.x*147456) + (floordiv(((threadIdx.x_2*4) + 4483), 144)*4608)) + (rc.outer.outer*144)) + floormod(((threadIdx.x_2*4) + 19), 144))]
               }
             }
           }
+          for (rc.outer.inner: int32, 0, 4) {
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9))]*(float32*)kernel.shared[((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36))]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9))]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2304)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 1)]*(float32*)kernel.shared[((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36))]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 1)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2304)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 2)]*(float32*)kernel.shared[((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36))]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 2)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2304)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 3)]*(float32*)kernel.shared[((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36))]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 3)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2304)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 4)]*(float32*)kernel.shared[((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36))]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 4)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2304)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 5)]*(float32*)kernel.shared[((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36))]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 5)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2304)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 6)]*(float32*)kernel.shared[((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36))]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 6)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2304)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 1)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 1)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 1)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2305)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 2)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 1)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 2)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2305)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 3)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 1)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 3)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2305)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 4)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 1)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 4)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2305)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 5)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 1)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 5)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2305)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 6)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 1)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 6)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2305)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 7)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 1)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 7)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2305)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 2)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 2)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2306)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 3)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 3)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2306)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 4)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 4)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2306)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 5)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 5)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2306)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 6)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 6)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2306)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 7)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 7)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2306)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 8)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 8)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2306)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 81)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 9)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 81)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2313)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 82)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 9)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 82)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2313)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 83)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 9)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 83)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2313)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 84)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 9)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 84)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2313)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 85)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 9)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 85)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2313)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 86)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 9)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 86)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2313)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 87)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 9)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 87)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2313)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 82)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 10)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 82)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2314)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 83)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 10)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 83)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2314)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 84)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 10)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 84)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2314)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 85)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 10)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 85)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2314)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 86)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 10)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 86)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2314)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 87)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 10)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 87)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2314)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 88)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 10)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 88)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2314)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 83)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 11)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 83)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2315)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 84)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 11)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 84)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2315)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 85)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 11)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 85)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2315)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 86)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 11)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 86)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2315)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 87)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 11)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 87)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2315)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 88)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 11)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 88)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2315)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 89)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 11)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 89)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2315)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 162)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 18)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 162)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2322)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 163)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 18)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 163)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2322)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 164)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 18)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 164)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2322)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 165)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 18)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 165)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2322)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 166)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 18)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 166)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2322)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 167)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 18)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 167)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2322)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 168)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 18)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 168)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2322)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 163)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 19)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 163)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2323)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 164)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 19)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 164)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2323)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 165)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 19)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 165)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2323)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 166)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 19)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 166)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2323)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 167)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 19)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 167)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2323)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 168)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 19)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 168)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2323)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 169)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 19)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 169)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2323)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 164)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 20)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 164)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2324)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 165)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 20)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 165)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2324)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 166)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 20)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 166)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2324)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 167)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 20)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 167)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2324)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 168)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 20)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 168)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2324)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 169)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 20)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 169)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2324)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 170)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 20)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 170)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2324)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 243)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 27)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 243)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2331)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 244)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 27)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 244)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2331)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 245)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 27)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 245)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2331)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 246)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 27)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 246)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2331)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 247)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 27)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 247)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2331)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 248)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 27)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 248)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2331)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 249)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 27)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 249)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2331)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 244)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 28)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 244)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2332)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 245)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 28)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 245)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2332)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 246)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 28)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 246)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2332)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 247)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 28)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 247)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2332)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 248)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 28)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 248)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2332)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 249)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 28)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 249)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2332)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 250)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 28)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 250)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2332)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 245)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 29)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 245)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2333)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 246)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 29)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 246)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2333)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 247)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 29)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 247)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2333)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 248)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 29)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 248)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2333)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 249)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 29)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 249)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2333)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 250)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 29)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 250)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2333)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 251)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 29)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 251)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2333)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 9)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 3)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 9)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2307)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 10)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 3)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 10)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2307)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 11)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 3)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 11)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2307)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 12)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 3)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 12)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2307)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 13)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 3)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 13)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2307)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 14)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 3)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 14)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2307)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 15)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 3)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 15)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2307)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 10)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 4)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 10)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2308)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 11)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 4)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 11)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2308)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 12)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 4)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 12)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2308)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 13)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 4)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 13)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2308)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 14)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 4)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 14)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2308)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 15)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 4)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 15)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2308)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 16)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 4)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 16)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2308)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 11)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 5)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 11)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2309)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 12)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 5)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 12)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2309)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 13)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 5)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 13)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2309)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 14)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 5)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 14)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2309)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 15)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 5)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 15)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2309)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 16)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 5)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 16)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2309)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 17)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 5)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 17)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2309)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 90)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 12)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 90)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2316)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 91)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 12)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 91)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2316)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 92)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 12)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 92)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2316)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 93)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 12)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 93)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2316)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 94)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 12)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 94)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2316)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 95)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 12)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 95)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2316)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 96)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 12)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 96)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2316)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 91)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 13)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 91)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2317)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 92)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 13)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 92)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2317)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 93)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 13)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 93)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2317)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 94)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 13)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 94)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2317)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 95)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 13)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 95)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2317)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 96)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 13)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 96)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2317)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 97)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 13)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 97)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2317)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 92)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 14)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 92)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2318)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 93)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 14)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 93)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2318)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 94)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 14)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 94)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2318)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 95)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 14)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 95)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2318)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 96)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 14)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 96)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2318)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 97)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 14)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 97)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2318)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 98)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 14)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 98)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2318)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 171)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 21)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 171)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2325)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 172)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 21)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 172)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2325)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 173)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 21)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 173)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2325)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 174)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 21)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 174)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2325)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 175)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 21)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 175)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2325)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 176)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 21)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 176)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2325)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 177)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 21)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 177)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2325)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 172)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 22)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 172)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2326)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 173)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 22)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 173)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2326)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 174)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 22)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 174)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2326)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 175)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 22)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 175)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2326)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 176)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 22)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 176)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2326)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 177)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 22)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 177)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2326)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 178)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 22)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 178)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2326)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 173)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 23)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 173)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2327)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 174)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 23)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 174)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2327)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 175)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 23)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 175)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2327)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 176)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 23)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 176)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2327)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 177)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 23)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 177)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2327)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 178)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 23)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 178)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2327)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 179)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 23)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 179)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2327)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 252)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 30)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 252)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2334)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 253)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 30)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 253)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2334)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 254)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 30)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 254)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2334)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 255)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 30)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 255)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2334)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 256)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 30)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 256)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2334)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 257)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 30)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 257)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2334)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 258)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 30)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 258)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2334)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 253)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 31)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 253)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2335)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 254)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 31)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 254)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2335)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 255)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 31)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 255)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2335)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 256)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 31)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 256)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2335)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 257)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 31)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 257)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2335)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 258)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 31)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 258)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2335)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 259)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 31)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 259)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2335)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 254)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 32)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 254)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2336)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 255)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 32)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 255)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2336)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 256)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 32)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 256)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2336)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 257)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 32)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 257)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2336)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 258)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 32)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 258)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2336)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 259)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 32)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 259)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2336)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 260)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 32)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 260)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2336)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 18)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 6)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 18)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2310)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 19)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 6)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 19)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2310)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 20)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 6)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 20)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2310)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 21)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 6)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 21)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2310)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 22)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 6)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 22)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2310)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 23)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 6)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 23)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2310)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 24)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 6)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 24)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2310)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 19)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 7)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 19)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2311)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 20)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 7)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 20)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2311)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 21)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 7)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 21)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2311)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 22)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 7)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 22)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2311)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 23)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 7)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 23)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2311)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 24)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 7)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 24)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2311)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 25)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 7)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 25)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2311)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 20)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 8)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 20)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2312)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 21)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 8)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 21)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2312)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 22)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 8)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 22)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2312)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 23)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 8)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 23)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2312)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 24)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 8)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 24)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2312)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 25)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 8)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 25)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2312)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 26)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 8)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 26)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2312)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 99)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 15)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 99)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2319)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 100)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 15)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 100)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2319)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 101)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 15)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 101)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2319)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 102)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 15)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 102)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2319)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 103)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 15)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 103)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2319)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 104)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 15)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 104)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2319)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 105)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 15)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 105)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2319)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 100)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 16)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 100)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2320)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 101)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 16)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 101)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2320)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 102)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 16)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 102)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2320)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 103)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 16)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 103)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2320)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 104)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 16)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 104)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2320)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 105)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 16)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 105)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2320)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 106)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 16)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 106)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2320)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 101)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 17)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 101)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2321)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 102)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 17)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 102)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2321)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 103)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 17)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 103)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2321)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 104)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 17)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 104)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2321)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 105)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 17)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 105)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2321)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 106)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 17)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 106)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2321)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 107)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 17)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 107)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2321)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 180)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 24)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 180)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2328)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 181)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 24)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 181)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2328)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 182)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 24)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 182)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2328)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 183)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 24)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 183)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2328)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 184)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 24)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 184)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2328)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 185)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 24)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 185)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2328)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 186)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 24)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 186)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2328)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 181)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 25)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 181)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2329)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 182)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 25)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 182)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2329)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 183)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 25)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 183)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2329)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 184)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 25)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 184)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2329)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 185)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 25)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 185)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2329)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 186)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 25)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 186)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2329)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 187)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 25)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 187)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2329)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 182)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 26)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 182)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2330)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 183)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 26)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 183)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2330)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 184)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 26)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 184)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2330)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 185)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 26)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 185)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2330)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 186)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 26)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 186)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2330)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 187)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 26)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 187)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2330)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 188)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 26)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 188)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2330)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 261)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 33)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 261)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2337)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 262)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 33)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 262)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2337)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 263)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 33)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 263)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2337)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 264)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 33)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 264)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2337)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 265)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 33)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 265)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2337)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 266)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 33)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 266)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2337)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 267)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 33)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 267)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2337)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 262)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 34)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 262)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2338)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 263)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 34)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 263)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2338)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 264)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 34)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 264)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2338)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 265)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 34)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 265)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2338)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 266)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 34)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 266)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2338)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 267)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 34)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 267)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2338)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 268)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 34)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 268)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2338)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 263)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 35)]))
+            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 263)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2339)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 264)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 35)]))
+            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 264)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2339)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 265)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 35)]))
+            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 265)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2339)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 266)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 35)]))
+            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 266)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2339)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 267)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 35)]))
+            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 267)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2339)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 268)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 35)]))
+            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 268)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2339)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 269)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 35)]))
+            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[(((rc.outer.inner*324) + (floormod(threadIdx.x, 7)*9)) + 269)]*(float32*)kernel.shared[(((floordiv(threadIdx.x, 7)*144) + (rc.outer.inner*36)) + 2339)]))
+          }
+        }
+        for (i3.inner: int32, 0, 7) {
+          compute_2[(((blockIdx.x*1568) + (threadIdx.x*7)) + i3.inner)] = max(((float32*)compute_3[i3.inner] + (float32*)bias_2[((blockIdx.x*32) + floordiv(threadIdx.x, 7))]), 0f32)
+          compute_2[((((blockIdx.x*1568) + (threadIdx.x*7)) + i3.inner) + 784)] = max(((float32*)compute_3[(i3.inner + 7)] + (float32*)bias_2[(((blockIdx.x*32) + floordiv(threadIdx.x, 7)) + 16)]), 0f32)
         }
-        compute_2[(((floordiv(blockIdx.x, 7)*784) + (threadIdx.x*49)) + (floormod(blockIdx.x, 7)*7))] = max(((float32*)compute_3[0] + (float32*)bias_2[((floordiv(blockIdx.x, 7)*16) + threadIdx.x)]), 0f32)
-        compute_2[((((floordiv(blockIdx.x, 7)*784) + (threadIdx.x*49)) + (floormod(blockIdx.x, 7)*7)) + 1)] = max(((float32*)compute_3[1] + (float32*)bias_2[((floordiv(blockIdx.x, 7)*16) + threadIdx.x)]), 0f32)
-        compute_2[((((floordiv(blockIdx.x, 7)*784) + (threadIdx.x*49)) + (floormod(blockIdx.x, 7)*7)) + 2)] = max(((float32*)compute_3[2] + (float32*)bias_2[((floordiv(blockIdx.x, 7)*16) + threadIdx.x)]), 0f32)
-        compute_2[((((floordiv(blockIdx.x, 7)*784) + (threadIdx.x*49)) + (floormod(blockIdx.x, 7)*7)) + 3)] = max(((float32*)compute_3[3] + (float32*)bias_2[((floordiv(blockIdx.x, 7)*16) + threadIdx.x)]), 0f32)
-        compute_2[((((floordiv(blockIdx.x, 7)*784) + (threadIdx.x*49)) + (floormod(blockIdx.x, 7)*7)) + 4)] = max(((float32*)compute_3[4] + (float32*)bias_2[((floordiv(blockIdx.x, 7)*16) + threadIdx.x)]), 0f32)
-        compute_2[((((floordiv(blockIdx.x, 7)*784) + (threadIdx.x*49)) + (floormod(blockIdx.x, 7)*7)) + 5)] = max(((float32*)compute_3[5] + (float32*)bias_2[((floordiv(blockIdx.x, 7)*16) + threadIdx.x)]), 0f32)
-        compute_2[((((floordiv(blockIdx.x, 7)*784) + (threadIdx.x*49)) + (floormod(blockIdx.x, 7)*7)) + 6)] = max(((float32*)compute_3[6] + (float32*)bias_2[((floordiv(blockIdx.x, 7)*16) + threadIdx.x)]), 0f32)
       }
     }
 
@@ -344,7 +911,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 0.364 ms
+    Execution time of this operator: 0.184 ms
 
 
 
@@ -391,34 +958,34 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     compute_ff_o_i, compute_ff_i = s[compute].split(compute_ff, factor=1)
     compute_ff_o_o_i, compute_ff_o_i = s[compute].split(compute_ff_o_i, factor=1)
     compute_ff_o_o_o_i, compute_ff_o_o_i = s[compute].split(compute_ff_o_o_i, factor=16)
-    compute_ff_o_o_o_o, compute_ff_o_o_o_i = s[compute].split(compute_ff_o_o_o_i, factor=1)
+    compute_ff_o_o_o_o, compute_ff_o_o_o_i = s[compute].split(compute_ff_o_o_o_i, factor=2)
     compute_yy_o_i, compute_yy_i = s[compute].split(compute_yy, factor=1)
     compute_yy_o_o_i, compute_yy_o_i = s[compute].split(compute_yy_o_i, factor=1)
-    compute_yy_o_o_o_i, compute_yy_o_o_i = s[compute].split(compute_yy_o_o_i, factor=1)
+    compute_yy_o_o_o_i, compute_yy_o_o_i = s[compute].split(compute_yy_o_o_i, factor=7)
     compute_yy_o_o_o_o, compute_yy_o_o_o_i = s[compute].split(compute_yy_o_o_o_i, factor=1)
-    compute_xx_o_i, compute_xx_i = s[compute].split(compute_xx, factor=1)
+    compute_xx_o_i, compute_xx_i = s[compute].split(compute_xx, factor=7)
     compute_xx_o_o_i, compute_xx_o_i = s[compute].split(compute_xx_o_i, factor=1)
     compute_xx_o_o_o_i, compute_xx_o_o_i = s[compute].split(compute_xx_o_o_i, factor=1)
-    compute_xx_o_o_o_o, compute_xx_o_o_o_i = s[compute].split(compute_xx_o_o_o_i, factor=7)
+    compute_xx_o_o_o_o, compute_xx_o_o_o_i = s[compute].split(compute_xx_o_o_o_i, factor=1)
     compute_rc_o_i, compute_rc_i = s[compute].split(compute_rc, factor=4)
-    compute_rc_o_o, compute_rc_o_i = s[compute].split(compute_rc_o_i, factor=2)
-    compute_ry_o_i, compute_ry_i = s[compute].split(compute_ry, factor=3)
-    compute_ry_o_o, compute_ry_o_i = s[compute].split(compute_ry_o_i, factor=1)
-    compute_rx_o_i, compute_rx_i = s[compute].split(compute_rx, factor=1)
-    compute_rx_o_o, compute_rx_o_i = s[compute].split(compute_rx_o_i, factor=3)
+    compute_rc_o_o, compute_rc_o_i = s[compute].split(compute_rc_o_i, factor=4)
+    compute_ry_o_i, compute_ry_i = s[compute].split(compute_ry, factor=1)
+    compute_ry_o_o, compute_ry_o_i = s[compute].split(compute_ry_o_i, factor=3)
+    compute_rx_o_i, compute_rx_i = s[compute].split(compute_rx, factor=3)
+    compute_rx_o_o, compute_rx_o_i = s[compute].split(compute_rx_o_i, factor=1)
     s[compute].reorder(compute_nn_o_o_o_o, compute_ff_o_o_o_o, compute_yy_o_o_o_o, compute_xx_o_o_o_o, compute_nn_o_o_o_i, compute_ff_o_o_o_i, compute_yy_o_o_o_i, compute_xx_o_o_o_i, compute_nn_o_o_i, compute_ff_o_o_i, compute_yy_o_o_i, compute_xx_o_o_i, compute_rc_o_o, compute_ry_o_o, compute_rx_o_o, compute_rc_o_i, compute_ry_o_i, compute_rx_o_i, compute_nn_o_i, compute_ff_o_i, compute_yy_o_i, compute_xx_o_i, compute_rc_i, compute_ry_i, compute_rx_i, compute_nn_i, compute_ff_i, compute [...]
     compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
     compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
     compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
     compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=1)
     compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=16)
-    compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
+    compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=2)
     compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
-    compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
+    compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7)
     compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
-    compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
+    compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
     compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
-    compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=7)
+    compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
     s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
     s[compute].compute_at(s[compute], compute_i3_o_i)
     kernel_shared = s.cache_read(kernel, "shared", [compute])
@@ -435,75 +1002,631 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused = s[compute].fuse(compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i)
     s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread_axis("threadIdx.x"))
     kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
-    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
+    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
     s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=16)
+    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=112)
     s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
     pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
     pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
     s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=16)
+    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=112)
     s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
-    s[compute].pragma(compute_nn_o_o_o_o, "auto_unroll_max_step", 16)
+    s[compute].pragma(compute_nn_o_o_o_o, "auto_unroll_max_step", 1024)
     s[compute].pragma(compute_nn_o_o_o_o, "unroll_explicit", True)
 
     CUDA source code:
+
+    #ifdef _WIN32
+      using uint = unsigned int;
+      using uchar = unsigned char;
+      using ushort = unsigned short;
+      using int64_t = long long;
+      using uint64_t = unsigned long long;
+    #else
+      #define uint unsigned int
+      #define uchar unsigned char
+      #define ushort unsigned short
+      #define int64_t long
+      #define uint64_t ulong
+    #endif
     extern "C" __global__ void default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
-      float compute1[7];
-      __shared__ float pad_temp_shared[216];
-      __shared__ float kernel_shared[1152];
+      float compute1[14];
+      __shared__ float pad_temp_shared[1296];
+      __shared__ float kernel_shared[4608];
       compute1[(0)] = 0.000000e+00f;
+      compute1[(7)] = 0.000000e+00f;
       compute1[(1)] = 0.000000e+00f;
+      compute1[(8)] = 0.000000e+00f;
       compute1[(2)] = 0.000000e+00f;
+      compute1[(9)] = 0.000000e+00f;
       compute1[(3)] = 0.000000e+00f;
+      compute1[(10)] = 0.000000e+00f;
       compute1[(4)] = 0.000000e+00f;
+      compute1[(11)] = 0.000000e+00f;
       compute1[(5)] = 0.000000e+00f;
+      compute1[(12)] = 0.000000e+00f;
       compute1[(6)] = 0.000000e+00f;
-      for (int rc_outer_outer = 0; rc_outer_outer < 64; ++rc_outer_outer) {
+      compute1[(13)] = 0.000000e+00f;
+      for (int rc_outer_outer = 0; rc_outer_outer < 32; ++rc_outer_outer) {
         __syncthreads();
-        pad_temp_shared[(((int)threadIdx.x))] = ((((1 <= ((((int)threadIdx.x) / 9) + (((int)blockIdx.x) % 7))) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 9) * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((int)threadIdx.x) % 9)) - 8))] : 0.000000e+00f);
-        pad_temp_shared[((((int)threadIdx.x) + 16))] = (((((1 <= ((((((int)threadIdx.x) + 16) % 27) / 9) + (((int)blockIdx.x) % 7))) && (((((((int)threadIdx.x) + 16) % 27) / 9) + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[(((((((rc_outer_outer * 392) + (((((int)threadIdx.x) + 16) / 27) * 49)) + ((((((int)threadIdx.x) + 16) % 27) / 9) * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)) [...]
-        pad_temp_shared[((((int)threadIdx.x) + 32))] = (((((1 <= (((((int)threadIdx.x) + 5) / 9) + (((int)blockIdx.x) % 7))) && ((((((int)threadIdx.x) + 5) / 9) + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 5) % 9))) && (((((int)threadIdx.x) + 5) % 9) < 8)) ? data[(((((((rc_outer_outer * 392) + (((((int)threadIdx.x) + 32) / 27) * 49)) + (((((int)threadIdx.x) + 5) / 9) * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8))] : 0.000000e+00f);
-        pad_temp_shared[((((int)threadIdx.x) + 48))] = (((((1 <= ((((((int)threadIdx.x) + 21) % 27) / 9) + (((int)blockIdx.x) % 7))) && (((((((int)threadIdx.x) + 21) % 27) / 9) + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 3) % 9))) && (((((int)threadIdx.x) + 3) % 9) < 8)) ? data[(((((((rc_outer_outer * 392) + (((((int)threadIdx.x) + 48) / 27) * 49)) + ((((((int)threadIdx.x) + 21) % 27) / 9) * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 3) % 9)) - 8)) [...]
-        pad_temp_shared[((((int)threadIdx.x) + 64))] = (((((((((int)threadIdx.x) + 10) / 9) + (((int)blockIdx.x) % 7)) < 8) && (1 <= ((((int)threadIdx.x) + 1) % 9))) && (((((int)threadIdx.x) + 1) % 9) < 8)) ? data[(((((((rc_outer_outer * 392) + (((((int)threadIdx.x) + 64) / 27) * 49)) + (((((int)threadIdx.x) + 10) / 9) * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8))] : 0.000000e+00f);
-        pad_temp_shared[((((int)threadIdx.x) + 80))] = (((((1 <= ((((((int)threadIdx.x) + 26) % 27) / 9) + (((int)blockIdx.x) % 7))) && (((((((int)threadIdx.x) + 26) % 27) / 9) + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[(((((((rc_outer_outer * 392) + (((((int)threadIdx.x) + 80) / 27) * 49)) + ((((((int)threadIdx.x) + 26) % 27) / 9) * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)) [...]
-        pad_temp_shared[((((int)threadIdx.x) + 96))] = (((((1 <= ((((((int)threadIdx.x) + 15) % 27) / 9) + (((int)blockIdx.x) % 7))) && (((((((int)threadIdx.x) + 15) % 27) / 9) + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[(((((((rc_outer_outer * 392) + (((((int)threadIdx.x) + 96) / 27) * 49)) + ((((((int)threadIdx.x) + 15) % 27) / 9) * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)) [...]
-        pad_temp_shared[((((int)threadIdx.x) + 112))] = (((((1 <= (((((int)threadIdx.x) + 4) / 9) + (((int)blockIdx.x) % 7))) && ((((((int)threadIdx.x) + 4) / 9) + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[(((((((rc_outer_outer * 392) + (((((int)threadIdx.x) + 112) / 27) * 49)) + (((((int)threadIdx.x) + 4) / 9) * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8))] : 0.000000e+00f);
-        pad_temp_shared[((((int)threadIdx.x) + 128))] = (((((1 <= ((((((int)threadIdx.x) + 20) % 27) / 9) + (((int)blockIdx.x) % 7))) && (((((((int)threadIdx.x) + 20) % 27) / 9) + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 2) % 9))) && (((((int)threadIdx.x) + 2) % 9) < 8)) ? data[(((((((rc_outer_outer * 392) + (((((int)threadIdx.x) + 128) / 27) * 49)) + ((((((int)threadIdx.x) + 20) % 27) / 9) * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8 [...]
-        pad_temp_shared[((((int)threadIdx.x) + 144))] = (((((((((int)threadIdx.x) + 9) / 9) + (((int)blockIdx.x) % 7)) < 8) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((((rc_outer_outer * 392) + (((((int)threadIdx.x) + 144) / 27) * 49)) + (((((int)threadIdx.x) + 9) / 9) * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((int)threadIdx.x) % 9)) - 8))] : 0.000000e+00f);
-        pad_temp_shared[((((int)threadIdx.x) + 160))] = (((((1 <= ((((((int)threadIdx.x) + 25) % 27) / 9) + (((int)blockIdx.x) % 7))) && (((((((int)threadIdx.x) + 25) % 27) / 9) + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[(((((((rc_outer_outer * 392) + (((((int)threadIdx.x) + 160) / 27) * 49)) + ((((((int)threadIdx.x) + 25) % 27) / 9) * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8 [...]
-        pad_temp_shared[((((int)threadIdx.x) + 176))] = (((((1 <= ((((((int)threadIdx.x) + 14) % 27) / 9) + (((int)blockIdx.x) % 7))) && (((((((int)threadIdx.x) + 14) % 27) / 9) + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 5) % 9))) && (((((int)threadIdx.x) + 5) % 9) < 8)) ? data[(((((((rc_outer_outer * 392) + (((((int)threadIdx.x) + 176) / 27) * 49)) + ((((((int)threadIdx.x) + 14) % 27) / 9) * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8 [...]
-        pad_temp_shared[((((int)threadIdx.x) + 192))] = (((((1 <= (((((int)threadIdx.x) + 3) / 9) + (((int)blockIdx.x) % 7))) && ((((((int)threadIdx.x) + 3) / 9) + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) + 3) % 9))) && (((((int)threadIdx.x) + 3) % 9) < 8)) ? data[(((((((rc_outer_outer * 392) + (((((int)threadIdx.x) + 192) / 27) * 49)) + (((((int)threadIdx.x) + 3) / 9) * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) + 3) % 9)) - 8))] : 0.000000e+00f);
-        if (((int)threadIdx.x) < 8) {
-          pad_temp_shared[((((int)threadIdx.x) + 208))] = ((((((((int)threadIdx.x) + 19) / 9) + (((int)blockIdx.x) % 7)) < 8) && (((int)threadIdx.x) < 7)) ? data[(((((((rc_outer_outer * 392) + (((((int)threadIdx.x) + 208) / 27) * 49)) + (((((int)threadIdx.x) + 19) / 9) * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((int)threadIdx.x) + 1)) - 8))] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x))] = (((((9 <= (((int)threadIdx.x) % 81)) && ((((int)threadIdx.x) % 81) < 72)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + ((((int)threadIdx.x) / 81) * 49)) + (((((int)threadIdx.x) % 81) / 9) * 7)) + (((int)threadIdx.x) % 9)) - 8))] : 0.000000e+00f);
+        pad_temp_shared[((((int)threadIdx.x) + 112))] = (((((9 <= ((((int)threadIdx.x) + 31) % 81)) && (((((int)threadIdx.x) + 31) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 112) / 81) * 49)) + ((((((int)threadIdx.x) + 31) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8))] : 0.000000e+00f);
+        pad_temp_shared[((((int)threadIdx.x) + 224))] = (((((9 <= ((((int)threadIdx.x) + 62) % 81)) && (((((int)threadIdx.x) + 62) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 224) / 81) * 49)) + ((((((int)threadIdx.x) + 62) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8))] : 0.000000e+00f);
+        pad_temp_shared[((((int)threadIdx.x) + 336))] = (((((9 <= ((((int)threadIdx.x) + 12) % 81)) && (((((int)threadIdx.x) + 12) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 3) % 9))) && (((((int)threadIdx.x) + 3) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 336) / 81) * 49)) + ((((((int)threadIdx.x) + 12) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 3) % 9)) - 8))] : 0.000000e+00f);
+        pad_temp_shared[((((int)threadIdx.x) + 448))] = (((((9 <= ((((int)threadIdx.x) + 43) % 81)) && (((((int)threadIdx.x) + 43) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 448) / 81) * 49)) + ((((((int)threadIdx.x) + 43) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8))] : 0.000000e+00f);
+        pad_temp_shared[((((int)threadIdx.x) + 560))] = (((((9 <= ((((int)threadIdx.x) + 74) % 81)) && (((((int)threadIdx.x) + 74) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 2) % 9))) && (((((int)threadIdx.x) + 2) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 560) / 81) * 49)) + ((((((int)threadIdx.x) + 74) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8))] : 0.000000e+00f);
+        pad_temp_shared[((((int)threadIdx.x) + 672))] = (((((9 <= ((((int)threadIdx.x) + 24) % 81)) && (((((int)threadIdx.x) + 24) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 672) / 81) * 49)) + ((((((int)threadIdx.x) + 24) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8))] : 0.000000e+00f);
+        pad_temp_shared[((((int)threadIdx.x) + 784))] = (((((9 <= ((((int)threadIdx.x) + 55) % 81)) && (((((int)threadIdx.x) + 55) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 1) % 9))) && (((((int)threadIdx.x) + 1) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 784) / 81) * 49)) + ((((((int)threadIdx.x) + 55) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8))] : 0.000000e+00f);
+        pad_temp_shared[((((int)threadIdx.x) + 896))] = (((((9 <= ((((int)threadIdx.x) + 5) % 81)) && (((((int)threadIdx.x) + 5) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 5) % 9))) && (((((int)threadIdx.x) + 5) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 896) / 81) * 49)) + ((((((int)threadIdx.x) + 5) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8))] : 0.000000e+00f);
+        pad_temp_shared[((((int)threadIdx.x) + 1008))] = (((((9 <= ((((int)threadIdx.x) + 36) % 81)) && (((((int)threadIdx.x) + 36) % 81) < 72)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1008) / 81) * 49)) + ((((((int)threadIdx.x) + 36) % 81) / 9) * 7)) + (((int)threadIdx.x) % 9)) - 8))] : 0.000000e+00f);
+        pad_temp_shared[((((int)threadIdx.x) + 1120))] = (((((9 <= ((((int)threadIdx.x) + 67) % 81)) && (((((int)threadIdx.x) + 67) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1120) / 81) * 49)) + ((((((int)threadIdx.x) + 67) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8))] : 0.000000e+00f);
+        if (((int)threadIdx.x) < 64) {
+          pad_temp_shared[((((int)threadIdx.x) + 1232))] = ((((((int)threadIdx.x) < 55) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1232) / 81) * 49)) + (((((int)threadIdx.x) + 17) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8))] : 0.000000e+00f);
+        }
+        kernel_shared[((((int)threadIdx.x) * 4))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 36) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 36) * 4)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 1))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 1) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 1) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 2))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 2) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 2) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 3))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 3) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 3) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 448))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 448) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 16) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 449))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 449) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 17) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 450))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 450) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 18) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 451))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 451) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 19) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 896))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 896) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 32) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 897))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 897) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 33) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 898))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 898) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 34) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 899))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 899) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 35) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 1344))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 1344) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 48) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 1345))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 1345) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 49) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 1346))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 1346) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 50) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 1347))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 1347) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 51) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 1792))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 1792) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 64) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 1793))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 1793) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 65) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 1794))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 1794) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 66) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 1795))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 1795) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 67) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 2240))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 2240) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 80) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 2241))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 2241) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 81) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 2242))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 2242) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 82) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 2243))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 2243) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 83) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 2688))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 2688) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 96) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 2689))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 2689) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 97) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 2690))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 2690) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 98) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 2691))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 2691) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 99) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 3136))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 3136) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 112) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 3137))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 3137) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 113) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 3138))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 3138) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 114) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 3139))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 3139) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 115) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 3584))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 3584) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 128) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 3585))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 3585) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 129) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 3586))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 3586) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 130) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 3587))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 3587) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 131) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 4032))] = kernel[((((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 36) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 36) * 4)) + 129024))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 4033))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 4033) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 1) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 4034))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 4034) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 2) % 144)))];
+        kernel_shared[(((((int)threadIdx.x) * 4) + 4035))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 4035) / 144) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) * 4) + 3) % 144)))];
+        if (((int)threadIdx.x) < 32) {
+          kernel_shared[(((((int)threadIdx.x) * 4) + 4480))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 4480) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) * 4) + 16)))];
         }
-        for (int ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = 0; ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer < 72; ++ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer) {
-          kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 16) + ((int)threadIdx.x)))] = kernel[((((((((int)blockIdx.x) / 7) * 73728) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 16) + ((int)threadIdx.x)) / 72) * 4608)) + (rc_outer_outer * 72)) + (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 16) + ((int)threadIdx.x)) % 72)))];
+        if (((int)threadIdx.x) < 32) {
+          kernel_shared[(((((int)threadIdx.x) * 4) + 4481))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 4481) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) * 4) + 17)))];
+        }
+        if (((int)threadIdx.x) < 32) {
+          kernel_shared[(((((int)threadIdx.x) * 4) + 4482))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 4482) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) * 4) + 18)))];
+        }
+        if (((int)threadIdx.x) < 32) {
+          kernel_shared[(((((int)threadIdx.x) * 4) + 4483))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((((int)threadIdx.x) * 4) + 4483) / 144) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) * 4) + 19)))];
         }
         __syncthreads();
-        for (int rc_outer_inner = 0; rc_outer_inner < 2; ++rc_outer_inner) {
-          for (int rx_outer_inner = 0; rx_outer_inner < 3; ++rx_outer_inner) {
-            for (int rc_inner = 0; rc_inner < 4; ++rc_inner) {
-              for (int ry_inner = 0; ry_inner < 3; ++ry_inner) {
-                compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(((((rc_outer_inner * 108) + (rc_inner * 27)) + (ry_inner * 9)) + rx_outer_inner))] * kernel_shared[((((((((int)threadIdx.x) * 72) + (rc_outer_inner * 36)) + (rc_inner * 9)) + (ry_inner * 3)) + rx_outer_inner))]));
-                compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((((rc_outer_inner * 108) + (rc_inner * 27)) + (ry_inner * 9)) + rx_outer_inner) + 1))] * kernel_shared[((((((((int)threadIdx.x) * 72) + (rc_outer_inner * 36)) + (rc_inner * 9)) + (ry_inner * 3)) + rx_outer_inner))]));
-                compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((((rc_outer_inner * 108) + (rc_inner * 27)) + (ry_inner * 9)) + rx_outer_inner) + 2))] * kernel_shared[((((((((int)threadIdx.x) * 72) + (rc_outer_inner * 36)) + (rc_inner * 9)) + (ry_inner * 3)) + rx_outer_inner))]));
-                compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((((rc_outer_inner * 108) + (rc_inner * 27)) + (ry_inner * 9)) + rx_outer_inner) + 3))] * kernel_shared[((((((((int)threadIdx.x) * 72) + (rc_outer_inner * 36)) + (rc_inner * 9)) + (ry_inner * 3)) + rx_outer_inner))]));
-                compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((((rc_outer_inner * 108) + (rc_inner * 27)) + (ry_inner * 9)) + rx_outer_inner) + 4))] * kernel_shared[((((((((int)threadIdx.x) * 72) + (rc_outer_inner * 36)) + (rc_inner * 9)) + (ry_inner * 3)) + rx_outer_inner))]));
-                compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((((rc_outer_inner * 108) + (rc_inner * 27)) + (ry_inner * 9)) + rx_outer_inner) + 5))] * kernel_shared[((((((((int)threadIdx.x) * 72) + (rc_outer_inner * 36)) + (rc_inner * 9)) + (ry_inner * 3)) + rx_outer_inner))]));
-                compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((((rc_outer_inner * 108) + (rc_inner * 27)) + (ry_inner * 9)) + rx_outer_inner) + 6))] * kernel_shared[((((((((int)threadIdx.x) * 72) + (rc_outer_inner * 36)) + (rc_inner * 9)) + (ry_inner * 3)) + rx_outer_inner))]));
-              }
-            }
-          }
+        for (int rc_outer_inner = 0; rc_outer_inner < 4; ++rc_outer_inner) {
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)))] * kernel_shared[((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2304))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 1))] * kernel_shared[((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 1))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2304))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 2))] * kernel_shared[((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 2))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2304))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 3))] * kernel_shared[((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 3))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2304))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 4))] * kernel_shared[((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 4))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2304))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 5))] * kernel_shared[((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 5))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2304))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 6))] * kernel_shared[((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 6))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2304))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 1))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 1))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 1))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2305))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 2))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 1))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 2))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2305))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 3))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 1))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 3))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2305))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 4))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 1))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 4))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2305))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 5))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 1))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 5))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2305))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 6))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 1))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 6))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2305))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 7))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 1))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 7))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2305))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 2))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 2))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2306))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 3))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 3))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2306))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 4))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 4))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2306))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 5))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 5))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2306))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 6))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 6))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2306))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 7))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 7))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2306))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 8))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 8))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2306))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 81))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 9))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 81))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2313))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 82))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 9))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 82))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2313))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 83))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 9))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 83))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2313))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 84))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 9))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 84))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2313))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 85))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 9))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 85))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2313))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 86))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 9))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 86))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2313))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 87))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 9))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 87))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2313))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 82))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 10))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 82))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2314))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 83))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 10))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 83))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2314))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 84))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 10))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 84))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2314))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 85))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 10))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 85))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2314))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 86))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 10))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 86))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2314))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 87))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 10))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 87))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2314))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 88))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 10))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 88))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2314))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 83))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 11))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 83))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2315))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 84))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 11))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 84))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2315))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 85))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 11))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 85))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2315))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 86))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 11))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 86))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2315))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 87))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 11))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 87))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2315))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 88))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 11))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 88))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2315))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 89))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 11))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 89))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2315))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 162))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 18))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 162))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2322))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 163))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 18))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 163))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2322))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 164))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 18))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 164))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2322))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 165))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 18))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 165))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2322))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 166))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 18))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 166))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2322))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 167))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 18))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 167))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2322))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 168))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 18))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 168))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2322))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 163))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 19))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 163))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2323))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 164))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 19))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 164))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2323))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 165))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 19))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 165))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2323))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 166))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 19))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 166))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2323))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 167))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 19))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 167))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2323))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 168))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 19))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 168))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2323))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 169))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 19))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 169))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2323))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 164))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 20))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 164))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2324))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 165))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 20))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 165))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2324))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 166))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 20))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 166))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2324))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 167))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 20))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 167))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2324))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 168))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 20))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 168))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2324))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 169))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 20))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 169))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2324))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 170))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 20))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 170))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2324))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 243))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 27))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 243))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2331))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 244))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 27))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 244))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2331))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 245))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 27))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 245))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2331))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 246))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 27))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 246))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2331))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 247))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 27))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 247))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2331))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 248))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 27))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 248))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2331))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 249))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 27))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 249))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2331))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 244))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 28))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 244))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2332))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 245))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 28))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 245))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2332))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 246))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 28))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 246))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2332))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 247))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 28))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 247))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2332))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 248))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 28))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 248))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2332))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 249))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 28))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 249))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2332))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 250))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 28))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 250))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2332))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 245))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 29))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 245))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2333))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 246))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 29))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 246))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2333))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 247))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 29))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 247))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2333))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 248))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 29))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 248))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2333))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 249))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 29))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 249))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2333))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 250))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 29))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 250))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2333))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 251))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 29))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 251))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2333))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 9))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 3))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 9))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2307))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 10))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 3))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 10))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2307))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 11))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 3))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 11))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2307))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 12))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 3))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 12))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2307))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 13))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 3))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 13))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2307))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 14))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 3))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 14))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2307))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 15))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 3))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 15))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2307))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 10))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 4))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 10))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2308))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 11))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 4))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 11))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2308))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 12))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 4))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 12))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2308))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 13))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 4))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 13))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2308))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 14))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 4))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 14))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2308))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 15))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 4))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 15))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2308))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 16))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 4))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 16))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2308))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 11))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 5))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 11))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2309))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 12))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 5))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 12))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2309))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 13))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 5))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 13))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2309))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 14))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 5))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 14))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2309))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 15))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 5))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 15))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2309))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 16))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 5))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 16))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2309))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 17))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 5))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 17))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2309))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 90))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 12))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 90))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2316))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 91))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 12))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 91))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2316))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 92))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 12))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 92))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2316))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 93))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 12))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 93))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2316))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 94))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 12))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 94))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2316))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 95))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 12))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 95))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2316))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 96))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 12))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 96))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2316))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 91))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 13))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 91))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2317))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 92))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 13))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 92))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2317))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 93))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 13))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 93))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2317))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 94))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 13))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 94))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2317))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 95))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 13))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 95))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2317))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 96))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 13))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 96))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2317))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 97))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 13))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 97))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2317))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 92))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 14))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 92))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2318))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 93))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 14))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 93))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2318))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 94))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 14))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 94))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2318))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 95))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 14))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 95))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2318))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 96))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 14))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 96))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2318))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 97))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 14))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 97))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2318))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 98))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 14))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 98))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2318))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 171))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 21))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 171))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2325))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 172))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 21))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 172))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2325))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 173))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 21))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 173))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2325))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 174))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 21))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 174))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2325))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 175))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 21))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 175))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2325))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 176))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 21))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 176))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2325))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 177))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 21))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 177))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2325))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 172))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 22))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 172))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2326))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 173))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 22))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 173))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2326))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 174))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 22))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 174))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2326))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 175))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 22))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 175))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2326))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 176))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 22))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 176))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2326))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 177))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 22))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 177))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2326))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 178))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 22))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 178))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2326))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 173))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 23))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 173))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2327))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 174))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 23))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 174))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2327))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 175))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 23))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 175))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2327))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 176))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 23))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 176))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2327))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 177))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 23))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 177))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2327))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 178))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 23))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 178))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2327))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 179))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 23))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 179))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2327))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 252))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 30))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 252))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2334))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 253))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 30))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 253))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2334))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 254))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 30))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 254))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2334))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 255))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 30))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 255))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2334))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 256))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 30))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 256))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2334))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 257))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 30))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 257))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2334))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 258))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 30))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 258))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2334))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 253))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 31))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 253))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2335))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 254))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 31))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 254))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2335))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 255))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 31))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 255))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2335))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 256))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 31))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 256))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2335))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 257))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 31))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 257))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2335))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 258))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 31))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 258))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2335))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 259))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 31))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 259))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2335))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 254))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 32))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 254))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2336))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 255))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 32))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 255))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2336))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 256))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 32))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 256))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2336))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 257))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 32))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 257))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2336))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 258))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 32))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 258))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2336))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 259))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 32))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 259))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2336))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 260))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 32))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 260))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2336))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 18))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 6))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 18))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2310))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 19))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 6))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 19))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2310))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 20))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 6))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 20))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2310))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 21))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 6))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 21))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2310))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 22))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 6))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 22))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2310))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 23))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 6))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 23))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2310))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 24))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 6))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 24))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2310))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 19))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 7))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 19))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2311))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 20))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 7))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 20))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2311))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 21))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 7))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 21))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2311))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 22))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 7))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 22))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2311))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 23))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 7))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 23))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2311))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 24))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 7))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 24))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2311))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 25))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 7))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 25))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2311))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 20))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 8))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 20))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2312))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 21))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 8))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 21))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2312))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 22))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 8))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 22))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2312))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 23))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 8))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 23))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2312))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 24))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 8))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 24))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2312))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 25))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 8))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 25))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2312))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 26))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 8))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 26))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2312))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 99))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 15))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 99))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2319))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 100))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 15))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 100))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2319))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 101))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 15))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 101))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2319))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 102))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 15))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 102))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2319))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 103))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 15))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 103))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2319))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 104))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 15))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 104))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2319))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 105))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 15))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 105))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2319))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 100))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 16))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 100))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2320))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 101))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 16))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 101))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2320))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 102))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 16))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 102))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2320))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 103))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 16))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 103))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2320))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 104))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 16))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 104))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2320))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 105))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 16))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 105))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2320))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 106))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 16))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 106))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2320))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 101))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 17))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 101))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2321))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 102))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 17))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 102))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2321))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 103))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 17))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 103))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2321))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 104))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 17))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 104))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2321))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 105))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 17))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 105))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2321))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 106))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 17))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 106))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2321))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 107))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 17))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 107))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2321))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 180))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 24))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 180))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2328))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 181))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 24))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 181))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2328))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 182))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 24))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 182))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2328))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 183))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 24))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 183))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2328))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 184))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 24))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 184))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2328))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 185))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 24))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 185))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2328))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 186))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 24))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 186))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2328))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 181))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 25))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 181))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2329))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 182))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 25))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 182))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2329))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 183))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 25))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 183))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2329))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 184))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 25))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 184))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2329))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 185))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 25))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 185))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2329))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 186))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 25))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 186))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2329))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 187))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 25))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 187))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2329))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 182))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 26))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 182))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2330))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 183))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 26))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 183))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2330))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 184))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 26))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 184))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2330))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 185))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 26))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 185))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2330))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 186))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 26))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 186))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2330))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 187))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 26))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 187))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2330))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 188))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 26))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 188))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2330))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 261))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 33))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 261))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2337))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 262))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 33))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 262))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2337))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 263))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 33))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 263))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2337))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 264))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 33))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 264))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2337))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 265))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 33))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 265))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2337))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 266))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 33))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 266))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2337))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 267))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 33))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 267))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2337))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 262))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 34))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 262))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2338))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 263))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 34))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 263))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2338))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 264))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 34))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 264))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2338))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 265))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 34))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 265))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2338))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 266))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 34))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 266))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2338))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 267))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 34))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 267))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2338))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 268))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 34))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 268))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2338))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 263))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 35))]));
+          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 263))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2339))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 264))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 35))]));
+          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 264))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2339))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 265))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 35))]));
+          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 265))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2339))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 266))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 35))]));
+          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 266))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2339))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 267))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 35))]));
+          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 267))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2339))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 268))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 35))]));
+          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 268))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2339))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 269))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 35))]));
+          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[((((rc_outer_inner * 324) + ((((int)threadIdx.x) % 7) * 9)) + 269))] * kernel_shared[(((((((int)threadIdx.x) / 7) * 144) + (rc_outer_inner * 36)) + 2339))]));
         }
       }
-      compute[(((((((int)blockIdx.x) / 7) * 784) + (((int)threadIdx.x) * 49)) + ((((int)blockIdx.x) % 7) * 7)))] = max((compute1[(0)] + bias[((((((int)blockIdx.x) / 7) * 16) + ((int)threadIdx.x)))]), 0.000000e+00f);
-      compute[((((((((int)blockIdx.x) / 7) * 784) + (((int)threadIdx.x) * 49)) + ((((int)blockIdx.x) % 7) * 7)) + 1))] = max((compute1[(1)] + bias[((((((int)blockIdx.x) / 7) * 16) + ((int)threadIdx.x)))]), 0.000000e+00f);
-      compute[((((((((int)blockIdx.x) / 7) * 784) + (((int)threadIdx.x) * 49)) + ((((int)blockIdx.x) % 7) * 7)) + 2))] = max((compute1[(2)] + bias[((((((int)blockIdx.x) / 7) * 16) + ((int)threadIdx.x)))]), 0.000000e+00f);
-      compute[((((((((int)blockIdx.x) / 7) * 784) + (((int)threadIdx.x) * 49)) + ((((int)blockIdx.x) % 7) * 7)) + 3))] = max((compute1[(3)] + bias[((((((int)blockIdx.x) / 7) * 16) + ((int)threadIdx.x)))]), 0.000000e+00f);
-      compute[((((((((int)blockIdx.x) / 7) * 784) + (((int)threadIdx.x) * 49)) + ((((int)blockIdx.x) % 7) * 7)) + 4))] = max((compute1[(4)] + bias[((((((int)blockIdx.x) / 7) * 16) + ((int)threadIdx.x)))]), 0.000000e+00f);
-      compute[((((((((int)blockIdx.x) / 7) * 784) + (((int)threadIdx.x) * 49)) + ((((int)blockIdx.x) % 7) * 7)) + 5))] = max((compute1[(5)] + bias[((((((int)blockIdx.x) / 7) * 16) + ((int)threadIdx.x)))]), 0.000000e+00f);
-      compute[((((((((int)blockIdx.x) / 7) * 784) + (((int)threadIdx.x) * 49)) + ((((int)blockIdx.x) % 7) * 7)) + 6))] = max((compute1[(6)] + bias[((((((int)blockIdx.x) / 7) * 16) + ((int)threadIdx.x)))]), 0.000000e+00f);
+      for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
+        compute[((((((int)blockIdx.x) * 1568) + (((int)threadIdx.x) * 7)) + i3_inner))] = max((compute1[(i3_inner)] + bias[(((((int)blockIdx.x) * 32) + (((int)threadIdx.x) / 7)))]), 0.000000e+00f);
+        compute[(((((((int)blockIdx.x) * 1568) + (((int)threadIdx.x) * 7)) + i3_inner) + 784))] = max((compute1[((i3_inner + 7))] + bias[((((((int)blockIdx.x) * 32) + (((int)threadIdx.x) / 7)) + 16))]), 0.000000e+00f);
+      }
     }
 
 
@@ -559,7 +1682,7 @@ In the example below we resume the status and do more 5 trials.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  33.895 seconds)
+   **Total running time of the script:** ( 1 minutes  37.345 seconds)
 
 
 .. _sphx_glr_download_tutorials_auto_scheduler_tune_conv2d_layer_cuda.py:
diff --git a/docs/_sources/tutorials/auto_scheduler/tune_matmul_x86.rst.txt b/docs/_sources/tutorials/auto_scheduler/tune_matmul_x86.rst.txt
index 43b71e7..0e25cd2 100644
--- a/docs/_sources/tutorials/auto_scheduler/tune_matmul_x86.rst.txt
+++ b/docs/_sources/tutorials/auto_scheduler/tune_matmul_x86.rst.txt
@@ -198,8 +198,8 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
     primfn(A_1: handle, B_1: handle, C_1: handle, out_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
       buffers = {out: Buffer(out_2: Pointer(float32), float32, [1024, 1024], []),
-                 C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
                  B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], []),
+                 C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
                  A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], [])}
       buffer_map = {A_1: A, B_1: B, C_1: C, out_1: out} {
       attr [auto_scheduler_layout_transform: Pointer(float32)] "storage_scope" = "global";
@@ -283,7 +283,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 43.375 ms
+    Execution time of this operator: 91.229 ms
 
 
 
@@ -376,7 +376,7 @@ In the example below we resume the status and do more 5 trials.
  .. code-block:: none
 
     Resume search:
-
+    *T
 
 
 
diff --git a/docs/_sources/tutorials/auto_scheduler/tune_network_mali.rst.txt b/docs/_sources/tutorials/auto_scheduler/tune_network_arm.rst.txt
similarity index 67%
copy from docs/_sources/tutorials/auto_scheduler/tune_network_mali.rst.txt
copy to docs/_sources/tutorials/auto_scheduler/tune_network_arm.rst.txt
index 0ae1107..b8fcfa9 100644
--- a/docs/_sources/tutorials/auto_scheduler/tune_network_mali.rst.txt
+++ b/docs/_sources/tutorials/auto_scheduler/tune_network_arm.rst.txt
@@ -1,19 +1,19 @@
 .. note::
     :class: sphx-glr-download-link-note
 
-    Click :ref:`here <sphx_glr_download_tutorials_auto_scheduler_tune_network_mali.py>` to download the full example code
+    Click :ref:`here <sphx_glr_download_tutorials_auto_scheduler_tune_network_arm.py>` to download the full example code
 .. rst-class:: sphx-glr-example-title
 
-.. _sphx_glr_tutorials_auto_scheduler_tune_network_mali.py:
+.. _sphx_glr_tutorials_auto_scheduler_tune_network_arm.py:
 
 
-Auto-scheduling a Neural Network for mali GPU
+Auto-scheduling a Neural Network for ARM CPU
 =============================================
-**Author**: `Zhao Wu <https://github.com/FrozenGene>`_
+**Author**: `Thierry Moreau <https://github.com/tmoreau89>`_, `Lianmin Zheng <https://github.com/merrymercy>`_
 
 Auto-tuning for specific devices and workloads is critical for getting the
 best performance. This is a tutorial on how to tune a whole neural
-network for mali GPU with the auto-scheduler.
+network for ARM CPU with the auto-scheduler via RPC.
 
 To auto-tune a neural network, we partition the network into small subgraphs and 
 tune them independently. Each subgraph is treated as one search task.
@@ -45,7 +45,7 @@ __name__ == "__main__":` block.
     from tvm import relay, auto_scheduler
     import tvm.relay.testing
     from tvm.contrib import graph_runtime
-    import os
+    from tvm.contrib.utils import tempdir
 
 
 
@@ -135,39 +135,113 @@ You can use :ref:`ConvertLayout <convert-layout-usage>` pass to do the layout co
         return mod, params, input_shape, output_shape
 
 
-    # Define the neural network and compilation target.
-    network = "mobilenet"
-    batch_size = 1
-    layout = "NHWC"
-    # Set this to True if you use ndk tools for cross compiling
-    use_ndk = True
-    # Path to cross compiler
-    os.environ["TVM_NDK_CC"] = "/usr/bin/aarch64-linux-gnu-g++"
-    target_host = tvm.target.Target("llvm -mtriple=aarch64-linux-gnu")
-    target = tvm.target.Target("opencl -device=mali")
-    dtype = "float32"
-    log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name)
 
 
 
 
 
 
+Start RPC Tracker
+-----------------
+TVM uses RPC session to communicate with ARM boards.
+During tuning, the tuner will send the generated code to the board and
+measure the speed of code on the board.
+
+To scale up the tuning, TVM uses RPC Tracker to manage distributed devices.
+The RPC Tracker is a centralized controller node. We can register all devices to
+the tracker. For example, if we have 10 phones, we can register all of them
+to the tracker, and run 10 measurements in parallel, accelerating the tuning process.
+
+To start an RPC tracker, run this command on the host machine. The tracker is
+required during the whole tuning process, so we need to open a new terminal for
+this command:
+
+.. code-block:: bash
+
+  python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
+
+The expected output is
+
+.. code-block:: bash
+
+  INFO:RPCTracker:bind to 0.0.0.0:9190
+
+Register Devices to RPC Tracker
+-----------------------------------
+Now we can register our devices to the tracker. The first step is to
+build the TVM runtime for the ARM devices.
+
+* For Linux:
+  Follow this section :ref:`build-tvm-runtime-on-device` to build
+  the TVM runtime on the device. Then register the device to tracker by
+
+  .. code-block:: bash
+
+    python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=rasp4b-64
+
+  (replace :code:`[HOST_IP]` with the IP address of your host machine)
+
+* For Android:
+  Follow this `readme page <https://github.com/apache/tvm/tree/main/apps/android_rpc>`_ to
+  install the TVM RPC APK on the android device. Make sure you can pass the android rpc test.
+  Then you have already registered your device. During tuning, you have to go to developer option
+  and enable "Keep screen awake during charging" and charge your phone to make it stable.
+
+After registering devices, we can confirm it by querying rpc_tracker
+
+.. code-block:: bash
+
+  python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190
+
+For example, if we have 2 Huawei mate10 pro, 11 Raspberry Pi 4B with 64bit OS, and 2 rk3399,
+the output can be
+
+.. code-block:: bash
 
+   Queue Status
+   ----------------------------------
+   key          total  free  pending
+   ----------------------------------
+   mate10pro    2      2     0
+   rk3399       2      2     0
+   rasp4b-64    11     11    0
+   ----------------------------------
 
-Start an RPC Tracker and Register Devices to the Tracker
---------------------------------------------------------
-Please refer to the "Start RPC Tracker" and "Register Devices to RPC Tracker" setions
-in this :ref:`tutorial <tutorials-autotvm-start-rpc-tracker>` to start an RPC tracker
-and register devices to the tracker.
+You can register multiple devices to the tracker to accelerate the measurement in tuning.
+
+Set Tuning Options
+------------------
+Before tuning, we should apply some configurations. Here I use a Raspberry Pi 4b 4GB board
+with a 64bit OS (Ubuntu 20.04) as an example. In your setting, you should modify the target
+and device_key accordingly.
+Set :code:`use_ndk` to True if you use an Android phone.
 
 
 .. code-block:: default
 
 
-    # Replace this with the device key in your tracker
-    device_key = "rk3399"
+    #### DEVICE CONFIG ####
+
+    # Replace "aarch64-linux-gnu" with the correct target of your board.
+    # This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.
+    # FIXME(tmoreau89, merrymercy): We leave '-device=arm_cpu' out of the target string
+    #                               because we're sharing x86 op strategy.
+    target = tvm.target.Target("llvm -mtriple=aarch64-linux-gnu -mattr=+neon")
+
+    # Also replace this with the device key in your tracker
+    device_key = "rasp4b-64"
 
+    # Set this to True if you use ndk tools for cross compiling
+    # And also set the environment variable below to point to the cross compiler
+    use_ndk = False
+    # os.environ["TVM_NDK_CC"] = "/usr/bin/aarch64-linux-gnu-g++"
+
+    #### TUNING OPTION ####
+    network = "mobilenet"
+    batch_size = 1
+    layout = "NHWC"
+    dtype = "float32"
+    log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name)
 
 
 
@@ -192,7 +266,7 @@ The task scheduler will just optimize this objective.
     # Extract tasks from the network
     print("Extract tasks...")
     mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype)
-    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target, target_host)
+    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
 
     for idx, task in enumerate(tasks):
         print("========== Task %d  (workload key: %s) ==========" % (idx, task.workload_key))
@@ -201,6 +275,8 @@ The task scheduler will just optimize this objective.
 
 
 
+
+
 .. rst-class:: sphx-glr-script-out
 
  Out:
@@ -208,27 +284,26 @@ The task scheduler will just optimize this objective.
  .. code-block:: none
 
     Extract tasks...
-
    ...2%, 0.01 MB, 185 KB/s, 0 seconds passed
    ...5%, 0.02 MB, 370 KB/s, 0 seconds passed
    ...7%, 0.02 MB, 549 KB/s, 0 seconds passed
    ...10%, 0.03 MB, 730 KB/s, 0 seconds passed
    ...12%, 0.04 MB, 814 KB/s, 0 seconds passed
    ...15%, 0.05 MB, 974 KB/s, 0 seconds passed
    ...18%, 0.05 MB, 1111 KB/s, 0 seconds passed
    ...20%, 0.06 MB, 1267 KB/s, 0 seconds passed
    ...23%, 0.07 MB, 1423 KB/s, 0 seconds passed
    ...25%, 0.08 MB, 1560 KB/s, 0 seconds passed
    ...28%, 0.09 MB, 1712 KB/s, 0 seconds passed
    ...30%, 0.09 MB, 1843 KB/s, 0 seconds passed
    ...33%, 0.10 MB, 1992 KB/s, 0 seconds passed
    ...36%, 0.11 MB, 2116 KB/s, 0 seconds passed
    ...38%, 0.12 MB, 2155 KB/s, 0 seconds passed
    ...41%, 0.12 MB, 2257 KB/s, 0 seconds passed
    ...43%, 0.13 MB, 2394 KB/s, 0 seconds passed
    ...46%, 0.14 MB, 2530 KB/s, 0 seconds passed
    ...48%, 0.15 MB, 2659 KB/s, 0 seconds passed
    ...51%, 0.16 MB, 2793 KB/s, 0 seconds passed
    ...54%, 0.16 MB, 2928
  KB/s, 0 seconds passed
    ...56%, 0.17 MB, 3063 KB/s, 0 seconds passed
    ...59%, 0.18 MB, 3168 KB/s, 0 seconds passed
    ...61%, 0.19 MB, 3289 KB/s, 0 seconds passed
    ...64%, 0.20 MB, 3419 KB/s, 0 seconds passed
    ...66%, 0.20 MB, 3551 KB/s, 0 seconds passed
    ...69%, 0.21 MB, 3639 KB/s, 0 seconds passed
    ...72%, 0.22 MB, 3768 KB/s, 0 seconds passed
    ...74%, 0.23 MB, 3893 KB/s, 0 seconds passed
    ...77%, 0.23 MB, 4022 KB/s, 0 seconds passed
    ...79%, 0.24 MB, 3981 KB/s, 0 seconds passed
    ...82%, 0.25 MB, 4105 KB/s, 0 seconds passed
    ...84%, 0.26 MB, 4165 KB/s, 0 seconds passed
    ...87%, 0.27 MB, 4286 KB/s, 0 seconds passed
    ...90%, 0.27 MB, 4391 KB/s, 0 seconds passed
    ...92%, 0.28 MB, 4511 KB/s, 0 seconds passed
    ...95%, 0.29 MB, 4586 KB/s, 0 seconds passed
    ...97%, 0.30 MB, 4705 KB/s, 0 seconds passed
    ...100%, 0.30 MB, 4816 KB/s, 0 seconds passed
-    ========== Task 0  (workload key: ["b32ed43fb351136894c322ee49097a1a"]) ==========
+    ========== Task 0  (workload key: ["d7b65649a4dd54becea0a52aabbc5af5", 1, 1000, 1, 1000]) ==========
     placeholder = PLACEHOLDER [1, 1000]
     T_softmax_maxelem(i0) max= placeholder[i0, k]
     T_softmax_exp(i0, i1) = tir.exp((placeholder[i0, i1] - T_softmax_maxelem[i0]))
     T_softmax_expsum(i0) += T_softmax_exp[i0, k]
     T_softmax_norm(i0, i1) = (T_softmax_exp[i0, i1]/T_softmax_expsum[i0])
 
-    ========== Task 1  (workload key: ["35552028f3076f68df3063174e40b59f"]) ==========
+    ========== Task 1  (workload key: ["9847f8cc0b305137f49f2c5c0c8ab25d", 1, 1024, 1000, 1024, 1000, 1, 1000]) ==========
     placeholder = PLACEHOLDER [1, 1024]
     placeholder = PLACEHOLDER [1000, 1024]
     T_dense(i, j) += (placeholder[i, k]*placeholder[j, k])
     placeholder = PLACEHOLDER [1000]
     T_add(ax0, ax1) = (T_dense[ax0, ax1] + placeholder[ax1])
 
-    ========== Task 2  (workload key: ["cf95f3a14294b5393f63b280d0ec0ab6"]) ==========
+    ========== Task 2  (workload key: ["69115f188984ae34ede37c3b8ca40b43", 1, 7, 7, 1024, 1, 1, 1, 1024]) ==========
     placeholder = PLACEHOLDER [1, 7, 7, 1024]
     tensor(ax0, ax1, ax2, ax3) += placeholder[ax0, ((ax1*7) + rv0), ((ax2*7) + rv1), ax3]
     tensor(ax0, ax1, ax2, ax3) = (tensor[ax0, ax1, ax2, ax3]/(float32((select((bool)1, ((ax1 + 1)*7), (((ax1 + 1)*7) + 1)) - (ax1*7)))*float32((select((bool)1, ((ax2 + 1)*7), (((ax2 + 1)*7) + 1)) - (ax2*7)))))
 
-    ========== Task 3  (workload key: ["baa3a42d3cb6ab30685b0a7894b95da9"]) ==========
+    ========== Task 3  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 7, 7, 1024, 1, 1, 1024, 1024, 1, 1, 1, 1024, 1, 7, 7, 1024]) ==========
     placeholder = PLACEHOLDER [1, 7, 7, 1024]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 1024, 1024]
@@ -237,7 +312,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 4  (workload key: ["089861a00a7dfcc7196c2b6b5c807855"]) ==========
+    ========== Task 4  (workload key: ["06fce76bd84cb904eee50b905ca9449a", 1, 7, 7, 1024, 3, 3, 1024, 1, 1, 1, 1, 1024, 1, 7, 7, 1024]) ==========
     placeholder = PLACEHOLDER [1, 7, 7, 1024]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 8)) && (i2 >= 1)) && (i2 < 8)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 1024, 1]
@@ -246,7 +321,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (DepthwiseConv2d[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 5  (workload key: ["e7ff95f121397b87a0ca12ef428aef59"]) ==========
+    ========== Task 5  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 7, 7, 512, 1, 1, 512, 1024, 1, 1, 1, 1024, 1, 7, 7, 1024]) ==========
     placeholder = PLACEHOLDER [1, 7, 7, 512]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 512, 1024]
@@ -255,7 +330,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 6  (workload key: ["c3831fcb49bfdfc679be0bbfb987da82"]) ==========
+    ========== Task 6  (workload key: ["c87ba68bc180312f5716af09a77ca15b", 1, 14, 14, 512, 3, 3, 512, 1, 1, 1, 1, 512, 1, 7, 7, 512]) ==========
     placeholder = PLACEHOLDER [1, 14, 14, 512]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 15)) && (i2 >= 1)) && (i2 < 15)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 512, 1]
@@ -264,7 +339,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (DepthwiseConv2d[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 7  (workload key: ["33bb900cb60276282852b4b9c1346fe9"]) ==========
+    ========== Task 7  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 14, 14, 512, 1, 1, 512, 512, 1, 1, 1, 512, 1, 14, 14, 512]) ==========
     placeholder = PLACEHOLDER [1, 14, 14, 512]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 512, 512]
@@ -273,7 +348,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 8  (workload key: ["f2a48dd923600da67abb78b4895f8f7b"]) ==========
+    ========== Task 8  (workload key: ["06fce76bd84cb904eee50b905ca9449a", 1, 14, 14, 512, 3, 3, 512, 1, 1, 1, 1, 512, 1, 14, 14, 512]) ==========
     placeholder = PLACEHOLDER [1, 14, 14, 512]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 15)) && (i2 >= 1)) && (i2 < 15)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 512, 1]
@@ -282,7 +357,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (DepthwiseConv2d[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 9  (workload key: ["f6906ccbe2258e70648ea15f3c037ca0"]) ==========
+    ========== Task 9  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 14, 14, 256, 1, 1, 256, 512, 1, 1, 1, 512, 1, 14, 14, 512]) ==========
     placeholder = PLACEHOLDER [1, 14, 14, 256]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 256, 512]
@@ -291,7 +366,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 10  (workload key: ["381331b022e1b4ddc705aa66c2cb90c8"]) ==========
+    ========== Task 10  (workload key: ["c87ba68bc180312f5716af09a77ca15b", 1, 28, 28, 256, 3, 3, 256, 1, 1, 1, 1, 256, 1, 14, 14, 256]) ==========
     placeholder = PLACEHOLDER [1, 28, 28, 256]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 256, 1]
@@ -300,7 +375,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (DepthwiseConv2d[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 11  (workload key: ["413e7c2a210f0fbf2fadeb2686aba8ee"]) ==========
+    ========== Task 11  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 28, 28, 256, 1, 1, 256, 256, 1, 1, 1, 256, 1, 28, 28, 256]) ==========
     placeholder = PLACEHOLDER [1, 28, 28, 256]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 256, 256]
@@ -309,7 +384,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 12  (workload key: ["2b4a9c43c1bcbb5c68742378a4e72f74"]) ==========
+    ========== Task 12  (workload key: ["06fce76bd84cb904eee50b905ca9449a", 1, 28, 28, 256, 3, 3, 256, 1, 1, 1, 1, 256, 1, 28, 28, 256]) ==========
     placeholder = PLACEHOLDER [1, 28, 28, 256]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 256, 1]
@@ -318,7 +393,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (DepthwiseConv2d[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 13  (workload key: ["017340b550a0bda8bd8ec1933bc32756"]) ==========
+    ========== Task 13  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 28, 28, 128, 1, 1, 128, 256, 1, 1, 1, 256, 1, 28, 28, 256]) ==========
     placeholder = PLACEHOLDER [1, 28, 28, 128]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 128, 256]
@@ -327,7 +402,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 14  (workload key: ["539b0d6ae7b6e1610e29ae571b8b8c25"]) ==========
+    ========== Task 14  (workload key: ["c87ba68bc180312f5716af09a77ca15b", 1, 56, 56, 128, 3, 3, 128, 1, 1, 1, 1, 128, 1, 28, 28, 128]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 128]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 57)) && (i2 >= 1)) && (i2 < 57)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 128, 1]
@@ -336,7 +411,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (DepthwiseConv2d[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 15  (workload key: ["80b2e789f7bce126bde2176640ca76a4"]) ==========
+    ========== Task 15  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 56, 56, 128, 1, 1, 128, 128, 1, 1, 1, 128, 1, 56, 56, 128]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 128]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 128, 128]
@@ -345,7 +420,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 16  (workload key: ["3dba2989a90e19861af284d74e40f5cd"]) ==========
+    ========== Task 16  (workload key: ["06fce76bd84cb904eee50b905ca9449a", 1, 56, 56, 128, 3, 3, 128, 1, 1, 1, 1, 128, 1, 56, 56, 128]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 128]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 57)) && (i2 >= 1)) && (i2 < 57)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 128, 1]
@@ -354,7 +429,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (DepthwiseConv2d[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 17  (workload key: ["3b9a17584b6afa25229ef34c6f417660"]) ==========
+    ========== Task 17  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 56, 56, 64, 1, 1, 64, 128, 1, 1, 1, 128, 1, 56, 56, 128]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 64]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 64, 128]
@@ -363,7 +438,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 18  (workload key: ["a4553ca6a00b6c8adb555bcde25d95c4"]) ==========
+    ========== Task 18  (workload key: ["c87ba68bc180312f5716af09a77ca15b", 1, 112, 112, 64, 3, 3, 64, 1, 1, 1, 1, 64, 1, 56, 56, 64]) ==========
     placeholder = PLACEHOLDER [1, 112, 112, 64]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 113)) && (i2 >= 1)) && (i2 < 113)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 64, 1]
@@ -372,7 +447,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (DepthwiseConv2d[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 19  (workload key: ["63672689bf8f678a0abe0854828cbd3b"]) ==========
+    ========== Task 19  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 112, 112, 32, 1, 1, 32, 64, 1, 1, 1, 64, 1, 112, 112, 64]) ==========
     placeholder = PLACEHOLDER [1, 112, 112, 32]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 32, 64]
@@ -381,7 +456,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 20  (workload key: ["1ceacb63c63eaa3da881bff2858acdbf"]) ==========
+    ========== Task 20  (workload key: ["06fce76bd84cb904eee50b905ca9449a", 1, 112, 112, 32, 3, 3, 32, 1, 1, 1, 1, 32, 1, 112, 112, 32]) ==========
     placeholder = PLACEHOLDER [1, 112, 112, 32]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 113)) && (i2 >= 1)) && (i2 < 113)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 32, 1]
@@ -390,7 +465,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (DepthwiseConv2d[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 21  (workload key: ["2c2147047fd6dafd3d66d75165843f67"]) ==========
+    ========== Task 21  (workload key: ["98418eda02701ddd175ad50e364a0638", 1, 224, 224, 3, 3, 3, 3, 32, 1, 112, 1, 1, 1, 112, 1, 1, 1, 112, 112, 32]) ==========
     placeholder = PLACEHOLDER [1, 224, 224, 3]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 225)) && (i2 >= 1)) && (i2 < 225)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 3, 32]
@@ -404,34 +479,9 @@ The task scheduler will just optimize this objective.
 
 
 
-.. note:: How to get the hardware parameters from remote device
-
-  .. code-block:: python
-
-    from tvm.auto_scheduler.utils import request_remote
-    remote = request_remote(device_key, "0.0.0.0", 9190)
-    ctx = remote.cl()
-    max_shared_memory_per_block = ctx.max_shared_memory_per_block
-    # There is no explicit local memory limition
-    # so we can use INT32_MAX to disalbe the check on local_memory.
-    max_local_memory_per_block = 2147483647 # INT32_MAX
-    max_threads_per_block = ctx.max_threads_per_block
-    max_vthread_extent = int(ctx.warp_size / 4) if int(ctx.warp_size / 4) > 1 else ctx.warp_size
-    warp_size = ctx.warp_size
-    hardware_params = auto_scheduler.HardwareParams(-1, 16, 64,
-                                                    max_shared_memory_per_block, max_local_memory_per_block,
-                                                    max_threads_per_block, max_vthread_extent, warp_size)
-
- Now you could pass it to search task and tune
-
-  .. code-block:: python
-
-    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target, target_host, hardware_params)
-
-
-Tuning and Evaluate
--------------------
-Now, we set some options for tuning, launch the search tasks and evaluate the end-to-end performance
+Tuning and Evaluation
+---------------------
+Now, we set some options for tuning and launch the search tasks.
 
 * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning.
   You can set it to a small number (e.g., 200) for a fast demonstrative run.
@@ -445,6 +495,9 @@ Now, we set some options for tuning, launch the search tasks and evaluate the en
 * see :any:`auto_scheduler.TuningOptions`,
   :any:`auto_scheduler.LocalRunner` for more parameters.
 
+After auto-tuning, we can compile the network with the best schedules we found.
+All measurement records are dumped into the log file during auto-tuning,
+so we can read the log file and load the best schedules.
 
 
 .. code-block:: default
@@ -456,40 +509,49 @@ Now, we set some options for tuning, launch the search tasks and evaluate the en
         tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
         tune_option = auto_scheduler.TuningOptions(
             num_measure_trials=200,  # change this to 20000 to achieve the best performance
-            builder=auto_scheduler.LocalBuilder(build_func="ndk" if use_ndk else "default"),
             runner=auto_scheduler.RPCRunner(
-                device_key, host="0.0.0.0", port=9190, repeat=3, timeout=50
+                device_key,
+                host="0.0.0.0",
+                port=9191,
+                timeout=30,
+                repeat=1,
+                min_repeat_ms=200,
+                enable_cpu_cache_flush=True,
             ),
             measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
         )
 
         tuner.tune(tune_option)
 
-        # Compile the whole network
+        # Compile with the history best
         print("Compile...")
         with auto_scheduler.ApplyHistoryBest(log_file):
             with tvm.transform.PassContext(
                 opt_level=3, config={"relay.backend.use_auto_scheduler": True}
             ):
-                lib = relay.build(mod, target=target, target_host=target_host, params=params)
+                lib = relay.build(mod, target=target, params=params)
+
+        # Export library
+        tmp = tempdir()
+        if use_ndk:
+            from tvm.contrib import ndk
+
+            filename = "net.so"
+            lib.export_library(tmp.relpath(filename), ndk.create_shared)
+        else:
+            filename = "net.tar"
+            lib.export_library(tmp.relpath(filename))
+
+        # Upload module to device
+        print("Upload...")
+        remote = auto_scheduler.utils.request_remote(device_key, "0.0.0.0", 9191, timeout=10000)
+        remote.upload(tmp.relpath(filename))
+        rlib = remote.load_module(filename)
 
         # Create graph runtime
-        print("=============== Request Remote ===============")
-        from tvm.auto_scheduler.utils import request_remote
-
-        remote = request_remote(device_key, "0.0.0.0", 9190)
-        ctx = remote.cl()
-        from tvm.contrib import utils, ndk
-
-        temp = utils.tempdir()
-        filename = "deploy_lib.so"
-        path_lib = temp.relpath(filename)
-        lib.export_library(path_lib, ndk.create_shared)
-        remote.upload(path_lib)
-        loaded_lib = remote.load_module(filename)
-        module = graph_runtime.GraphModule(loaded_lib["default"](ctx))
-        data = (np.random.uniform(size=input_shape)).astype(dtype)
-        data_tvm = tvm.nd.array(data)
+        ctx = remote.cpu()
+        module = graph_runtime.GraphModule(rlib["default"](ctx))
+        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
         module.set_input("data", data_tvm)
 
         # Evaluate
@@ -501,7 +563,8 @@ Now, we set some options for tuning, launch the search tasks and evaluate the en
         )
 
 
-    # We do not run the tuning in our webpage server since server doesn't have mali gpu.
+    # We do not run the tuning in our webpage server since the server doesn't have a Raspberry Pi
+    # or a device tracker running.
     # Uncomment the following line to run it by yourself.
 
     # tune_and_evaluate()
@@ -512,7 +575,8 @@ Now, we set some options for tuning, launch the search tasks and evaluate the en
 
 
 
-.. note:: Explain the printed information during tuning
+
+.. note:: Explaining the printed information during tuning
 
   During the tuning, a lot of information will be printed on the console.
   They are used for debugging purposes. The most important info is the output
@@ -520,42 +584,35 @@ Now, we set some options for tuning, launch the search tasks and evaluate the en
 
   .. code-block:: c
 
-    ----------------------------------------------------------------------
-    ------------------------------  [ Task Scheduler ]
-    ----------------------------------------------------------------------
-    |  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-    -------------------------------------------------
-    |    0 |        0.010 |           0.40 |     64 |
-    |    1 |        0.087 |          47.19 |     64 |
-    |    2 |        0.008 |          -0.00 |     64 |
-    |    3 |        0.177 |         582.07 |     64 |
-    |    4 |        0.268 |         862.37 |    256 |
-    |    5 |        0.166 |         621.13 |    128 |
-    |    6 |        0.170 |         605.10 |    128 |
-    |    7 |        0.128 |         403.20 |     64 |
-    |    8 |        0.189 |         545.71 |     64 |
-    |    9 |        0.231 |        1001.01 |    448 |
-    |   10 |        0.155 |         664.80 |    256 |
-    |   11 |        0.155 |         662.86 |    256 |
-    |   12 |        0.119 |         434.08 |     64 |
-    |   13 |        0.199 |         522.13 |     64 |
-    |   14 |        0.235 |         986.56 |    320 |
-    |   15 |        0.149 |         689.13 |    128 |
-    |   16 |        0.155 |         664.80 |    192 |
-    |   17 |        0.151 |         340.64 |     64 |
-    |   18 |        0.176 |         597.55 |    128 |
-    |   19 |        0.220 |        1054.37 |    192 |
-    |   20 |        0.150 |         686.01 |    128 |
-    |   21 |        0.159 |         650.88 |    128 |
-    |   22 |        0.073 |         358.19 |     64 |
-    |   23 |        0.031 |          70.63 |     64 |
-    |   24 |        0.251 |         947.73 |    128 |
-    |   25 |        0.157 |         652.47 |    128 |
-    |   26 |        0.215 |         954.84 |    128 |
-    |   27 |        0.237 |         868.92 |    128 |
-    |   28 |        0.266 |         774.06 |    128 |
-    -------------------------------------------------
-    Estimated total latency: 10.016 ms      Trials: 3992    Used time : 1131 s      Next ID: 15
+   ----------------------------------------------------------------------
+   ------------------------------  [ Task Scheduler ]
+   ----------------------------------------------------------------------
+   |  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
+   -------------------------------------------------
+   |    0 |        0.013 |           0.31 |     64 |
+   |    1 |        0.845 |           2.43 |    448 |
+   |    2 |        0.046 |          -0.00 |     64 |
+   |    3 |        4.194 |          24.53 |   2112 |
+   |    4 |        0.109 |           9.21 |     64 |
+   |    5 |        1.759 |          29.27 |    896 |
+   |    6 |        0.083 |           6.01 |     64 |
+   |    7 |        3.084 |          33.38 |   7680 |
+   |    8 |        0.136 |          14.78 |    384 |
+   |    9 |        1.349 |          38.23 |    768 |
+   |   10 |        0.133 |           7.55 |    128 |
+   |   11 |        2.747 |          37.56 |   1536 |
+   |   12 |        0.338 |          11.87 |    192 |
+   |   13 |        1.295 |          40.00 |    704 |
+   |   14 |        0.482 |           4.16 |    256 |
+   |   15 |        2.686 |          38.56 |   1344 |
+   |   16 |        0.884 |           9.08 |    448 |
+   |   17 |        1.332 |          39.18 |    704 |
+   |   18 |        1.045 |           3.84 |    576 |
+   |   19 |        1.391 |          38.09 |    704 |
+   |   20 |        0.777 |          10.34 |    448 |
+   |   21 |        0.739 |          30.97 |    448 |
+   -------------------------------------------------
+    Estimated total latency: 38.347 ms      Trials: 19992   Used time : 19260 s     Next ID: 3
 
   This table lists the latency and (estimated) speed of all tasks.
   It also lists the allocation of measurement trials for all tasks.
@@ -583,20 +640,20 @@ Other Tips
 1. During the tuning, the auto-scheduler needs to compile many programs and
    extract features from them. This part is CPU-intensive,
    so a high-performance CPU with many cores is recommended for faster search.
-2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json`
+2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`
    to distill the large log file and save only the most useful records.
 3. You can resume a search from the previous log file. You just need to
    add a new argument :code:`load_log_file` when creating the task scheduler
    in function :code:`run_tuning`. Say,
    :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)`
-4. If you have multiple target GPUs, you can use all of them for measurements to
-   parallelize the measurements. Check this :ref:`section <tutorials-autotvm-scale-up-rpc-tracker>`
+4. If you have multiple target CPUs, you can use all of them to
+   parallelize the measurements. Check this :ref:`section <tutorials-autotvm-rpc-tracker>`
    to learn how to use the RPC Tracker and RPC Server.
    To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions`
    with :any:`auto_scheduler.RPCRunner`.
 
 
-.. _sphx_glr_download_tutorials_auto_scheduler_tune_network_mali.py:
+.. _sphx_glr_download_tutorials_auto_scheduler_tune_network_arm.py:
 
 
 .. only :: html
@@ -608,13 +665,13 @@ Other Tips
 
   .. container:: sphx-glr-download
 
-     :download:`Download Python source code: tune_network_mali.py <tune_network_mali.py>`
+     :download:`Download Python source code: tune_network_arm.py <tune_network_arm.py>`
 
 
 
   .. container:: sphx-glr-download
 
-     :download:`Download Jupyter notebook: tune_network_mali.ipynb <tune_network_mali.ipynb>`
+     :download:`Download Jupyter notebook: tune_network_arm.ipynb <tune_network_arm.ipynb>`
 
 
 .. only:: html
diff --git a/docs/_sources/tutorials/auto_scheduler/tune_network_cuda.rst.txt b/docs/_sources/tutorials/auto_scheduler/tune_network_cuda.rst.txt
index 9b78d96..c2c8555 100644
--- a/docs/_sources/tutorials/auto_scheduler/tune_network_cuda.rst.txt
+++ b/docs/_sources/tutorials/auto_scheduler/tune_network_cuda.rst.txt
@@ -182,26 +182,26 @@ The task scheduler will just optimize this objective.
  .. code-block:: none
 
     Extract tasks...
-    ========== Task 0  (workload key: ["b32ed43fb351136894c322ee49097a1a"]) ==========
+    ========== Task 0  (workload key: ["d7b65649a4dd54becea0a52aabbc5af5", 1, 1000, 1, 1000]) ==========
     placeholder = PLACEHOLDER [1, 1000]
     T_softmax_maxelem(i0) max= placeholder[i0, k]
     T_softmax_exp(i0, i1) = tir.exp((placeholder[i0, i1] - T_softmax_maxelem[i0]))
     T_softmax_expsum(i0) += T_softmax_exp[i0, k]
     T_softmax_norm(i0, i1) = (T_softmax_exp[i0, i1]/T_softmax_expsum[i0])
 
-    ========== Task 1  (workload key: ["d09dc1a6bb90d59c91b68989ad3492ff"]) ==========
+    ========== Task 1  (workload key: ["9847f8cc0b305137f49f2c5c0c8ab25d", 1, 512, 1000, 512, 1000, 1, 1000]) ==========
     placeholder = PLACEHOLDER [1, 512]
     placeholder = PLACEHOLDER [1000, 512]
     T_dense(i, j) += (placeholder[i, k]*placeholder[j, k])
     placeholder = PLACEHOLDER [1000]
     T_add(ax0, ax1) = (T_dense[ax0, ax1] + placeholder[ax1])
 
-    ========== Task 2  (workload key: ["7de313da0ca29a8c63f647791692430d"]) ==========
+    ========== Task 2  (workload key: ["69115f188984ae34ede37c3b8ca40b43", 1, 7, 7, 512, 1, 1, 1, 512]) ==========
     placeholder = PLACEHOLDER [1, 7, 7, 512]
     tensor(ax0, ax1, ax2, ax3) += placeholder[ax0, ((ax1*7) + rv0), ((ax2*7) + rv1), ax3]
     tensor(ax0, ax1, ax2, ax3) = (tensor[ax0, ax1, ax2, ax3]/(float32((select((bool)1, ((ax1 + 1)*7), (((ax1 + 1)*7) + 1)) - (ax1*7)))*float32((select((bool)1, ((ax2 + 1)*7), (((ax2 + 1)*7) + 1)) - (ax2*7)))))
 
-    ========== Task 3  (workload key: ["8d5a93959138dc7b2ee1f1b3219dfa14"]) ==========
+    ========== Task 3  (workload key: ["ad6cecbf5d85cb1cda3c2bb7af170211", 1, 7, 7, 512, 4, 4, 512, 512, 1, 7, 7, 512, 1, 1, 1, 512, 1, 1, 1, 512, 1, 7, 7, 512]) ==========
     placeholder = PLACEHOLDER [1, 7, 7, 512]
     data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 8)) && (i2 >= 1)) && (i2 < 8)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 16), ((floormod(floordiv(p, 4), 4)*2) + eps), ((floormod(p, 4)*2) + nu), ci]
@@ -220,7 +220,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (T_multiply[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 4  (workload key: ["ac6920940de3797cc3f9f9c260675e5d"]) ==========
+    ========== Task 4  (workload key: ["3a69f9fbc63760d99e36b4c17b3bfc57", 1, 7, 7, 512, 4, 4, 512, 512, 1, 1, 1, 512, 1, 7, 7, 512]) ==========
     placeholder = PLACEHOLDER [1, 7, 7, 512]
     data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 8)) && (i2 >= 1)) && (i2 < 8)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 16), ((floormod(floordiv(p, 4), 4)*2) + eps), ((floormod(p, 4)*2) + nu), ci]
@@ -235,7 +235,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 5  (workload key: ["7e83a2ee5cd5d50282ed19310700046a"]) ==========
+    ========== Task 5  (workload key: ["d730bcd28f0920f6b97245e2a11bd8d6", 1, 7, 7, 512, 4, 4, 512, 512, 1, 7, 7, 512, 1, 7, 7, 512]) ==========
     placeholder = PLACEHOLDER [1, 7, 7, 512]
     data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 8)) && (i2 >= 1)) && (i2 < 8)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 16), ((floormod(floordiv(p, 4), 4)*2) + eps), ((floormod(p, 4)*2) + nu), ci]
@@ -249,7 +249,7 @@ The task scheduler will just optimize this objective.
     placeholder = PLACEHOLDER [1, 7, 7, 512]
     T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])
 
-    ========== Task 6  (workload key: ["1f6cd3637ec856bf5cf5010a623eed05"]) ==========
+    ========== Task 6  (workload key: ["12b88bedece6984af589a28b43e0f3c4", 1, 14, 14, 256, 3, 3, 256, 512, 1, 1, 1, 512, 1, 7, 7, 512]) ==========
     placeholder = PLACEHOLDER [1, 14, 14, 256]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 15)) && (i2 >= 1)) && (i2 < 15)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 256, 512]
@@ -258,7 +258,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 7  (workload key: ["424ba83160af31badc0b098136e1a3b0"]) ==========
+    ========== Task 7  (workload key: ["f3b6c10fcc6ce01ff01add933e4d21e9", 1, 14, 14, 256, 4, 4, 256, 256, 1, 14, 14, 256, 1, 1, 1, 256, 1, 14, 14, 256]) ==========
     placeholder = PLACEHOLDER [1, 14, 14, 256]
     data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 15)) && (i2 >= 1)) && (i2 < 15)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 49), ((floormod(floordiv(p, 7), 7)*2) + eps), ((floormod(p, 7)*2) + nu), ci]
@@ -275,7 +275,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (T_add[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 8  (workload key: ["a169cd0053d3a7ca82998fcb62e42c58"]) ==========
+    ========== Task 8  (workload key: ["b8b52b9be9df6102466a22a014c44c1f", 1, 14, 14, 256, 4, 4, 256, 256, 1, 1, 1, 256, 1, 14, 14, 256]) ==========
     placeholder = PLACEHOLDER [1, 14, 14, 256]
     data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 15)) && (i2 >= 1)) && (i2 < 15)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 49), ((floormod(floordiv(p, 7), 7)*2) + eps), ((floormod(p, 7)*2) + nu), ci]
@@ -290,7 +290,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 9  (workload key: ["0141ffc4fbabc10cc5a94c954419055b"]) ==========
+    ========== Task 9  (workload key: ["d374e472bd9d8164892b9e28a0a8cb59", 1, 14, 14, 256, 4, 4, 256, 256, 1, 14, 14, 256, 1, 14, 14, 256]) ==========
     placeholder = PLACEHOLDER [1, 14, 14, 256]
     data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 15)) && (i2 >= 1)) && (i2 < 15)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 49), ((floormod(floordiv(p, 7), 7)*2) + eps), ((floormod(p, 7)*2) + nu), ci]
@@ -304,7 +304,7 @@ The task scheduler will just optimize this objective.
     placeholder = PLACEHOLDER [1, 14, 14, 256]
     T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])
 
-    ========== Task 10  (workload key: ["81aae4b8e2c076a4014d403e8a2c70a1"]) ==========
+    ========== Task 10  (workload key: ["12b88bedece6984af589a28b43e0f3c4", 1, 28, 28, 128, 3, 3, 128, 256, 1, 1, 1, 256, 1, 14, 14, 256]) ==========
     placeholder = PLACEHOLDER [1, 28, 28, 128]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 128, 256]
@@ -313,7 +313,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 11  (workload key: ["c7a6b56bdc04b94c829fb2ef9874019e"]) ==========
+    ========== Task 11  (workload key: ["c4500b4e2fd04e695c32d2f31bbdc14a", 1, 28, 28, 128, 4, 4, 128, 128, 1, 28, 28, 128, 1, 1, 1, 128, 1, 28, 28, 128]) ==========
     placeholder = PLACEHOLDER [1, 28, 28, 128]
     data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 196), ((floormod(floordiv(p, 14), 14)*2) + eps), ((floormod(p, 14)*2) + nu), ci]
@@ -330,7 +330,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (T_add[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 12  (workload key: ["c035cc8b0568a8e054d06bd7f4950550"]) ==========
+    ========== Task 12  (workload key: ["e4cdf917b876dbdd64488c3818d9c141", 1, 28, 28, 128, 4, 4, 128, 128, 1, 1, 1, 128, 1, 28, 28, 128]) ==========
     placeholder = PLACEHOLDER [1, 28, 28, 128]
     data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 196), ((floormod(floordiv(p, 14), 14)*2) + eps), ((floormod(p, 14)*2) + nu), ci]
@@ -345,7 +345,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 13  (workload key: ["c5ee3e05edd9754492d0763aa41fd025"]) ==========
+    ========== Task 13  (workload key: ["dac19035dd5fe9424ee8617421b9c817", 1, 28, 28, 128, 4, 4, 128, 128, 1, 28, 28, 128, 1, 28, 28, 128]) ==========
     placeholder = PLACEHOLDER [1, 28, 28, 128]
     data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 196), ((floormod(floordiv(p, 14), 14)*2) + eps), ((floormod(p, 14)*2) + nu), ci]
@@ -359,7 +359,7 @@ The task scheduler will just optimize this objective.
     placeholder = PLACEHOLDER [1, 28, 28, 128]
     T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])
 
-    ========== Task 14  (workload key: ["022ebb6b7c55c5ed030421380ec83a04"]) ==========
+    ========== Task 14  (workload key: ["12b88bedece6984af589a28b43e0f3c4", 1, 56, 56, 64, 3, 3, 64, 128, 1, 1, 1, 128, 1, 28, 28, 128]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 64]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 57)) && (i2 >= 1)) && (i2 < 57)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 64, 128]
@@ -368,7 +368,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 15  (workload key: ["de0df0893e01892cfe69f7bc2c24111f"]) ==========
+    ========== Task 15  (workload key: ["1e3c4211ffd2f2db91078ae4d04b779d", 1, 56, 56, 64, 6, 6, 64, 64, 1, 56, 56, 64, 1, 1, 1, 64, 1, 56, 56, 64]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 64]
     data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 57)) && (i2 >= 1)) && (i2 < 57)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 196), ((floormod(floordiv(p, 14), 14)*4) + eps), ((floormod(p, 14)*4) + nu), ci]
@@ -385,7 +385,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (T_add[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 16  (workload key: ["f2e3c09a00e7d0a9897f70497e089f1e"]) ==========
+    ========== Task 16  (workload key: ["b818b53148cd450f86569dfc3e04cb8a", 1, 56, 56, 64, 6, 6, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 64]
     data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 57)) && (i2 >= 1)) && (i2 < 57)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 196), ((floormod(floordiv(p, 14), 14)*4) + eps), ((floormod(p, 14)*4) + nu), ci]
@@ -400,7 +400,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 17  (workload key: ["fa26946d7ac51126bfa859cb183f9ca1"]) ==========
+    ========== Task 17  (workload key: ["3ea73fb9b0364374730d09e068821f95", 1, 56, 56, 64, 6, 6, 64, 64, 1, 56, 56, 64, 1, 56, 56, 64]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 64]
     data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 57)) && (i2 >= 1)) && (i2 < 57)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 196), ((floormod(floordiv(p, 14), 14)*4) + eps), ((floormod(p, 14)*4) + nu), ci]
@@ -414,7 +414,7 @@ The task scheduler will just optimize this objective.
     placeholder = PLACEHOLDER [1, 56, 56, 64]
     T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])
 
-    ========== Task 18  (workload key: ["ba2026d923536b75e9b4faed89287d5f"]) ==========
+    ========== Task 18  (workload key: ["a5612fdeb9db4d579a75ec225ea4c06a", 1, 112, 112, 64, 1, 1, 1, 64, 1, 56, 56, 64]) ==========
     placeholder = PLACEHOLDER [1, 112, 112, 64]
     pad_temp(ax0, ax1, ax2, ax3) = tir.if_then_else(((((ax1 >= 1) && (ax1 < 113)) && (ax2 >= 1)) && (ax2 < 113)), placeholder[ax0, (ax1 - 1), (ax2 - 1), ax3], -3.40282e+38f)
     tensor(ax0, ax1, ax2, ax3) max= pad_temp[ax0, ((ax1*2) + dh), ((ax2*2) + dw), ax3]
@@ -422,7 +422,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (tensor[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 19  (workload key: ["a0eb8d6048282a4a0986cc2ccf14eaa2"]) ==========
+    ========== Task 19  (workload key: ["12b88bedece6984af589a28b43e0f3c4", 1, 224, 224, 3, 7, 7, 3, 64, 1, 1, 1, 64, 1, 112, 112, 64]) ==========
     placeholder = PLACEHOLDER [1, 224, 224, 3]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 3) && (i1 < 227)) && (i2 >= 3)) && (i2 < 227)), placeholder[i0, (i1 - 3), (i2 - 3), i3], 0f)
     placeholder = PLACEHOLDER [7, 7, 3, 64]
@@ -431,25 +431,25 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 20  (workload key: ["bf78a7bf0209980f72953637dfd14a6f"]) ==========
+    ========== Task 20  (workload key: ["7006235cfc29b73be524cf390ed5a977", 1, 56, 56, 64, 1, 1, 64, 64, 1, 56, 56, 64]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 64]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 64, 64]
     Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, (yy + ry), (xx + rx), rc]*placeholder[ry, rx, rc, ff])
 
-    ========== Task 21  (workload key: ["6630936c26852f2b89dbfa2ff37fbb9c"]) ==========
+    ========== Task 21  (workload key: ["f4380bb1dc62422a69ad4a1a9771f927", 1, 56, 56, 64, 1, 1, 64, 128, 1, 28, 28, 128]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 64]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 64, 128]
     Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])
 
-    ========== Task 22  (workload key: ["ba5f918733ccbbd4a1d7fd3724665a2f"]) ==========
+    ========== Task 22  (workload key: ["f4380bb1dc62422a69ad4a1a9771f927", 1, 28, 28, 128, 1, 1, 128, 256, 1, 14, 14, 256]) ==========
     placeholder = PLACEHOLDER [1, 28, 28, 128]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 128, 256]
     Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])
 
-    ========== Task 23  (workload key: ["21ad409d72953de188314010134e3acd"]) ==========
+    ========== Task 23  (workload key: ["f4380bb1dc62422a69ad4a1a9771f927", 1, 14, 14, 256, 1, 1, 256, 512, 1, 7, 7, 512]) ==========
     placeholder = PLACEHOLDER [1, 14, 14, 256]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 256, 512]
@@ -614,7 +614,7 @@ so we can read the log file and load the best schedules.
 
     Compile...
     Evaluate inference time cost...
-    Mean inference time (std dev): 3.28 ms (0.01 ms)
+    Mean inference time (std dev): 3.22 ms (0.02 ms)
 
 
 
@@ -623,7 +623,7 @@ Other Tips
 1. During the tuning, the auto-scheduler needs to compile many programs and
    extract feature from them. This part is CPU-intensive,
    so a high-performance CPU with many cores is recommended for faster search.
-2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json`
+2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`
    to distill the large log file and only save the best useful records.
 3. You can resume a search from the previous log file. You just need to
    add a new argument :code:`load_log_file` when creating the task scheduler
diff --git a/docs/_sources/tutorials/auto_scheduler/tune_network_mali.rst.txt b/docs/_sources/tutorials/auto_scheduler/tune_network_mali.rst.txt
index 0ae1107..59e55e8 100644
--- a/docs/_sources/tutorials/auto_scheduler/tune_network_mali.rst.txt
+++ b/docs/_sources/tutorials/auto_scheduler/tune_network_mali.rst.txt
@@ -208,27 +208,27 @@ The task scheduler will just optimize this objective.
  .. code-block:: none
 
     Extract tasks...
-
    ...2%, 0.01 MB, 185 KB/s, 0 seconds passed
    ...5%, 0.02 MB, 370 KB/s, 0 seconds passed
    ...7%, 0.02 MB, 549 KB/s, 0 seconds passed
    ...10%, 0.03 MB, 730 KB/s, 0 seconds passed
    ...12%, 0.04 MB, 814 KB/s, 0 seconds passed
    ...15%, 0.05 MB, 974 KB/s, 0 seconds passed
    ...18%, 0.05 MB, 1111 KB/s, 0 seconds passed
    ...20%, 0.06 MB, 1267 KB/s, 0 seconds passed
    ...23%, 0.07 MB, 1423 KB/s, 0 seconds passed
    ...25%, 0.08 MB, 1560 KB/s, 0 seconds passed
    ...28%, 0.09 MB, 1712 KB/s, 0 seconds passed
    ...30%, 0.09 MB, 1843 KB/s, 0 seconds passed
    ...33%, 0.10 MB, 1992 KB/s, 0 seconds passed
    ...36%, 0.11 MB, 2116 KB/s, 0 seconds passed
    ...38%, 0.12 MB, 2155 KB/s, 0 seconds passed
    ...41%, 0.12 MB, 2257 KB/s, 0 seconds passed
    ...43%, 0.13 MB, 2394 KB/s, 0 seconds passed
    ...46%, 0.14 MB, 2530 KB/s, 0 seconds passed
    ...48%, 0.15 MB, 2659 KB/s, 0 seconds passed
    ...51%, 0.16 MB, 2793 KB/s, 0 seconds passed
    ...54%, 0.16 MB, 2928
  KB/s, 0 seconds passed
    ...56%, 0.17 MB, 3063 KB/s, 0 seconds passed
    ...59%, 0.18 MB, 3168 KB/s, 0 seconds passed
    ...61%, 0.19 MB, 3289 KB/s, 0 seconds passed
    ...64%, 0.20 MB, 3419 KB/s, 0 seconds passed
    ...66%, 0.20 MB, 3551 KB/s, 0 seconds passed
    ...69%, 0.21 MB, 3639 KB/s, 0 seconds passed
    ...72%, 0.22 MB, 3768 KB/s, 0 seconds passed
    ...74%, 0.23 MB, 3893 KB/s, 0 seconds passed
    ...77%, 0.23 MB, 4022 KB/s, 0 seconds passed
    ...79%, 0.24 MB, 3981 KB/s, 0 seconds passed
    ...82%, 0.25 MB, 4105 KB/s, 0 seconds passed
    ...84%, 0.26 MB, 4165 KB/s, 0 seconds passed
    ...87%, 0.27 MB, 4286 KB/s, 0 seconds passed
    ...90%, 0.27 MB, 4391 KB/s, 0 seconds passed
    ...92%, 0.28 MB, 4511 KB/s, 0 seconds passed
    ...95%, 0.29 MB, 4586 KB/s, 0 seconds passed
    ...97%, 0.30 MB, 4705 KB/s, 0 seconds passed
    ...100%, 0.30 MB, 4816 KB/s, 0 seconds passed
-    ========== Task 0  (workload key: ["b32ed43fb351136894c322ee49097a1a"]) ==========
+
    ...2%, 0.01 MB, 79 KB/s, 0 seconds passed
    ...5%, 0.02 MB, 158 KB/s, 0 seconds passed
    ...7%, 0.02 MB, 235 KB/s, 0 seconds passed
    ...10%, 0.03 MB, 313 KB/s, 0 seconds passed
    ...12%, 0.04 MB, 388 KB/s, 0 seconds passed
    ...15%, 0.05 MB, 451 KB/s, 0 seconds passed
    ...18%, 0.05 MB, 521 KB/s, 0 seconds passed
    ...20%, 0.06 MB, 595 KB/s, 0 seconds passed
    ...23%, 0.07 MB, 659 KB/s, 0 seconds passed
    ...25%, 0.08 MB, 731 KB/s, 0 seconds passed
    ...28%, 0.09 MB, 804 KB/s, 0 seconds passed
    ...30%, 0.09 MB, 875 KB/s, 0 seconds passed
    ...33%, 0.10 MB, 916 KB/s, 0 seconds passed
    ...36%, 0.11 MB, 986 KB/s, 0 seconds passed
    ...38%, 0.12 MB, 1055 KB/s, 0 seconds passed
    ...41%, 0.12 MB, 1125 KB/s, 0 seconds passed
    ...43%, 0.13 MB, 1193 KB/s, 0 seconds passed
    ...46%, 0.14 MB, 1261 KB/s, 0 seconds passed
    ...48%, 0.15 MB, 1330 KB/s, 0 seconds passed
    ...51%, 0.16 MB, 1399 KB/s, 0 seconds passed
    ...54%, 0.16 MB, 1456 KB/s, 0 
 seconds passed
    ...56%, 0.17 MB, 1523 KB/s, 0 seconds passed
    ...59%, 0.18 MB, 1590 KB/s, 0 seconds passed
    ...61%, 0.19 MB, 1658 KB/s, 0 seconds passed
    ...64%, 0.20 MB, 1725 KB/s, 0 seconds passed
    ...66%, 0.20 MB, 1792 KB/s, 0 seconds passed
    ...69%, 0.21 MB, 1859 KB/s, 0 seconds passed
    ...72%, 0.22 MB, 1927 KB/s, 0 seconds passed
    ...74%, 0.23 MB, 1936 KB/s, 0 seconds passed
    ...77%, 0.23 MB, 2001 KB/s, 0 seconds passed
    ...79%, 0.24 MB, 2067 KB/s, 0 seconds passed
    ...82%, 0.25 MB, 2132 KB/s, 0 seconds passed
    ...84%, 0.26 MB, 2186 KB/s, 0 seconds passed
    ...87%, 0.27 MB, 2251 KB/s, 0 seconds passed
    ...90%, 0.27 MB, 2299 KB/s, 0 seconds passed
    ...92%, 0.28 MB, 2364 KB/s, 0 seconds passed
    ...95%, 0.29 MB, 2423 KB/s, 0 seconds passed
    ...97%, 0.30 MB, 2487 KB/s, 0 seconds passed
    ...100%, 0.30 MB, 2547 KB/s, 0 seconds passed
+    ========== Task 0  (workload key: ["d7b65649a4dd54becea0a52aabbc5af5", 1, 1000, 1, 1000]) ==========
     placeholder = PLACEHOLDER [1, 1000]
     T_softmax_maxelem(i0) max= placeholder[i0, k]
     T_softmax_exp(i0, i1) = tir.exp((placeholder[i0, i1] - T_softmax_maxelem[i0]))
     T_softmax_expsum(i0) += T_softmax_exp[i0, k]
     T_softmax_norm(i0, i1) = (T_softmax_exp[i0, i1]/T_softmax_expsum[i0])
 
-    ========== Task 1  (workload key: ["35552028f3076f68df3063174e40b59f"]) ==========
+    ========== Task 1  (workload key: ["9847f8cc0b305137f49f2c5c0c8ab25d", 1, 1024, 1000, 1024, 1000, 1, 1000]) ==========
     placeholder = PLACEHOLDER [1, 1024]
     placeholder = PLACEHOLDER [1000, 1024]
     T_dense(i, j) += (placeholder[i, k]*placeholder[j, k])
     placeholder = PLACEHOLDER [1000]
     T_add(ax0, ax1) = (T_dense[ax0, ax1] + placeholder[ax1])
 
-    ========== Task 2  (workload key: ["cf95f3a14294b5393f63b280d0ec0ab6"]) ==========
+    ========== Task 2  (workload key: ["69115f188984ae34ede37c3b8ca40b43", 1, 7, 7, 1024, 1, 1, 1, 1024]) ==========
     placeholder = PLACEHOLDER [1, 7, 7, 1024]
     tensor(ax0, ax1, ax2, ax3) += placeholder[ax0, ((ax1*7) + rv0), ((ax2*7) + rv1), ax3]
     tensor(ax0, ax1, ax2, ax3) = (tensor[ax0, ax1, ax2, ax3]/(float32((select((bool)1, ((ax1 + 1)*7), (((ax1 + 1)*7) + 1)) - (ax1*7)))*float32((select((bool)1, ((ax2 + 1)*7), (((ax2 + 1)*7) + 1)) - (ax2*7)))))
 
-    ========== Task 3  (workload key: ["baa3a42d3cb6ab30685b0a7894b95da9"]) ==========
+    ========== Task 3  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 7, 7, 1024, 1, 1, 1024, 1024, 1, 1, 1, 1024, 1, 7, 7, 1024]) ==========
     placeholder = PLACEHOLDER [1, 7, 7, 1024]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 1024, 1024]
@@ -237,7 +237,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 4  (workload key: ["089861a00a7dfcc7196c2b6b5c807855"]) ==========
+    ========== Task 4  (workload key: ["06fce76bd84cb904eee50b905ca9449a", 1, 7, 7, 1024, 3, 3, 1024, 1, 1, 1, 1, 1024, 1, 7, 7, 1024]) ==========
     placeholder = PLACEHOLDER [1, 7, 7, 1024]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 8)) && (i2 >= 1)) && (i2 < 8)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 1024, 1]
@@ -246,7 +246,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (DepthwiseConv2d[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 5  (workload key: ["e7ff95f121397b87a0ca12ef428aef59"]) ==========
+    ========== Task 5  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 7, 7, 512, 1, 1, 512, 1024, 1, 1, 1, 1024, 1, 7, 7, 1024]) ==========
     placeholder = PLACEHOLDER [1, 7, 7, 512]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 512, 1024]
@@ -255,7 +255,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 6  (workload key: ["c3831fcb49bfdfc679be0bbfb987da82"]) ==========
+    ========== Task 6  (workload key: ["c87ba68bc180312f5716af09a77ca15b", 1, 14, 14, 512, 3, 3, 512, 1, 1, 1, 1, 512, 1, 7, 7, 512]) ==========
     placeholder = PLACEHOLDER [1, 14, 14, 512]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 15)) && (i2 >= 1)) && (i2 < 15)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 512, 1]
@@ -264,7 +264,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (DepthwiseConv2d[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 7  (workload key: ["33bb900cb60276282852b4b9c1346fe9"]) ==========
+    ========== Task 7  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 14, 14, 512, 1, 1, 512, 512, 1, 1, 1, 512, 1, 14, 14, 512]) ==========
     placeholder = PLACEHOLDER [1, 14, 14, 512]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 512, 512]
@@ -273,7 +273,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 8  (workload key: ["f2a48dd923600da67abb78b4895f8f7b"]) ==========
+    ========== Task 8  (workload key: ["06fce76bd84cb904eee50b905ca9449a", 1, 14, 14, 512, 3, 3, 512, 1, 1, 1, 1, 512, 1, 14, 14, 512]) ==========
     placeholder = PLACEHOLDER [1, 14, 14, 512]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 15)) && (i2 >= 1)) && (i2 < 15)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 512, 1]
@@ -282,7 +282,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (DepthwiseConv2d[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 9  (workload key: ["f6906ccbe2258e70648ea15f3c037ca0"]) ==========
+    ========== Task 9  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 14, 14, 256, 1, 1, 256, 512, 1, 1, 1, 512, 1, 14, 14, 512]) ==========
     placeholder = PLACEHOLDER [1, 14, 14, 256]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 256, 512]
@@ -291,7 +291,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 10  (workload key: ["381331b022e1b4ddc705aa66c2cb90c8"]) ==========
+    ========== Task 10  (workload key: ["c87ba68bc180312f5716af09a77ca15b", 1, 28, 28, 256, 3, 3, 256, 1, 1, 1, 1, 256, 1, 14, 14, 256]) ==========
     placeholder = PLACEHOLDER [1, 28, 28, 256]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 256, 1]
@@ -300,7 +300,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (DepthwiseConv2d[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 11  (workload key: ["413e7c2a210f0fbf2fadeb2686aba8ee"]) ==========
+    ========== Task 11  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 28, 28, 256, 1, 1, 256, 256, 1, 1, 1, 256, 1, 28, 28, 256]) ==========
     placeholder = PLACEHOLDER [1, 28, 28, 256]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 256, 256]
@@ -309,7 +309,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 12  (workload key: ["2b4a9c43c1bcbb5c68742378a4e72f74"]) ==========
+    ========== Task 12  (workload key: ["06fce76bd84cb904eee50b905ca9449a", 1, 28, 28, 256, 3, 3, 256, 1, 1, 1, 1, 256, 1, 28, 28, 256]) ==========
     placeholder = PLACEHOLDER [1, 28, 28, 256]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 256, 1]
@@ -318,7 +318,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (DepthwiseConv2d[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 13  (workload key: ["017340b550a0bda8bd8ec1933bc32756"]) ==========
+    ========== Task 13  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 28, 28, 128, 1, 1, 128, 256, 1, 1, 1, 256, 1, 28, 28, 256]) ==========
     placeholder = PLACEHOLDER [1, 28, 28, 128]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 128, 256]
@@ -327,7 +327,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 14  (workload key: ["539b0d6ae7b6e1610e29ae571b8b8c25"]) ==========
+    ========== Task 14  (workload key: ["c87ba68bc180312f5716af09a77ca15b", 1, 56, 56, 128, 3, 3, 128, 1, 1, 1, 1, 128, 1, 28, 28, 128]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 128]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 57)) && (i2 >= 1)) && (i2 < 57)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 128, 1]
@@ -336,7 +336,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (DepthwiseConv2d[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 15  (workload key: ["80b2e789f7bce126bde2176640ca76a4"]) ==========
+    ========== Task 15  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 56, 56, 128, 1, 1, 128, 128, 1, 1, 1, 128, 1, 56, 56, 128]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 128]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 128, 128]
@@ -345,7 +345,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 16  (workload key: ["3dba2989a90e19861af284d74e40f5cd"]) ==========
+    ========== Task 16  (workload key: ["06fce76bd84cb904eee50b905ca9449a", 1, 56, 56, 128, 3, 3, 128, 1, 1, 1, 1, 128, 1, 56, 56, 128]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 128]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 57)) && (i2 >= 1)) && (i2 < 57)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 128, 1]
@@ -354,7 +354,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (DepthwiseConv2d[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 17  (workload key: ["3b9a17584b6afa25229ef34c6f417660"]) ==========
+    ========== Task 17  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 56, 56, 64, 1, 1, 64, 128, 1, 1, 1, 128, 1, 56, 56, 128]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 64]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 64, 128]
@@ -363,7 +363,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 18  (workload key: ["a4553ca6a00b6c8adb555bcde25d95c4"]) ==========
+    ========== Task 18  (workload key: ["c87ba68bc180312f5716af09a77ca15b", 1, 112, 112, 64, 3, 3, 64, 1, 1, 1, 1, 64, 1, 56, 56, 64]) ==========
     placeholder = PLACEHOLDER [1, 112, 112, 64]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 113)) && (i2 >= 1)) && (i2 < 113)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 64, 1]
@@ -372,7 +372,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (DepthwiseConv2d[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 19  (workload key: ["63672689bf8f678a0abe0854828cbd3b"]) ==========
+    ========== Task 19  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 112, 112, 32, 1, 1, 32, 64, 1, 1, 1, 64, 1, 112, 112, 64]) ==========
     placeholder = PLACEHOLDER [1, 112, 112, 32]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 32, 64]
@@ -381,7 +381,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 20  (workload key: ["1ceacb63c63eaa3da881bff2858acdbf"]) ==========
+    ========== Task 20  (workload key: ["06fce76bd84cb904eee50b905ca9449a", 1, 112, 112, 32, 3, 3, 32, 1, 1, 1, 1, 32, 1, 112, 112, 32]) ==========
     placeholder = PLACEHOLDER [1, 112, 112, 32]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 113)) && (i2 >= 1)) && (i2 < 113)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 32, 1]
@@ -390,7 +390,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (DepthwiseConv2d[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 21  (workload key: ["2c2147047fd6dafd3d66d75165843f67"]) ==========
+    ========== Task 21  (workload key: ["98418eda02701ddd175ad50e364a0638", 1, 224, 224, 3, 3, 3, 3, 32, 1, 112, 1, 1, 1, 112, 1, 1, 1, 112, 112, 32]) ==========
     placeholder = PLACEHOLDER [1, 224, 224, 3]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 225)) && (i2 >= 1)) && (i2 < 225)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 3, 32]
@@ -583,7 +583,7 @@ Other Tips
 1. During the tuning, the auto-scheduler needs to compile many programs and
    extract feature from them. This part is CPU-intensive,
    so a high-performance CPU with many cores is recommended for faster search.
-2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json`
+2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`
    to distill the large log file and only save the best useful records.
 3. You can resume a search from the previous log file. You just need to
    add a new argument :code:`load_log_file` when creating the task scheduler
diff --git a/docs/_sources/tutorials/auto_scheduler/tune_network_x86.rst.txt b/docs/_sources/tutorials/auto_scheduler/tune_network_x86.rst.txt
index 208e788..c577ac4 100644
--- a/docs/_sources/tutorials/auto_scheduler/tune_network_x86.rst.txt
+++ b/docs/_sources/tutorials/auto_scheduler/tune_network_x86.rst.txt
@@ -184,26 +184,26 @@ The task scheduler will just optimize this objective.
  .. code-block:: none
 
     Extract tasks...
-    ========== Task 0  (workload key: ["b32ed43fb351136894c322ee49097a1a"]) ==========
+    ========== Task 0  (workload key: ["d7b65649a4dd54becea0a52aabbc5af5", 1, 1000, 1, 1000]) ==========
     placeholder = PLACEHOLDER [1, 1000]
     T_softmax_maxelem(i0) max= placeholder[i0, k]
     T_softmax_exp(i0, i1) = tir.exp((placeholder[i0, i1] - T_softmax_maxelem[i0]))
     T_softmax_expsum(i0) += T_softmax_exp[i0, k]
     T_softmax_norm(i0, i1) = (T_softmax_exp[i0, i1]/T_softmax_expsum[i0])
 
-    ========== Task 1  (workload key: ["eca51cb8a8335304c6e670bdb115a9b7"]) ==========
+    ========== Task 1  (workload key: ["9847f8cc0b305137f49f2c5c0c8ab25d", 1, 2048, 1000, 2048, 1000, 1, 1000]) ==========
     placeholder = PLACEHOLDER [1, 2048]
     placeholder = PLACEHOLDER [1000, 2048]
     T_dense(i, j) += (placeholder[i, k]*placeholder[j, k])
     placeholder = PLACEHOLDER [1000]
     T_add(ax0, ax1) = (T_dense[ax0, ax1] + placeholder[ax1])
 
-    ========== Task 2  (workload key: ["36ee2798ed60bae3bcd1bb89a0285fe8"]) ==========
+    ========== Task 2  (workload key: ["69115f188984ae34ede37c3b8ca40b43", 1, 7, 7, 2048, 1, 1, 1, 2048]) ==========
     placeholder = PLACEHOLDER [1, 7, 7, 2048]
     tensor(ax0, ax1, ax2, ax3) += placeholder[ax0, ((ax1*7) + rv0), ((ax2*7) + rv1), ax3]
     tensor(ax0, ax1, ax2, ax3) = (tensor[ax0, ax1, ax2, ax3]/(float32((select((bool)1, ((ax1 + 1)*7), (((ax1 + 1)*7) + 1)) - (ax1*7)))*float32((select((bool)1, ((ax2 + 1)*7), (((ax2 + 1)*7) + 1)) - (ax2*7)))))
 
-    ========== Task 3  (workload key: ["dcf6fcf5f56fa614bf9aef0c82382caf"]) ==========
+    ========== Task 3  (workload key: ["875556d12d0be2269206a7775d5296a6", 1, 7, 7, 512, 1, 1, 512, 2048, 1, 7, 7, 2048, 1, 1, 1, 2048, 1, 1, 1, 2048, 1, 7, 7, 2048]) ==========
     placeholder = PLACEHOLDER [1, 7, 7, 512]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 512, 2048]
@@ -216,7 +216,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (T_multiply[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 4  (workload key: ["7e3f0cf5a6dd80d36dab1a3dad92674a"]) ==========
+    ========== Task 4  (workload key: ["2350d19dc42a0665244368384c66b3a5", 1, 7, 7, 512, 3, 3, 512, 512, 1, 1, 1, 512, 1, 7, 7, 512]) ==========
     placeholder = PLACEHOLDER [1, 7, 7, 512]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 8)) && (i2 >= 1)) && (i2 < 8)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 512, 512]
@@ -225,7 +225,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 5  (workload key: ["e0a9eb3795b531085e0ebb772e7e800c"]) ==========
+    ========== Task 5  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 7, 7, 2048, 1, 1, 2048, 512, 1, 1, 1, 512, 1, 7, 7, 512]) ==========
     placeholder = PLACEHOLDER [1, 7, 7, 2048]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 2048, 512]
@@ -234,7 +234,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 6  (workload key: ["03614e726dc588d11887eb0953a77e53"]) ==========
+    ========== Task 6  (workload key: ["1cc666833c122282e3fcf3595901b12b", 1, 7, 7, 512, 1, 1, 512, 2048, 1, 7, 7, 2048, 1, 7, 7, 2048]) ==========
     placeholder = PLACEHOLDER [1, 7, 7, 512]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 512, 2048]
@@ -242,7 +242,7 @@ The task scheduler will just optimize this objective.
     placeholder = PLACEHOLDER [1, 7, 7, 2048]
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])
 
-    ========== Task 7  (workload key: ["7657f886f5e9d8b5f19a5fd2c5b90d8d"]) ==========
+    ========== Task 7  (workload key: ["de7d1695278cf52778b038e6573d7626", 1, 14, 14, 1024, 1, 1, 1024, 512, 1, 1, 1, 512, 1, 7, 7, 512]) ==========
     placeholder = PLACEHOLDER [1, 14, 14, 1024]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 1024, 512]
@@ -251,7 +251,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 8  (workload key: ["7e09b626cf077cd419190fee02091dd6"]) ==========
+    ========== Task 8  (workload key: ["1b524af89dd867d26059e1f621cf987c", 1, 14, 14, 256, 1, 1, 256, 1024, 1, 14, 14, 1024, 1, 1, 1, 1024, 1, 14, 14, 1024]) ==========
     placeholder = PLACEHOLDER [1, 14, 14, 256]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 256, 1024]
@@ -262,7 +262,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (T_add[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 9  (workload key: ["3d3c292c1a43eacd9d5453bf9c51afae"]) ==========
+    ========== Task 9  (workload key: ["c68f92478eb18145106184c587d212b6", 1, 14, 14, 256, 6, 6, 256, 256, 1, 1, 1, 256, 1, 14, 14, 256]) ==========
     placeholder = PLACEHOLDER [1, 14, 14, 256]
     data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 15)) && (i2 >= 1)) && (i2 < 15)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 16), ((floormod(floordiv(p, 4), 4)*4) + eps), ((floormod(p, 4)*4) + nu), ci]
@@ -277,7 +277,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 10  (workload key: ["e043f834cc7f19597227e09dc7f59503"]) ==========
+    ========== Task 10  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 14, 14, 1024, 1, 1, 1024, 256, 1, 1, 1, 256, 1, 14, 14, 256]) ==========
     placeholder = PLACEHOLDER [1, 14, 14, 1024]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 1024, 256]
@@ -286,7 +286,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 11  (workload key: ["cd7c4a374fb2bbc0d075c8cae638ad14"]) ==========
+    ========== Task 11  (workload key: ["1cc666833c122282e3fcf3595901b12b", 1, 14, 14, 256, 1, 1, 256, 1024, 1, 14, 14, 1024, 1, 14, 14, 1024]) ==========
     placeholder = PLACEHOLDER [1, 14, 14, 256]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 256, 1024]
@@ -294,7 +294,7 @@ The task scheduler will just optimize this objective.
     placeholder = PLACEHOLDER [1, 14, 14, 1024]
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])
 
-    ========== Task 12  (workload key: ["1dce2c5e4269b8a12dfc50cd4dd23ff1"]) ==========
+    ========== Task 12  (workload key: ["de7d1695278cf52778b038e6573d7626", 1, 28, 28, 512, 1, 1, 512, 256, 1, 1, 1, 256, 1, 14, 14, 256]) ==========
     placeholder = PLACEHOLDER [1, 28, 28, 512]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 512, 256]
@@ -303,7 +303,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 13  (workload key: ["d3b36ce001dc24d693facfbdae1979b4"]) ==========
+    ========== Task 13  (workload key: ["1b524af89dd867d26059e1f621cf987c", 1, 28, 28, 128, 1, 1, 128, 512, 1, 28, 28, 512, 1, 1, 1, 512, 1, 28, 28, 512]) ==========
     placeholder = PLACEHOLDER [1, 28, 28, 128]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 128, 512]
@@ -314,7 +314,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (T_add[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 14  (workload key: ["9c75fdad44e8597c31107fc9e8c350e6"]) ==========
+    ========== Task 14  (workload key: ["ecec634b4882c5731f86cce3109db636", 1, 28, 28, 128, 6, 6, 128, 128, 1, 1, 1, 128, 1, 28, 28, 128]) ==========
     placeholder = PLACEHOLDER [1, 28, 28, 128]
     data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 49), ((floormod(floordiv(p, 7), 7)*4) + eps), ((floormod(p, 7)*4) + nu), ci]
@@ -329,7 +329,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 15  (workload key: ["45acfc473c772458684f36a34549d8aa"]) ==========
+    ========== Task 15  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 28, 28, 512, 1, 1, 512, 128, 1, 1, 1, 128, 1, 28, 28, 128]) ==========
     placeholder = PLACEHOLDER [1, 28, 28, 512]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 512, 128]
@@ -338,7 +338,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 16  (workload key: ["5e3ceb6e23ae8c351d5a1770d5fc6c7c"]) ==========
+    ========== Task 16  (workload key: ["1cc666833c122282e3fcf3595901b12b", 1, 28, 28, 128, 1, 1, 128, 512, 1, 28, 28, 512, 1, 28, 28, 512]) ==========
     placeholder = PLACEHOLDER [1, 28, 28, 128]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 128, 512]
@@ -346,7 +346,7 @@ The task scheduler will just optimize this objective.
     placeholder = PLACEHOLDER [1, 28, 28, 512]
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])
 
-    ========== Task 17  (workload key: ["a085717fb3dcb046e5c4c2c04d3dc541"]) ==========
+    ========== Task 17  (workload key: ["de7d1695278cf52778b038e6573d7626", 1, 56, 56, 256, 1, 1, 256, 128, 1, 1, 1, 128, 1, 28, 28, 128]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 256]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 256, 128]
@@ -355,7 +355,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 18  (workload key: ["691feef049c8693bbe91bd5e7c9cdf34"]) ==========
+    ========== Task 18  (workload key: ["1b524af89dd867d26059e1f621cf987c", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256, 1, 1, 1, 256, 1, 56, 56, 256]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 64]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 64, 256]
@@ -366,7 +366,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (T_add[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 19  (workload key: ["a9e632e5167afb60fbe29e7aeef1d152"]) ==========
+    ========== Task 19  (workload key: ["2350d19dc42a0665244368384c66b3a5", 1, 56, 56, 64, 3, 3, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 64]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 57)) && (i2 >= 1)) && (i2 < 57)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
     placeholder = PLACEHOLDER [3, 3, 64, 64]
@@ -375,7 +375,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 20  (workload key: ["b51e06c1131d4cded40d1b215f722a4e"]) ==========
+    ========== Task 20  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 56, 56, 256, 1, 1, 256, 64, 1, 1, 1, 64, 1, 56, 56, 64]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 256]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 256, 64]
@@ -384,7 +384,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 21  (workload key: ["8fcee68a4342c38248a827f1c6c69177"]) ==========
+    ========== Task 21  (workload key: ["1cc666833c122282e3fcf3595901b12b", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256, 1, 56, 56, 256]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 64]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 64, 256]
@@ -392,7 +392,7 @@ The task scheduler will just optimize this objective.
     placeholder = PLACEHOLDER [1, 56, 56, 256]
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])
 
-    ========== Task 22  (workload key: ["8dd7d81db440763f622f03fdc99e6d46"]) ==========
+    ========== Task 22  (workload key: ["6b7583cf23c7c37d3212cad9d06e58c1", 1, 56, 56, 64, 1, 1, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 64]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 64, 64]
@@ -401,7 +401,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 23  (workload key: ["ba2026d923536b75e9b4faed89287d5f"]) ==========
+    ========== Task 23  (workload key: ["a5612fdeb9db4d579a75ec225ea4c06a", 1, 112, 112, 64, 1, 1, 1, 64, 1, 56, 56, 64]) ==========
     placeholder = PLACEHOLDER [1, 112, 112, 64]
     pad_temp(ax0, ax1, ax2, ax3) = tir.if_then_else(((((ax1 >= 1) && (ax1 < 113)) && (ax2 >= 1)) && (ax2 < 113)), placeholder[ax0, (ax1 - 1), (ax2 - 1), ax3], -3.40282e+38f)
     tensor(ax0, ax1, ax2, ax3) max= pad_temp[ax0, ((ax1*2) + dh), ((ax2*2) + dw), ax3]
@@ -409,7 +409,7 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (tensor[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 24  (workload key: ["a0eb8d6048282a4a0986cc2ccf14eaa2"]) ==========
+    ========== Task 24  (workload key: ["12b88bedece6984af589a28b43e0f3c4", 1, 224, 224, 3, 7, 7, 3, 64, 1, 1, 1, 64, 1, 112, 112, 64]) ==========
     placeholder = PLACEHOLDER [1, 224, 224, 3]
     PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 3) && (i1 < 227)) && (i2 >= 3)) && (i2 < 227)), placeholder[i0, (i1 - 3), (i2 - 3), i3], 0f)
     placeholder = PLACEHOLDER [7, 7, 3, 64]
@@ -418,25 +418,25 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 25  (workload key: ["45b4de07687dee43ee1cbde9f516b2bf"]) ==========
+    ========== Task 25  (workload key: ["7006235cfc29b73be524cf390ed5a977", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 64]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 64, 256]
     Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, (yy + ry), (xx + rx), rc]*placeholder[ry, rx, rc, ff])
 
-    ========== Task 26  (workload key: ["b2010aa63c95dedf1f58f3fe8bc78634"]) ==========
+    ========== Task 26  (workload key: ["f4380bb1dc62422a69ad4a1a9771f927", 1, 56, 56, 256, 1, 1, 256, 512, 1, 28, 28, 512]) ==========
     placeholder = PLACEHOLDER [1, 56, 56, 256]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 256, 512]
     Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])
 
-    ========== Task 27  (workload key: ["4d7e646d99bfa3cea8245bd7100369cb"]) ==========
+    ========== Task 27  (workload key: ["f4380bb1dc62422a69ad4a1a9771f927", 1, 28, 28, 512, 1, 1, 512, 1024, 1, 14, 14, 1024]) ==========
     placeholder = PLACEHOLDER [1, 28, 28, 512]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 512, 1024]
     Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])
 
-    ========== Task 28  (workload key: ["537c8642716948c33a6eaaabc86b159d"]) ==========
+    ========== Task 28  (workload key: ["f4380bb1dc62422a69ad4a1a9771f927", 1, 14, 14, 1024, 1, 1, 1024, 2048, 1, 7, 7, 2048]) ==========
     placeholder = PLACEHOLDER [1, 14, 14, 1024]
     PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
     placeholder = PLACEHOLDER [1, 1, 1024, 2048]
@@ -598,7 +598,7 @@ so we can read the log file and load the best schedules.
 
     Compile...
     Evaluate inference time cost...
-    Mean inference time (std dev): 485.35 ms (3.04 ms)
+    Mean inference time (std dev): 553.12 ms (0.08 ms)
 
 
 
@@ -607,7 +607,7 @@ Other Tips
 1. During the tuning, the auto-scheduler needs to compile many programs and
    extract feature from them. This part is CPU-intensive,
    so a high-performance CPU with many cores is recommended for faster search.
-2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json`
+2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`
    to distill the large log file and only save the best useful records.
 3. You can resume a search from the previous log file. You just need to
    add a new argument :code:`load_log_file` when creating the task scheduler
diff --git a/docs/_sources/tutorials/autotvm/sg_execution_times.rst.txt b/docs/_sources/tutorials/autotvm/sg_execution_times.rst.txt
index 5085163..cd8dfb7 100644
--- a/docs/_sources/tutorials/autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorials/autotvm/sg_execution_times.rst.txt
@@ -5,11 +5,11 @@
 
 Computation times
 =================
-**01:01.038** total execution time for **tutorials_autotvm** files:
+**00:35.822** total execution time for **tutorials_autotvm** files:
 
-- **00:31.435**: :ref:`sphx_glr_tutorials_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``)
-- **00:28.900**: :ref:`sphx_glr_tutorials_autotvm_tune_simple_template.py` (``tune_simple_template.py``)
-- **00:00.202**: :ref:`sphx_glr_tutorials_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``)
-- **00:00.170**: :ref:`sphx_glr_tutorials_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``)
-- **00:00.169**: :ref:`sphx_glr_tutorials_autotvm_tune_relay_mobile_gpu.py` (``tune_relay_mobile_gpu.py``)
+- **00:28.199**: :ref:`sphx_glr_tutorials_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``)
+- **00:06.922**: :ref:`sphx_glr_tutorials_autotvm_tune_simple_template.py` (``tune_simple_template.py``)
+- **00:00.207**: :ref:`sphx_glr_tutorials_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``)
+- **00:00.173**: :ref:`sphx_glr_tutorials_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``)
 - **00:00.162**: :ref:`sphx_glr_tutorials_autotvm_tune_relay_arm.py` (``tune_relay_arm.py``)
+- **00:00.159**: :ref:`sphx_glr_tutorials_autotvm_tune_relay_mobile_gpu.py` (``tune_relay_mobile_gpu.py``)
diff --git a/docs/_sources/tutorials/autotvm/tune_conv2d_cuda.rst.txt b/docs/_sources/tutorials/autotvm/tune_conv2d_cuda.rst.txt
index 9fe4554..b4f2fa8 100644
--- a/docs/_sources/tutorials/autotvm/tune_conv2d_cuda.rst.txt
+++ b/docs/_sources/tutorials/autotvm/tune_conv2d_cuda.rst.txt
@@ -242,26 +242,26 @@ for this template
        7 unroll_explicit: OtherOption([0, 1]) len=2
     )
     Get devices for measurement successfully!
-    No: 1   GFLOPS: 195.50/195.50   result: MeasureResult(costs=(0.0011841530204081633,), error_no=0, all_cost=1.680870771408081, timestamp=1610825374.8415902)     [('tile_f', [-1, 2, 64, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4881186
-    No: 2   GFLOPS: 0.00/195.50     result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7f2edbf60811]\n  [bt] (3) /workspace/build/libtvm.so(+0x6c5317) [0x7f2edb30b317]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7f2edb307d3d]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
-    No: 3   GFLOPS: 180.03/195.50   result: MeasureResult(costs=(0.0012859004596774194,), error_no=0, all_cost=1.6597881317138672, timestamp=1610825376.337518)     [('tile_f', [-1, 4, 32, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3605182
-    No: 4   GFLOPS: 0.00/195.50     result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7f2edbf60811]\n  [bt] (3) /workspace/build/libtvm.so(+0x6c5317) [0x7f2edb30b317]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7f2edb307d3d]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
-    No: 5   GFLOPS: 0.00/195.50     result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7f2edbf60811]\n  [bt] (3) /workspace/build/libtvm.so(+0x6c5317) [0x7f2edb30b317]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7f2edb307d3d]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
-    No: 6   GFLOPS: 0.00/195.50     result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7f2edbf60811]\n  [bt] (3) /workspace/build/libtvm.so(+0x6c5317) [0x7f2edb30b317]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7f2edb307d3d]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
-    No: 7   GFLOPS: 0.00/195.50     result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7f2edbf60811]\n  [bt] (3) /workspace/build/libtvm.so(+0x6c5317) [0x7f2edb30b317]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7f2edb307d3d]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
-    No: 8   GFLOPS: 1.76/195.50     result: MeasureResult(costs=(0.13133982625,), error_no=0, all_cost=3.3585078716278076, timestamp=1610825379.7806785)    [('tile_f', [-1, 2, 4, 64]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2716108
-    No: 9   GFLOPS: 12.81/195.50    result: MeasureResult(costs=(0.018067887,), error_no=0, all_cost=1.7872579097747803, timestamp=1610825382.9570186)      [('tile_f', [-1, 1, 4, 2]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1263092
-    No: 10  GFLOPS: 230.05/230.05   result: MeasureResult(costs=(0.0010063026625,), error_no=0, all_cost=1.6115117073059082, timestamp=1610825384.0870204)  [('tile_f', [-1, 1, 32, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,8921130
-    No: 11  GFLOPS: 0.00/230.05     result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7f2edbf60811]\n  [bt] (3) /workspace/build/libtvm.so(+0x6c5317) [0x7f2edb30b317]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7f2edb307d3d]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
-    No: 12  GFLOPS: 120.13/230.05   result: MeasureResult(costs=(0.0019270348461538462,), error_no=0, all_cost=1.404073715209961, timestamp=1610825385.2786767)     [('tile_f', [-1, 2, 32, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,5036371
-    No: 13  GFLOPS: 0.00/230.05     result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7f2edbf60811]\n  [bt] (3) /workspace/build/libtvm.so(+0x6c5317) [0x7f2edb30b317]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7f2edb307d3d]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
-    No: 14  GFLOPS: 0.00/230.05     result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7f2edbf60811]\n  [bt] (3) /workspace/build/libtvm.so(+0x6c5317) [0x7f2edb30b317]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7f2edb307d3d]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
-    No: 15  GFLOPS: 83.10/230.05    result: MeasureResult(costs=(0.0027859296052631577,), error_no=0, all_cost=1.571483850479126, timestamp=1610825386.768219)      [('tile_f', [-1, 1, 1, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3582580
-    No: 16  GFLOPS: 0.00/230.05     result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7f2edbf60811]\n  [bt] (3) /workspace/build/libtvm.so(+0x6c5317) [0x7f2edb30b317]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7f2edb307d3d]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
-    No: 17  GFLOPS: 0.00/230.05     result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7f2edbf60811]\n  [bt] (3) /workspace/build/libtvm.so(+0x6c5317) [0x7f2edb30b317]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7f2edb307d3d]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
-    No: 18  GFLOPS: 0.00/230.05     result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7f2edbf60811]\n  [bt] (3) /workspace/build/libtvm.so(+0x6c5317) [0x7f2edb30b317]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7f2edb307d3d]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
-    No: 19  GFLOPS: 17.45/230.05    result: MeasureResult(costs=(0.013266478888888888,), error_no=0, all_cost=1.6585257053375244, timestamp=1610825390.1547663)     [('tile_f', [-1, 8, 64, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4107668
-    No: 20  GFLOPS: 0.00/230.05     result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7f2edbf60811]\n  [bt] (3) /workspace/build/libtvm.so(+0x6c5317) [0x7f2edb30b317]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7f2edb307d3d]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
+    No: 1   GFLOPS: 0.00/0.00       result: MeasureResult(costs=('request_remote() argument after ** must be a mapping, not tuple',), error_no=7, all_cost=4, timestamp=1614595273.4283657) [('tile_f', [-1, 2, 64, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4881186
+    No: 2   GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7fb513396d81]\n  [bt] (3) /workspace/build/libtvm.so(+0x7b29b6) [0x7fb51268a9b6]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7fb512687eed]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
+    No: 3   GFLOPS: 0.00/0.00       result: MeasureResult(costs=('request_remote() argument after ** must be a mapping, not tuple',), error_no=7, all_cost=4, timestamp=1614595274.0546365) [('tile_f', [-1, 4, 32, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3605182
+    No: 4   GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7fb513396d81]\n  [bt] (3) /workspace/build/libtvm.so(+0x7b29b6) [0x7fb51268a9b6]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7fb512687eed]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
+    No: 5   GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7fb513396d81]\n  [bt] (3) /workspace/build/libtvm.so(+0x7b29b6) [0x7fb51268a9b6]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7fb512687eed]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
+    No: 6   GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7fb513396d81]\n  [bt] (3) /workspace/build/libtvm.so(+0x7b29b6) [0x7fb51268a9b6]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7fb512687eed]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
+    No: 7   GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7fb513396d81]\n  [bt] (3) /workspace/build/libtvm.so(+0x7b29b6) [0x7fb51268a9b6]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7fb512687eed]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
+    No: 8   GFLOPS: 0.00/0.00       result: MeasureResult(costs=('request_remote() argument after ** must be a mapping, not tuple',), error_no=7, all_cost=4, timestamp=1614595275.569044)  [('tile_f', [-1, 2, 4, 64]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2716108
+    No: 9   GFLOPS: 0.00/0.00       result: MeasureResult(costs=('request_remote() argument after ** must be a mapping, not tuple',), error_no=7, all_cost=4, timestamp=1614595278.2489731) [('tile_f', [-1, 1, 4, 2]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1263092
+    No: 10  GFLOPS: 0.00/0.00       result: MeasureResult(costs=('request_remote() argument after ** must be a mapping, not tuple',), error_no=7, all_cost=4, timestamp=1614595278.554763)  [('tile_f', [-1, 1, 32, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,8921130
+    No: 11  GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7fb513396d81]\n  [bt] (3) /workspace/build/libtvm.so(+0x7b29b6) [0x7fb51268a9b6]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7fb512687eed]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
+    No: 12  GFLOPS: 0.00/0.00       result: MeasureResult(costs=('request_remote() argument after ** must be a mapping, not tuple',), error_no=7, all_cost=4, timestamp=1614595279.1666207) [('tile_f', [-1, 2, 32, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,5036371
+    No: 13  GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7fb513396d81]\n  [bt] (3) /workspace/build/libtvm.so(+0x7b29b6) [0x7fb51268a9b6]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7fb512687eed]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
+    No: 14  GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7fb513396d81]\n  [bt] (3) /workspace/build/libtvm.so(+0x7b29b6) [0x7fb51268a9b6]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7fb512687eed]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
+    No: 15  GFLOPS: 0.00/0.00       result: MeasureResult(costs=('request_remote() argument after ** must be a mapping, not tuple',), error_no=7, all_cost=4, timestamp=1614595280.0907054) [('tile_f', [-1, 1, 1, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3582580
+    No: 16  GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7fb513396d81]\n  [bt] (3) /workspace/build/libtvm.so(+0x7b29b6) [0x7fb51268a9b6]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7fb512687eed]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
+    No: 17  GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7fb513396d81]\n  [bt] (3) /workspace/build/libtvm.so(+0x7b29b6) [0x7fb51268a9b6]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7fb512687eed]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
+    No: 18  GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7fb513396d81]\n  [bt] (3) /workspace/build/libtvm.so(+0x7b29b6) [0x7fb51268a9b6]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7fb512687eed]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
+    No: 19  GFLOPS: 0.00/0.00       result: MeasureResult(costs=('request_remote() argument after ** must be a mapping, not tuple',), error_no=7, all_cost=4, timestamp=1614595282.893545)  [('tile_f', [-1, 8, 64, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4107668
+    No: 20  GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /workspace/build/libtvm.so(TVMFuncCall+0x61) [0x7fb513396d81]\n  [bt] (3) /workspace/build/libtvm.so(+0x7b29b6) [0x7fb51268a9b6]\n  [bt] (2) /workspace/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ed) [0x7fb512687eed]\n  [bt] (1) /workspace/build/libtvm.so(tvm::tir::transform::PrimFunc [...]
 
 
 
@@ -313,8 +313,8 @@ and measure running time.
 
 
     Best config:
-    [('tile_f', [-1, 1, 32, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,8921130
-    Time cost of this operator: 0.001402
+    ,None
+    Time cost of this operator: 0.010105
 
 
 
diff --git a/docs/_sources/tutorials/autotvm/tune_simple_template.rst.txt b/docs/_sources/tutorials/autotvm/tune_simple_template.rst.txt
index 5cbd567..522a1ad 100644
--- a/docs/_sources/tutorials/autotvm/tune_simple_template.rst.txt
+++ b/docs/_sources/tutorials/autotvm/tune_simple_template.rst.txt
@@ -370,16 +370,16 @@ used to get the best config later.
  .. code-block:: none
 
     Get devices for measurement successfully!
-    No: 1   GFLOPS: 0.52/0.52       result: MeasureResult(costs=(0.5192541842,), error_no=0, all_cost=8.707226037979126, timestamp=1610825351.4167655)      [('tile_y', [-1, 64]), ('tile_x', [-1, 1])],None,6
-    No: 2   GFLOPS: 2.15/2.15       result: MeasureResult(costs=(0.1250656946,), error_no=0, all_cost=2.4241549968719482, timestamp=1610825353.9296935)     [('tile_y', [-1, 512]), ('tile_x', [-1, 8])],None,39
-    No: 3   GFLOPS: 2.57/2.57       result: MeasureResult(costs=(0.1045112922,), error_no=0, all_cost=2.1576664447784424, timestamp=1610825356.119733)      [('tile_y', [-1, 2]), ('tile_x', [-1, 8])],None,31
-    No: 4   GFLOPS: 7.55/7.55       result: MeasureResult(costs=(0.0355576346,), error_no=0, all_cost=1.0249900817871094, timestamp=1610825357.1883035)     [('tile_y', [-1, 1]), ('tile_x', [-1, 32])],None,50
-    No: 5   GFLOPS: 13.54/13.54     result: MeasureResult(costs=(0.0198238056,), error_no=0, all_cost=0.7991337776184082, timestamp=1610825358.0089042)     [('tile_y', [-1, 256]), ('tile_x', [-1, 64])],None,68
-    No: 6   GFLOPS: 12.00/13.54     result: MeasureResult(costs=(0.0223625672,), error_no=0, all_cost=0.8021728992462158, timestamp=1610825358.8741274)     [('tile_y', [-1, 256]), ('tile_x', [-1, 512])],None,98
-    No: 7   GFLOPS: 0.92/13.54      result: MeasureResult(costs=(0.2916841556,), error_no=0, all_cost=5.11684250831604, timestamp=1610825364.0431085)       [('tile_y', [-1, 128]), ('tile_x', [-1, 2])],None,17
-    No: 8   GFLOPS: 2.49/13.54      result: MeasureResult(costs=(0.107873912,), error_no=0, all_cost=2.1294100284576416, timestamp=1610825366.2664104)      [('tile_y', [-1, 8]), ('tile_x', [-1, 4])],None,23
-    No: 9   GFLOPS: 11.57/13.54     result: MeasureResult(costs=(0.023191816,), error_no=0, all_cost=0.7419209480285645, timestamp=1610825367.8942754)      [('tile_y', [-1, 256]), ('tile_x', [-1, 32])],None,58
-    No: 10  GFLOPS: 14.76/14.76     result: MeasureResult(costs=(0.018183146,), error_no=0, all_cost=0.7862186431884766, timestamp=1610825368.6922603)      [('tile_y', [-1, 64]), ('tile_x', [-1, 128])],None,76
+    No: 1   GFLOPS: 0.00/0.00       result: MeasureResult(costs=('request_remote() argument after ** must be a mapping, not tuple',), error_no=7, all_cost=10, timestamp=1614595264.4104242)        [('tile_y', [-1, 64]), ('tile_x', [-1, 1])],None,6
+    No: 2   GFLOPS: 0.00/0.00       result: MeasureResult(costs=('request_remote() argument after ** must be a mapping, not tuple',), error_no=7, all_cost=10, timestamp=1614595264.714398) [('tile_y', [-1, 512]), ('tile_x', [-1, 8])],None,39
+    No: 3   GFLOPS: 0.00/0.00       result: MeasureResult(costs=('request_remote() argument after ** must be a mapping, not tuple',), error_no=7, all_cost=10, timestamp=1614595265.0153906)        [('tile_y', [-1, 2]), ('tile_x', [-1, 8])],None,31
+    No: 4   GFLOPS: 0.00/0.00       result: MeasureResult(costs=('request_remote() argument after ** must be a mapping, not tuple',), error_no=7, all_cost=10, timestamp=1614595265.3186316)        [('tile_y', [-1, 1]), ('tile_x', [-1, 32])],None,50
+    No: 5   GFLOPS: 0.00/0.00       result: MeasureResult(costs=('request_remote() argument after ** must be a mapping, not tuple',), error_no=7, all_cost=10, timestamp=1614595265.6324012)        [('tile_y', [-1, 256]), ('tile_x', [-1, 64])],None,68
+    No: 6   GFLOPS: 0.00/0.00       result: MeasureResult(costs=('request_remote() argument after ** must be a mapping, not tuple',), error_no=7, all_cost=10, timestamp=1614595265.934279) [('tile_y', [-1, 256]), ('tile_x', [-1, 512])],None,98
+    No: 7   GFLOPS: 0.00/0.00       result: MeasureResult(costs=('request_remote() argument after ** must be a mapping, not tuple',), error_no=7, all_cost=10, timestamp=1614595266.2497342)        [('tile_y', [-1, 128]), ('tile_x', [-1, 2])],None,17
+    No: 8   GFLOPS: 0.00/0.00       result: MeasureResult(costs=('request_remote() argument after ** must be a mapping, not tuple',), error_no=7, all_cost=10, timestamp=1614595266.5602283)        [('tile_y', [-1, 8]), ('tile_x', [-1, 4])],None,23
+    No: 9   GFLOPS: 0.00/0.00       result: MeasureResult(costs=('request_remote() argument after ** must be a mapping, not tuple',), error_no=7, all_cost=10, timestamp=1614595267.6910803)        [('tile_y', [-1, 256]), ('tile_x', [-1, 32])],None,58
+    No: 10  GFLOPS: 0.00/0.00       result: MeasureResult(costs=('request_remote() argument after ** must be a mapping, not tuple',), error_no=7, all_cost=10, timestamp=1614595267.9915197)        [('tile_y', [-1, 64]), ('tile_x', [-1, 128])],None,76
 
 
 
diff --git a/docs/_sources/tutorials/dev/bring_your_own_datatypes.rst.txt b/docs/_sources/tutorials/dev/bring_your_own_datatypes.rst.txt
index 1f8f373..2c3969d 100644
--- a/docs/_sources/tutorials/dev/bring_your_own_datatypes.rst.txt
+++ b/docs/_sources/tutorials/dev/bring_your_own_datatypes.rst.txt
@@ -155,7 +155,7 @@ Additionally, note the "32" after the datatype: this is the bitwidth of the cust
 
  .. code-block:: none
 
-      Check failed: name_to_code_.find(type_name) != name_to_code_.end() == false: Type name myfloat not registered
+
 
 
 
@@ -447,8 +447,8 @@ It's easy to execute MobileNet with native TVM:
  .. code-block:: none
 
     File /workspace/.tvm_test_data/data/cat.png exists, skip.
-    [ -7.5350146   2.0367992 -12.706643   -5.637861  -12.684059    4.0723605
-       2.6188765   3.404948   -9.86791   -24.533121 ]
+    [ -7.5350165   2.0368009 -12.706646   -5.63786   -12.684058    4.0723605
+       2.618876    3.4049501  -9.867913  -24.53311  ]
 
 
 
@@ -641,8 +641,8 @@ Now we can finally run the model:
 
  .. code-block:: none
 
-    [ -7.5350146   2.0367992 -12.706643   -5.637861  -12.684059    4.0723605
-       2.6188765   3.404948   -9.86791   -24.533121 ]
+    [ -7.5350165   2.0368009 -12.706646   -5.63786   -12.684058    4.0723605
+       2.618876    3.4049501  -9.867913  -24.53311  ]
 
 
 
diff --git a/docs/_sources/tutorials/dev/low_level_custom_pass.rst.txt b/docs/_sources/tutorials/dev/low_level_custom_pass.rst.txt
index 7543617..45da42f 100644
--- a/docs/_sources/tutorials/dev/low_level_custom_pass.rst.txt
+++ b/docs/_sources/tutorials/dev/low_level_custom_pass.rst.txt
@@ -74,8 +74,8 @@ our customized lowering pass to manipulate the IR directly instead of using sche
 
     primfn(a_1: handle, b_1: handle, c_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {b: Buffer(b_2: Pointer(float32), float32, [128], []),
-                 c: Buffer(c_2: Pointer(float32), float32, [128], []),
+      buffers = {c: Buffer(c_2: Pointer(float32), float32, [128], []),
+                 b: Buffer(b_2: Pointer(float32), float32, [128], []),
                  a: Buffer(a_2: Pointer(float32), float32, [128], [])}
       buffer_map = {a_1: a, b_1: b, c_1: c} {
       for (i: int32, 0, 128) {
@@ -156,8 +156,8 @@ this value.
             name = op.loop_var.name
             lo, li = te.var(name + ".outer"), te.var(name + ".inner")
             body = tvm.tir.stmt_functor.substitute(op.body, {op.loop_var: lo * 8 + li})
-            body = tvm.tir.For(li, 0, 8, tvm.tir.For.Vectorized, 0, body)
-            body = tvm.tir.For(lo, 0, extent // 8, tvm.tir.For.Serial, 0, body)
+            body = tvm.tir.For(li, 0, 8, tvm.tir.ForKind.VECTORIZED, body)
+            body = tvm.tir.For(lo, 0, extent // 8, tvm.tir.ForKind.SERIAL, body)
             return body
         return None
 
@@ -221,8 +221,8 @@ Thus, a good place to put this transformation pass is just after Phase 1.
 
     primfn(a_1: handle, b_1: handle, c_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {c: Buffer(c_2: Pointer(float32), float32, [128], []),
-                 b: Buffer(b_2: Pointer(float32), float32, [128], []),
+      buffers = {b: Buffer(b_2: Pointer(float32), float32, [128], []),
+                 c: Buffer(c_2: Pointer(float32), float32, [128], []),
                  a: Buffer(a_2: Pointer(float32), float32, [128], [])}
       buffer_map = {a_1: a, b_1: b, c_1: c} {
       for (i.outer: int32, 0, 16) {
diff --git a/docs/_sources/tutorials/dev/sg_execution_times.rst.txt b/docs/_sources/tutorials/dev/sg_execution_times.rst.txt
index c5bb9a6..ab90c0f 100644
--- a/docs/_sources/tutorials/dev/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorials/dev/sg_execution_times.rst.txt
@@ -5,8 +5,8 @@
 
 Computation times
 =================
-**00:32.241** total execution time for **tutorials_dev** files:
+**00:32.506** total execution time for **tutorials_dev** files:
 
-- **00:31.665**: :ref:`sphx_glr_tutorials_dev_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``)
-- **00:00.392**: :ref:`sphx_glr_tutorials_dev_use_pass_infra.py` (``use_pass_infra.py``)
-- **00:00.184**: :ref:`sphx_glr_tutorials_dev_low_level_custom_pass.py` (``low_level_custom_pass.py``)
+- **00:31.932**: :ref:`sphx_glr_tutorials_dev_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``)
+- **00:00.396**: :ref:`sphx_glr_tutorials_dev_use_pass_infra.py` (``use_pass_infra.py``)
+- **00:00.178**: :ref:`sphx_glr_tutorials_dev_low_level_custom_pass.py` (``low_level_custom_pass.py``)
diff --git a/docs/_sources/tutorials/frontend/deploy_model_on_android.rst.txt b/docs/_sources/tutorials/frontend/deploy_model_on_android.rst.txt
index b5a4d09..0f22183 100644
--- a/docs/_sources/tutorials/frontend/deploy_model_on_android.rst.txt
+++ b/docs/_sources/tutorials/frontend/deploy_model_on_android.rst.txt
@@ -421,7 +421,7 @@ Execute on TVM
 
     TVM prediction top-1: tiger cat
     Evaluate inference time cost...
-    Mean inference time (std dev): 5.47 ms (0.14 ms)
+    Mean inference time (std dev): 6.03 ms (0.06 ms)
 
 
 
diff --git a/docs/_sources/tutorials/frontend/deploy_object_detection_pytorch.rst.txt b/docs/_sources/tutorials/frontend/deploy_object_detection_pytorch.rst.txt
index 54ab9f9..1641c47 100644
--- a/docs/_sources/tutorials/frontend/deploy_object_detection_pytorch.rst.txt
+++ b/docs/_sources/tutorials/frontend/deploy_object_detection_pytorch.rst.txt
@@ -247,7 +247,7 @@ Get boxes with score larger than 0.9
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  32.900 seconds)
+   **Total running time of the script:** ( 1 minutes  17.425 seconds)
 
 
 .. _sphx_glr_download_tutorials_frontend_deploy_object_detection_pytorch.py:
diff --git a/docs/_sources/tutorials/frontend/deploy_prequantized.rst.txt b/docs/_sources/tutorials/frontend/deploy_prequantized.rst.txt
index 92ce50e..ee58de6 100644
--- a/docs/_sources/tutorials/frontend/deploy_prequantized.rst.txt
+++ b/docs/_sources/tutorials/frontend/deploy_prequantized.rst.txt
@@ -350,7 +350,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
  .. code-block:: none
 
-    Elapsed average ms: 19.38913991
+    Elapsed average ms: 20.436994119999998
 
 
 
diff --git a/docs/_sources/tutorials/frontend/deploy_prequantized_tflite.rst.txt b/docs/_sources/tutorials/frontend/deploy_prequantized_tflite.rst.txt
index 20b6029..48abe03 100644
--- a/docs/_sources/tutorials/frontend/deploy_prequantized_tflite.rst.txt
+++ b/docs/_sources/tutorials/frontend/deploy_prequantized_tflite.rst.txt
@@ -368,7 +368,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
  .. code-block:: none
 
-    Elapsed average ms: 37.0458985
+    Elapsed average ms: 37.27674336999999
 
 
 
@@ -401,7 +401,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 2 minutes  37.373 seconds)
+   **Total running time of the script:** ( 2 minutes  38.207 seconds)
 
 
 .. _sphx_glr_download_tutorials_frontend_deploy_prequantized_tflite.py:
diff --git a/docs/_sources/tutorials/frontend/deploy_sparse.rst.txt b/docs/_sources/tutorials/frontend/deploy_sparse.rst.txt
index 8700db6..46271a3 100644
--- a/docs/_sources/tutorials/frontend/deploy_sparse.rst.txt
+++ b/docs/_sources/tutorials/frontend/deploy_sparse.rst.txt
@@ -105,10 +105,8 @@ and sparsity to run.
     batch_size = 1
     # The length of each input sequence.
     seq_len = 128
-    # TVM platform identifier. Although cuda is also supported, it requires
-    # tuning that is outside the scope of this tutorial. Note that best
-    # cpu performance can be achieved by setting -mcpu appropriately for
-    # your specific machine.
+    # TVM platform identifier. Note that best cpu performance can be achieved by setting -mcpu
+    # appropriately for your specific machine. CUDA and ROCm are also supported.
     target = "llvm"
     # Which device to run on. Should be one of tvm.cpu() or tvm.gpu().
     ctx = tvm.cpu()
@@ -397,6 +395,20 @@ and shows about a 2.5X speedup from using sparsity.
     # Block Sparse Model with 1x1 blocks:
     # Runtime:             67.75 ms            (8.83 ms)
 
+    # Here is the output of this script on a GPU (GTX 1070) with the target "cuda -libs=cublas".
+    #
+    # Dense Model Benchmark:
+    # Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (2, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
+    # Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (768, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
+    # Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (128, 3072), 'float32'), ('TENSOR', (768, 3072), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
+    # Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (128, 768), 'float32'), ('TENSOR', (3072, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
+    # Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (128, 768), 'float32'), ('TENSOR', (768, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
+    # Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('batch_matmul_cublas.cuda', ('TENSOR', (12, 128, 128), 'float32'), ('TENSOR', (12, 64, 128), 'float32'), (12, 128, 64)). A fallback configuration is used, which may bring great performance regression.
+    # Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('batch_matmul_cublas.cuda', ('TENSOR', (12, 128, 64), 'float32'), ('TENSOR', (12, 128, 64), 'float32'), (12, 128, 128)). A fallback configuration is used, which may bring great performance regression.
+    # Runtime:             10.64 ms            (0.29 ms)
+    # Block Sparse Model with 1x1 blocks:
+    # Runtime:             6.46 ms             (0.05 ms)
+
 
 
 
diff --git a/docs/_sources/tutorials/frontend/deploy_ssd_gluoncv.rst.txt b/docs/_sources/tutorials/frontend/deploy_ssd_gluoncv.rst.txt
index 46ddb7b..dd45bbc 100644
--- a/docs/_sources/tutorials/frontend/deploy_ssd_gluoncv.rst.txt
+++ b/docs/_sources/tutorials/frontend/deploy_ssd_gluoncv.rst.txt
@@ -139,6 +139,10 @@ Convert and compile model for CPU.
 
 
 Create TVM runtime and do inference
+.. note::
+
+  Use target = "cuda -libs" to enable thrust based sort, if you
+  enabled thrust during cmake by -DUSE_THRUST=ON.
 
 
 .. code-block:: default
@@ -195,7 +199,7 @@ Display result
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  54.491 seconds)
+   **Total running time of the script:** ( 1 minutes  48.639 seconds)
 
 
 .. _sphx_glr_download_tutorials_frontend_deploy_ssd_gluoncv.py:
diff --git a/docs/_sources/tutorials/frontend/from_onnx.rst.txt b/docs/_sources/tutorials/frontend/from_onnx.rst.txt
index 2c559b1..0887c18 100644
--- a/docs/_sources/tutorials/frontend/from_onnx.rst.txt
+++ b/docs/_sources/tutorials/frontend/from_onnx.rst.txt
@@ -142,7 +142,7 @@ provides a static definition of the input size.
 
  .. code-block:: none
 
-    /workspace/docs/../python/tvm/relay/frontend/onnx.py:3071: UserWarning: Mismatched attribute type in ' : kernel_shape'
+    /workspace/docs/../python/tvm/relay/frontend/onnx.py:3132: UserWarning: Mismatched attribute type in ' : kernel_shape'
 
     ==> Context: Bad node spec: input: "1" input: "2" output: "11" op_type: "Conv" attribute { name: "kernel_shape" ints: 5 ints: 5 } attribute { name: "strides" ints: 1 ints: 1 } attribute { name: "pads" ints: 2 ints: 2 ints: 2 ints: 2 } attribute { name: "dilations" ints: 1 ints: 1 } attribute { name: "group" i: 1 }
       warnings.warn(str(e))
diff --git a/docs/_sources/tutorials/frontend/from_pytorch.rst.txt b/docs/_sources/tutorials/frontend/from_pytorch.rst.txt
index 792dfb4..f00bc6b 100644
--- a/docs/_sources/tutorials/frontend/from_pytorch.rst.txt
+++ b/docs/_sources/tutorials/frontend/from_pytorch.rst.txt
@@ -155,8 +155,9 @@ Compile the graph to llvm target with given input specification.
 
  .. code-block:: none
 
-
    ...47%, 0.01 MB, 221 KB/s, 0 seconds passed
    ...94%, 0.02 MB, 439 KB/s, 0 seconds passed
    ...100%, 0.02 MB, 655 KB/s, 0 seconds passed
-    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('dense_nopack.x86', ('TENSOR', (1, 512), 'float32'), ('TENSOR', (1000, 512), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
+
    ...47%, 0.01 MB, 53 KB/s, 0 seconds passed
    ...94%, 0.02 MB, 106 KB/s, 0 seconds passed
    ...100%, 0.02 MB, 159 KB/s, 0 seconds passed
+    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('dense_pack.x86', ('TENSOR', (1, 512), 'float32'), ('TENSOR', (1000, 512), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
+    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('dense_pack.x86', ('TENSOR', (1, 512), 'float32'), ('TENSOR', (100, 512, 10), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
 
 
 
diff --git a/docs/_sources/tutorials/frontend/from_tensorflow.rst.txt b/docs/_sources/tutorials/frontend/from_tensorflow.rst.txt
index ad4ac5b..b90aab8 100644
--- a/docs/_sources/tutorials/frontend/from_tensorflow.rst.txt
+++ b/docs/_sources/tutorials/frontend/from_tensorflow.rst.txt
@@ -195,9 +195,9 @@ Results:
 
  .. code-block:: none
 
-    /workspace/docs/../python/tvm/relay/frontend/tensorflow.py:2914: UserWarning: Ignore the passed shape. Shape in graphdef will be used for operator DecodeJpeg/contents.
+    /workspace/docs/../python/tvm/relay/frontend/tensorflow.py:3153: UserWarning: Ignore the passed shape. Shape in graphdef will be used for operator DecodeJpeg/contents.
       "will be used for operator %s." % node.name
-    /workspace/docs/../python/tvm/relay/frontend/tensorflow.py:745: UserWarning: DecodeJpeg: It's a pass through, please handle preprocessing before input
+    /workspace/docs/../python/tvm/relay/frontend/tensorflow.py:882: UserWarning: DecodeJpeg: It's a pass through, please handle preprocessing before input
       warnings.warn("DecodeJpeg: It's a pass through, please handle preprocessing before input")
     Tensorflow protobuf imported to relay frontend.
 
@@ -323,7 +323,8 @@ Results:
     conv2d NHWC layout is not optimized for x86 with autotvm.
     conv2d NHWC layout is not optimized for x86 with autotvm.
     conv2d NHWC layout is not optimized for x86 with autotvm.
-    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('dense_nopack.x86', ('TENSOR', (1, 2048), 'float32'), ('TENSOR', (1008, 2048), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
+    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('dense_pack.x86', ('TENSOR', (1, 2048), 'float32'), ('TENSOR', (1008, 2048), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
+    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('dense_pack.x86', ('TENSOR', (1, 2048), 'float32'), ('TENSOR', (63, 2048, 16), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
     conv2d NHWC layout is not optimized for x86 with autotvm.
     conv2d NHWC layout is not optimized for x86 with autotvm.
     conv2d NHWC layout is not optimized for x86 with autotvm.
diff --git a/docs/_sources/tutorials/frontend/from_tflite.rst.txt b/docs/_sources/tutorials/frontend/from_tflite.rst.txt
index 4efc32e..f212a74 100644
--- a/docs/_sources/tutorials/frontend/from_tflite.rst.txt
+++ b/docs/_sources/tutorials/frontend/from_tflite.rst.txt
@@ -18,7 +18,7 @@ To get started, TFLite package needs to be installed as prerequisite.
 .. code-block:: bash
 
     # install tflite
-    pip install tflite=2.1.0 --user
+    pip install tflite==2.1.0 --user
 
 
 or you could generate TFLite package yourself. The steps are the following:
diff --git a/docs/_sources/tutorials/frontend/sg_execution_times.rst.txt b/docs/_sources/tutorials/frontend/sg_execution_times.rst.txt
index 62c9051..5531741 100644
--- a/docs/_sources/tutorials/frontend/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorials/frontend/sg_execution_times.rst.txt
@@ -5,24 +5,24 @@
 
 Computation times
 =================
-**10:05.489** total execution time for **tutorials_frontend** files:
+**09:41.231** total execution time for **tutorials_frontend** files:
 
-- **02:37.373**: :ref:`sphx_glr_tutorials_frontend_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``)
-- **01:54.491**: :ref:`sphx_glr_tutorials_frontend_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``)
-- **01:32.900**: :ref:`sphx_glr_tutorials_frontend_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``)
-- **00:39.350**: :ref:`sphx_glr_tutorials_frontend_from_tensorflow.py` (``from_tensorflow.py``)
-- **00:34.184**: :ref:`sphx_glr_tutorials_frontend_deploy_quantized.py` (``deploy_quantized.py``)
-- **00:29.143**: :ref:`sphx_glr_tutorials_frontend_deploy_prequantized.py` (``deploy_prequantized.py``)
-- **00:25.023**: :ref:`sphx_glr_tutorials_frontend_from_tflite.py` (``from_tflite.py``)
-- **00:23.035**: :ref:`sphx_glr_tutorials_frontend_from_darknet.py` (``from_darknet.py``)
-- **00:16.541**: :ref:`sphx_glr_tutorials_frontend_from_caffe2.py` (``from_caffe2.py``)
-- **00:14.960**: :ref:`sphx_glr_tutorials_frontend_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``)
-- **00:12.556**: :ref:`sphx_glr_tutorials_frontend_deploy_model_on_android.py` (``deploy_model_on_android.py``)
-- **00:11.264**: :ref:`sphx_glr_tutorials_frontend_from_pytorch.py` (``from_pytorch.py``)
-- **00:09.822**: :ref:`sphx_glr_tutorials_frontend_from_mxnet.py` (``from_mxnet.py``)
-- **00:09.108**: :ref:`sphx_glr_tutorials_frontend_from_coreml.py` (``from_coreml.py``)
-- **00:09.065**: :ref:`sphx_glr_tutorials_frontend_from_keras.py` (``from_keras.py``)
-- **00:03.484**: :ref:`sphx_glr_tutorials_frontend_using_external_lib.py` (``using_external_lib.py``)
-- **00:01.840**: :ref:`sphx_glr_tutorials_frontend_from_onnx.py` (``from_onnx.py``)
-- **00:01.134**: :ref:`sphx_glr_tutorials_frontend_build_gcn.py` (``build_gcn.py``)
-- **00:00.216**: :ref:`sphx_glr_tutorials_frontend_deploy_sparse.py` (``deploy_sparse.py``)
+- **02:38.207**: :ref:`sphx_glr_tutorials_frontend_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``)
+- **01:48.639**: :ref:`sphx_glr_tutorials_frontend_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``)
+- **01:17.425**: :ref:`sphx_glr_tutorials_frontend_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``)
+- **00:39.079**: :ref:`sphx_glr_tutorials_frontend_from_tensorflow.py` (``from_tensorflow.py``)
+- **00:29.630**: :ref:`sphx_glr_tutorials_frontend_deploy_prequantized.py` (``deploy_prequantized.py``)
+- **00:27.788**: :ref:`sphx_glr_tutorials_frontend_deploy_quantized.py` (``deploy_quantized.py``)
+- **00:25.509**: :ref:`sphx_glr_tutorials_frontend_from_tflite.py` (``from_tflite.py``)
+- **00:23.629**: :ref:`sphx_glr_tutorials_frontend_from_darknet.py` (``from_darknet.py``)
+- **00:16.795**: :ref:`sphx_glr_tutorials_frontend_from_caffe2.py` (``from_caffe2.py``)
+- **00:15.555**: :ref:`sphx_glr_tutorials_frontend_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``)
+- **00:12.806**: :ref:`sphx_glr_tutorials_frontend_deploy_model_on_android.py` (``deploy_model_on_android.py``)
+- **00:11.464**: :ref:`sphx_glr_tutorials_frontend_from_pytorch.py` (``from_pytorch.py``)
+- **00:10.021**: :ref:`sphx_glr_tutorials_frontend_from_mxnet.py` (``from_mxnet.py``)
+- **00:09.220**: :ref:`sphx_glr_tutorials_frontend_from_keras.py` (``from_keras.py``)
+- **00:09.195**: :ref:`sphx_glr_tutorials_frontend_from_coreml.py` (``from_coreml.py``)
+- **00:03.161**: :ref:`sphx_glr_tutorials_frontend_using_external_lib.py` (``using_external_lib.py``)
+- **00:01.739**: :ref:`sphx_glr_tutorials_frontend_from_onnx.py` (``from_onnx.py``)
+- **00:01.174**: :ref:`sphx_glr_tutorials_frontend_build_gcn.py` (``build_gcn.py``)
+- **00:00.195**: :ref:`sphx_glr_tutorials_frontend_deploy_sparse.py` (``deploy_sparse.py``)
diff --git a/docs/_sources/tutorials/get_started/cross_compilation_and_rpc.rst.txt b/docs/_sources/tutorials/get_started/cross_compilation_and_rpc.rst.txt
index a38f59a..8dd5259 100644
--- a/docs/_sources/tutorials/get_started/cross_compilation_and_rpc.rst.txt
+++ b/docs/_sources/tutorials/get_started/cross_compilation_and_rpc.rst.txt
@@ -235,7 +235,7 @@ device and returns the measured cost. Network overhead is excluded.
 
  .. code-block:: none
 
-    1.192e-07 secs/op
+    1.161e-07 secs/op
 
 
 
diff --git a/docs/_sources/tutorials/get_started/relay_quick_start.rst.txt b/docs/_sources/tutorials/get_started/relay_quick_start.rst.txt
index 35b1541..1e14c95 100644
--- a/docs/_sources/tutorials/get_started/relay_quick_start.rst.txt
+++ b/docs/_sources/tutorials/get_started/relay_quick_start.rst.txt
@@ -225,7 +225,7 @@ in this example. Then the machine code will be generated as the module library.
 
  .. code-block:: none
 
-
    ...1%, 0.01 MB, 227 KB/s, 0 seconds passed
    ...3%, 0.02 MB, 452 KB/s, 0 seconds passed
    ...5%, 0.02 MB, 671 KB/s, 0 seconds passed
    ...6%, 0.03 MB, 891 KB/s, 0 seconds passed
    ...8%, 0.04 MB, 978 KB/s, 0 seconds passed
    ...10%, 0.05 MB, 1169 KB/s, 0 seconds passed
    ...11%, 0.05 MB, 1361 KB/s, 0 seconds passed
    ...13%, 0.06 MB, 1551 KB/s, 0 seconds passed
    ...15%, 0.07 MB, 1741 KB/s, 0 seconds passed
    ...16%, 0.08 MB, 1907 KB/s, 0 seconds passed
    ...18%, 0.09 MB, 2092 KB/s, 0 seconds passed
    ...20%, 0.09 MB, 2244 KB/s, 0 seconds passed
    ...21%, 0.10 MB, 2403 KB/s, 0 seconds passed
    ...23%, 0.11 MB, 2578 KB/s, 0 seconds passed
    ...25%, 0.12 MB, 2555 KB/s, 0 seconds passed
    ...26%, 0.12 MB, 2716 KB/s, 0 seconds passed
    ...28%, 0.13 MB, 2879 KB/s, 0 seconds passed
    ...30%, 0.14 MB, 3042 KB/s, 0 seconds passed
    ...31%, 0.15 MB, 3152 KB/s, 0 seconds passed
    ...33%, 0.16 MB, 3310 KB/s, 0 seconds passed
    ...35%, 0.16 MB, 3469 KB/s, 0 seconds passed
    ...36%, 0.17 MB, 3627 KB/s, 0 seconds passed
    ...38%, 0.18 MB, 3785 KB/s, 0 seconds passed
    ...40%, 0.19 MB, 3914 KB/s, 0 seconds passed
    ...41%, 0.20 MB, 4058 KB/s, 0 seconds passed
    ...43%, 0.20 MB, 4212 KB/s, 0 seconds passed
    ...45%, 0.21 MB, 4365 KB/s, 0 seconds passed
    ...46%, 0.22 MB, 4518 KB/s, 0 seconds passed
    ...48%, 0.23 MB, 4671 KB/s, 0 seconds passed
    ...50%, 0.23 MB, 4775 KB/s, 0 seconds passed
    ...51%, 0.24 MB, 4921 KB/s, 0 seconds passed
    ...53%, 0.25 MB, 5051 KB/s, 0 seconds passed
    ...55%, 0.26 MB, 5198 KB/s, 0 seconds passed
    ...56%, 0.27 MB, 5345 KB/s, 0 seconds passed
    ...58%, 0.27 MB, 5275 KB/s, 0 seconds passed
    ...60%, 0.28 MB, 5397 KB/s, 0 seconds passed
    ...61%, 0.29 MB, 5536 KB/s, 0 seconds passed
    ...63%, 0.30 MB, 5676 KB/s, 0 seconds passed
    ...65%, 0.30 MB, 5815 KB/s, 0 seconds passed
    ...66%, 0.31 MB, 5883 KB/s, 0 seconds passed
    ...68%, 0.32 MB, 6018 KB/s, 0 seconds passed
    ...70%, 0.33 MB, 6154 KB/s, 0 seconds passed
    ...71%, 0.34 MB, 6289 KB/s, 0 seconds passed
    ...73%, 0.34 MB, 6425 KB/s, 0 seconds passed
    ...75%, 0.35 MB, 6556 KB/s, 0 seconds passed
    ...76%, 0.36 MB, 6691 KB/s, 0 seconds passed
    ...78%, 0.37 MB, 6824 KB/s, 0 seconds passed
    ...80%, 0.38 MB, 6924 KB/s, 0 seconds passed
    ...81%, 0.38 MB, 7054 KB/s, 0 seconds passed
    ...83%, 0.39 MB, 7185 KB/s, 0 seconds passed
    ...85%, 0.40 MB, 7309 KB/s, 0 seconds passed
    ...86%, 0.41 MB, 7438 KB/s, 0 seconds passed
    ...88%, 0.41 MB, 7567 KB/s, 0 seconds passed
    ...90%, 0.42 MB, 7697 KB/s, 0 seconds passed
    ...91%, 0.43 MB, 7827 KB/s, 0 seconds passed
    ...93%, 0.44 MB, 7956 KB/s, 0 seconds passed
    ...95%, 0.45 MB, 8084 KB/s, 0 seconds passed
    ...96%, 0.45 MB, 8213 KB/s, 0 seconds passed
    ...98%, 0.46 MB, 8341 KB/s, 0 seconds passed
    ...100%, 0.47 MB, 8451 KB/s, 0 seconds passed
+
    ...1%, 0.01 MB, 15 KB/s, 0 seconds passed
    ...3%, 0.02 MB, 30 KB/s, 0 seconds passed
    ...5%, 0.02 MB, 44 KB/s, 0 seconds passed
    ...6%, 0.03 MB, 59 KB/s, 0 seconds passed
    ...8%, 0.04 MB, 74 KB/s, 0 seconds passed
    ...10%, 0.05 MB, 89 KB/s, 0 seconds passed
    ...11%, 0.05 MB, 103 KB/s, 0 seconds passed
    ...13%, 0.06 MB, 118 KB/s, 0 seconds passed
    ...15%, 0.07 MB, 133 KB/s, 0 seconds passed
    ...16%, 0.08 MB, 148 KB/s, 0 seconds passed
    ...18%, 0.09 MB, 162 KB/s, 0 seconds passed
    ...20%, 0.09 MB, 177 KB/s, 0 seconds passed
    ...21%, 0.10 MB, 190 KB/s, 0 seconds passed
    ...23%, 0.11 MB, 205 KB/s, 0 seconds passed
    ...25%, 0.12 MB, 220 KB/s, 0 seconds passed
    ...26%, 0.12 MB, 234 KB/s, 0 seconds passed
    ...28%, 0.13 MB, 249 KB/s, 0 seconds passed
    ...30%, 0.14 MB, 263 KB/s, 0 seconds passed
    ...31%, 0.15 MB, 278 KB/s, 0 seconds passed
    ...33%, 0.16 MB, 292 KB/s, 0 seconds passed
    ...35%, 0.16 MB, 307 KB/s, 0 seconds passed
 
    ...36%, 0.17 MB, 321 KB/s, 0 seconds passed
    ...38%, 0.18 MB, 336 KB/s, 0 seconds passed
    ...40%, 0.19 MB, 350 KB/s, 0 seconds passed
    ...41%, 0.20 MB, 365 KB/s, 0 seconds passed
    ...43%, 0.20 MB, 379 KB/s, 0 seconds passed
    ...45%, 0.21 MB, 394 KB/s, 0 seconds passed
    ...46%, 0.22 MB, 408 KB/s, 0 seconds passed
    ...48%, 0.23 MB, 423 KB/s, 0 seconds passed
    ...50%, 0.23 MB, 435 KB/s, 0 seconds passed
    ...51%, 0.24 MB, 449 KB/s, 0 seconds passed
    ...53%, 0.25 MB, 464 KB/s, 0 seconds passed
    ...55%, 0.26 MB, 478 KB/s, 0 seconds passed
    ...56%, 0.27 MB, 493 KB/s, 0 seconds passed
    ...58%, 0.27 MB, 507 KB/s, 0 seconds passed
    ...60%, 0.28 MB, 521 KB/s, 0 seconds passed
    ...61%, 0.29 MB, 536 KB/s, 0 seconds passed
    ...63%, 0.30 MB, 550 KB/s, 0 seconds passed
    ...65%, 0.30 MB, 564 KB/s, 0 seconds passed
    ...66%, 0.31 MB, 579 KB/s, 0 seconds passed
    ...68%, 0.32 MB, 593 KB/s, 0 seconds passed
    ...70%, 0.33 MB, 608 KB/s, 0 seconds passed
    ...71%, 0.34 MB, 622 KB/s, 0 seconds passed
    ...73%, 0.34 MB, 636 KB/s, 0 seconds passed
    ...75%, 0.35 MB, 650 KB/s, 0 seconds passed
    ...76%, 0.36 MB, 665 KB/s, 0 seconds passed
    ...78%, 0.37 MB, 679 KB/s, 0 seconds passed
    ...80%, 0.38 MB, 694 KB/s, 0 seconds passed
    ...81%, 0.38 MB, 708 KB/s, 0 seconds passed
    ...83%, 0.39 MB, 722 KB/s, 0 seconds passed
    ...85%, 0.40 MB, 737 KB/s, 0 seconds passed
    ...86%, 0.41 MB, 751 KB/s, 0 seconds passed
    ...88%, 0.41 MB, 765 KB/s, 0 seconds passed
    ...90%, 0.42 MB, 779 KB/s, 0 seconds passed
    ...91%, 0.43 MB, 794 KB/s, 0 seconds passed
    ...93%, 0.44 MB, 807 KB/s, 0 seconds passed
    ...95%, 0.45 MB, 821 KB/s, 0 seconds passed
    ...96%, 0.45 MB, 836 KB/s, 0 seconds passed
    ...98%, 0.46 MB, 850 KB/s, 0 seconds passed
    ...100%, 0.47 MB, 862 KB/s, 0 seconds passed
     Cannot find config for target=cuda -keys=cuda,gpu -max_num_threads=1024 -model=unknown -thread_warp_size=32, workload=('dense_small_batch.cuda', ('TENSOR', (1, 512), 'float32'), ('TENSOR', (1000, 512), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
 
 
diff --git a/docs/_sources/tutorials/get_started/sg_execution_times.rst.txt b/docs/_sources/tutorials/get_started/sg_execution_times.rst.txt
index 282019b..f892099 100644
--- a/docs/_sources/tutorials/get_started/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorials/get_started/sg_execution_times.rst.txt
@@ -5,9 +5,9 @@
 
 Computation times
 =================
-**00:17.053** total execution time for **tutorials_get_started** files:
+**00:17.472** total execution time for **tutorials_get_started** files:
 
-- **00:16.501**: :ref:`sphx_glr_tutorials_get_started_relay_quick_start.py` (``relay_quick_start.py``)
-- **00:00.349**: :ref:`sphx_glr_tutorials_get_started_tensor_expr_get_started.py` (``tensor_expr_get_started.py``)
-- **00:00.116**: :ref:`sphx_glr_tutorials_get_started_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``)
-- **00:00.086**: :ref:`sphx_glr_tutorials_get_started_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``)
+- **00:16.887**: :ref:`sphx_glr_tutorials_get_started_relay_quick_start.py` (``relay_quick_start.py``)
+- **00:00.370**: :ref:`sphx_glr_tutorials_get_started_tensor_expr_get_started.py` (``tensor_expr_get_started.py``)
+- **00:00.127**: :ref:`sphx_glr_tutorials_get_started_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``)
+- **00:00.088**: :ref:`sphx_glr_tutorials_get_started_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``)
diff --git a/docs/_sources/tutorials/get_started/tensor_expr_get_started.rst.txt b/docs/_sources/tutorials/get_started/tensor_expr_get_started.rst.txt
index f86234d..d71b47f 100644
--- a/docs/_sources/tutorials/get_started/tensor_expr_get_started.rst.txt
+++ b/docs/_sources/tutorials/get_started/tensor_expr_get_started.rst.txt
@@ -255,6 +255,20 @@ The following code fetches the device module and prints the content code.
  .. code-block:: none
 
     -----GPU code-----
+
+    #ifdef _WIN32
+      using uint = unsigned int;
+      using uchar = unsigned char;
+      using ushort = unsigned short;
+      using int64_t = long long;
+      using uint64_t = unsigned long long;
+    #else
+      #define uint unsigned int
+      #define uchar unsigned char
+      #define ushort unsigned short
+      #define int64_t long
+      #define uint64_t ulong
+    #endif
     extern "C" __global__ void myadd_kernel0(float* __restrict__ C, float* __restrict__ A, float* __restrict__ B, int n, int stride, int stride1, int stride2) {
       if (((int)blockIdx.x) < (n >> 6)) {
         C[((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride2))] = (A[((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride))] + B[((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride1))]);
@@ -325,7 +339,7 @@ The following code first performs the following steps:
 
  .. code-block:: none
 
-    ['myadd.tvm_meta.json', 'myadd.ptx', 'myadd.so', 'myadd.o']
+    ['myadd.tvm_meta.json', 'myadd.o', 'myadd.ptx', 'myadd.so']
 
 
 
diff --git a/docs/_sources/tutorials/index.rst.txt b/docs/_sources/tutorials/index.rst.txt
index 26e7976..7fd8a5c 100644
--- a/docs/_sources/tutorials/index.rst.txt
+++ b/docs/_sources/tutorials/index.rst.txt
@@ -990,6 +990,26 @@ AutoScheduler : Template-free Auto Scheduling
 
 .. only:: html
 
+    .. figure:: /tutorials/auto_scheduler/images/thumb/sphx_glr_tune_network_arm_thumb.png
+
+        :ref:`sphx_glr_tutorials_auto_scheduler_tune_network_arm.py`
+
+.. raw:: html
+
+    </div>
+
+
+.. toctree::
+   :hidden:
+
+   /tutorials/auto_scheduler/tune_network_arm
+
+.. raw:: html
+
+    <div class="sphx-glr-thumbcontainer" tooltip="Auto-tuning for specific devices and workloads is critical for getting the best performance. Th...">
+
+.. only:: html
+
     .. figure:: /tutorials/auto_scheduler/images/thumb/sphx_glr_tune_network_mali_thumb.png
 
         :ref:`sphx_glr_tutorials_auto_scheduler_tune_network_mali.py`
diff --git a/docs/_sources/tutorials/language/intrin_math.rst.txt b/docs/_sources/tutorials/language/intrin_math.rst.txt
index 8792aa2..085c4ca 100644
--- a/docs/_sources/tutorials/language/intrin_math.rst.txt
+++ b/docs/_sources/tutorials/language/intrin_math.rst.txt
@@ -67,6 +67,20 @@ In the following example, we use :any:`tvm.tir.call_pure_extern` to call
 
  .. code-block:: none
 
+
+    #ifdef _WIN32
+      using uint = unsigned int;
+      using uchar = unsigned char;
+      using ushort = unsigned short;
+      using int64_t = long long;
+      using uint64_t = unsigned long long;
+    #else
+      #define uint unsigned int
+      #define uchar unsigned char
+      #define ushort unsigned short
+      #define int64_t long
+      #define uint64_t ulong
+    #endif
     extern "C" __global__ void myexp_kernel0(float* __restrict__ B, float* __restrict__ A, int n, int stride, int stride1) {
       if (((int)blockIdx.x) < (n >> 6)) {
         B[((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride1))] = __expf(A[((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride))]);
@@ -117,6 +131,20 @@ The following code use te.exp instead, which create an intrinsic call
 
  .. code-block:: none
 
+
+    #ifdef _WIN32
+      using uint = unsigned int;
+      using uchar = unsigned char;
+      using ushort = unsigned short;
+      using int64_t = long long;
+      using uint64_t = unsigned long long;
+    #else
+      #define uint unsigned int
+      #define uchar unsigned char
+      #define ushort unsigned short
+      #define int64_t long
+      #define uint64_t ulong
+    #endif
     extern "C" __global__ void myexp_kernel0(float* __restrict__ B, float* __restrict__ A, int n, int stride, int stride1) {
       if (((int)blockIdx.x) < (n >> 6)) {
         B[((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride1))] = __expf(A[((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride))]);
@@ -226,6 +254,20 @@ fast math version :code:`__expf`.
 
  .. code-block:: none
 
+
+    #ifdef _WIN32
+      using uint = unsigned int;
+      using uchar = unsigned char;
+      using ushort = unsigned short;
+      using int64_t = long long;
+      using uint64_t = unsigned long long;
+    #else
+      #define uint unsigned int
+      #define uchar unsigned char
+      #define ushort unsigned short
+      #define int64_t long
+      #define uint64_t ulong
+    #endif
     extern "C" __global__ void myexp_kernel0(float* __restrict__ B, float* __restrict__ A, int n, int stride, int stride1) {
       if (((int)blockIdx.x) < (n >> 6)) {
         B[((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride1))] = expf(A[((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride))]);
@@ -292,6 +334,20 @@ The following example add an intrinsic :code:`mylog` to the system.
 
  .. code-block:: none
 
+
+    #ifdef _WIN32
+      using uint = unsigned int;
+      using uchar = unsigned char;
+      using ushort = unsigned short;
+      using int64_t = long long;
+      using uint64_t = unsigned long long;
+    #else
+      #define uint unsigned int
+      #define uchar unsigned char
+      #define ushort unsigned short
+      #define int64_t long
+      #define uint64_t ulong
+    #endif
     extern "C" __global__ void mylog_kernel0(float* __restrict__ B, float* __restrict__ A, int n, int stride, int stride1) {
       if (((int)blockIdx.x) < (n >> 6)) {
         B[((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride1))] = logf(A[((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride))]);
diff --git a/docs/_sources/tutorials/language/reduction.rst.txt b/docs/_sources/tutorials/language/reduction.rst.txt
index c54fb31..f39673d 100644
--- a/docs/_sources/tutorials/language/reduction.rst.txt
+++ b/docs/_sources/tutorials/language/reduction.rst.txt
@@ -325,6 +325,20 @@ columns by threadIdx.x and finally do a cross thread reduction over threadIdx.x
 
  .. code-block:: none
 
+
+    #ifdef _WIN32
+      using uint = unsigned int;
+      using uchar = unsigned char;
+      using ushort = unsigned short;
+      using int64_t = long long;
+      using uint64_t = unsigned long long;
+    #else
+      #define uint unsigned int
+      #define uchar unsigned char
+      #define ushort unsigned short
+      #define int64_t long
+      #define uint64_t ulong
+    #endif
     extern "C" __global__ void default_function_kernel0(float* __restrict__ A, float* __restrict__ B, int m, int n, int stride, int stride1, int stride2) {
       float B_rf[1];
       __shared__ float red_buf0[512];
diff --git a/docs/_sources/tutorials/language/schedule_primitives.rst.txt b/docs/_sources/tutorials/language/schedule_primitives.rst.txt
index bedd793..a549ff4 100644
--- a/docs/_sources/tutorials/language/schedule_primitives.rst.txt
+++ b/docs/_sources/tutorials/language/schedule_primitives.rst.txt
@@ -106,7 +106,7 @@ methods to schedule every stage.
 
 split
 -----
-:code:`split` can split a specified axis into two axises by
+:code:`split` can split a specified axis into two axes by
 :code:`factor`.
 
 
@@ -191,7 +191,7 @@ contrary with :code:`factor`.
 tile
 ----
 :code:`tile` help you execute the computation tile by tile over two
-axises.
+axes.
 
 
 .. code-block:: default
@@ -239,7 +239,7 @@ axises.
 
 fuse
 ----
-:code:`fuse` can fuse two consecutive axises of one computation.
+:code:`fuse` can fuse two consecutive axes of one computation.
 
 
 .. code-block:: default
@@ -248,7 +248,7 @@ fuse
     B = te.compute((m, n), lambda i, j: A[i, j], name="B")
 
     s = te.create_schedule(B.op)
-    # tile to four axises first: (i.outer, j.outer, i.inner, j.inner)
+    # tile to four axes first: (i.outer, j.outer, i.inner, j.inner)
     xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)
     # then fuse (i.inner, j.inner) into one axis: (i.inner.j.inner.fused)
     fused = s[B].fuse(xi, yi)
@@ -288,7 +288,7 @@ fuse
 
 reorder
 -------
-:code:`reorder` can reorder the axises in the specified order.
+:code:`reorder` can reorder the axes in the specified order.
 
 
 .. code-block:: default
@@ -297,9 +297,9 @@ reorder
     B = te.compute((m, n), lambda i, j: A[i, j], name="B")
 
     s = te.create_schedule(B.op)
-    # tile to four axises first: (i.outer, j.outer, i.inner, j.inner)
+    # tile to four axes first: (i.outer, j.outer, i.inner, j.inner)
     xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)
-    # then reorder the axises: (i.inner, j.outer, i.outer, j.inner)
+    # then reorder the axes: (i.inner, j.outer, i.outer, j.inner)
     s[B].reorder(xi, yo, xo, yi)
     print(tvm.lower(s, [A, B], simple_mode=True))
 
@@ -492,12 +492,12 @@ tensor is required.
 
     primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {C: Buffer(C_2: Pointer(float32), float32, [m: int32], [stride: int32], type="auto"),
-                 B: Buffer(B_2: Pointer(float32), float32, [m], [stride_1: int32], type="auto"),
+      buffers = {B: Buffer(B_2: Pointer(float32), float32, [m: int32], [stride: int32], type="auto"),
+                 C: Buffer(C_2: Pointer(float32), float32, [m], [stride_1: int32], type="auto"),
                  A: Buffer(A_2: Pointer(float32), float32, [m], [stride_2: int32], type="auto")}
       buffer_map = {A_1: A, B_1: B, C_1: C} {
       for (i: int32, 0, m) {
-        C_2[(i*stride)] = (((float32*)A_2[(i*stride_2)] + 1f32)*2f32)
+        C_2[(i*stride_1)] = (((float32*)A_2[(i*stride_2)] + 1f32)*2f32)
       }
     }
 
@@ -533,15 +533,15 @@ compute_root
 
     primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {B: Buffer(B_2: Pointer(float32), float32, [m: int32], [stride: int32], type="auto"),
-                 C: Buffer(C_2: Pointer(float32), float32, [m], [stride_1: int32], type="auto"),
+      buffers = {C: Buffer(C_2: Pointer(float32), float32, [m: int32], [stride: int32], type="auto"),
+                 B: Buffer(B_2: Pointer(float32), float32, [m], [stride_1: int32], type="auto"),
                  A: Buffer(A_2: Pointer(float32), float32, [m], [stride_2: int32], type="auto")}
       buffer_map = {A_1: A, B_1: B, C_1: C} {
       for (i: int32, 0, m) {
-        B_2[(i*stride)] = ((float32*)A_2[(i*stride_2)] + 1f32)
+        B_2[(i*stride_1)] = ((float32*)A_2[(i*stride_2)] + 1f32)
       }
       for (i_1: int32, 0, m) {
-        C_2[(i_1*stride_1)] = ((float32*)B_2[(i_1*stride)]*2f32)
+        C_2[(i_1*stride)] = ((float32*)B_2[(i_1*stride_1)]*2f32)
       }
     }
 
diff --git a/docs/_sources/tutorials/language/sg_execution_times.rst.txt b/docs/_sources/tutorials/language/sg_execution_times.rst.txt
index 6502bf9..f826fb4 100644
--- a/docs/_sources/tutorials/language/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorials/language/sg_execution_times.rst.txt
@@ -5,13 +5,13 @@
 
 Computation times
 =================
-**00:04.625** total execution time for **tutorials_language** files:
+**00:04.819** total execution time for **tutorials_language** files:
 
-- **00:01.643**: :ref:`sphx_glr_tutorials_language_intrin_math.py` (``intrin_math.py``)
-- **00:00.832**: :ref:`sphx_glr_tutorials_language_tensorize.py` (``tensorize.py``)
-- **00:00.618**: :ref:`sphx_glr_tutorials_language_scan.py` (``scan.py``)
-- **00:00.565**: :ref:`sphx_glr_tutorials_language_reduction.py` (``reduction.py``)
-- **00:00.321**: :ref:`sphx_glr_tutorials_language_extern_op.py` (``extern_op.py``)
-- **00:00.229**: :ref:`sphx_glr_tutorials_language_schedule_primitives.py` (``schedule_primitives.py``)
-- **00:00.210**: :ref:`sphx_glr_tutorials_language_tuple_inputs.py` (``tuple_inputs.py``)
-- **00:00.208**: :ref:`sphx_glr_tutorials_language_tedd.py` (``tedd.py``)
+- **00:01.752**: :ref:`sphx_glr_tutorials_language_intrin_math.py` (``intrin_math.py``)
+- **00:00.875**: :ref:`sphx_glr_tutorials_language_tensorize.py` (``tensorize.py``)
+- **00:00.634**: :ref:`sphx_glr_tutorials_language_scan.py` (``scan.py``)
+- **00:00.593**: :ref:`sphx_glr_tutorials_language_reduction.py` (``reduction.py``)
+- **00:00.327**: :ref:`sphx_glr_tutorials_language_extern_op.py` (``extern_op.py``)
+- **00:00.228**: :ref:`sphx_glr_tutorials_language_schedule_primitives.py` (``schedule_primitives.py``)
+- **00:00.213**: :ref:`sphx_glr_tutorials_language_tuple_inputs.py` (``tuple_inputs.py``)
+- **00:00.197**: :ref:`sphx_glr_tutorials_language_tedd.py` (``tedd.py``)
diff --git a/docs/_sources/tutorials/language/tensorize.rst.txt b/docs/_sources/tutorials/language/tensorize.rst.txt
index 43afa2a..a3cdb76 100644
--- a/docs/_sources/tutorials/language/tensorize.rst.txt
+++ b/docs/_sources/tutorials/language/tensorize.rst.txt
@@ -120,8 +120,8 @@ Thus we break down the matmul loops to make the innermost loops a (16x64) GEMV.
 
     primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {C: Buffer(C_2: Pointer(float32), float32, [1024, 512], []),
-                 B: Buffer(B_2: Pointer(float32), float32, [512, 64], []),
+      buffers = {B: Buffer(B_2: Pointer(float32), float32, [512, 64], []),
+                 C: Buffer(C_2: Pointer(float32), float32, [1024, 512], []),
                  A: Buffer(A_2: Pointer(float32), float32, [1024, 64], [])}
       buffer_map = {A_1: A, B_1: B, C_1: C} {
       for (i: int32, 0, 1024) {
@@ -313,8 +313,8 @@ The importing needs to happen before the tensorized GEMV being executed.
                  B: Buffer(B_2: Pointer(float32), float32, [512, 64], []),
                  A: Buffer(A_2: Pointer(float32), float32, [1024, 64], [])}
       buffer_map = {A_1: A, B_1: B, C_1: C} {
-      attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmp7la7jmx3/input0.cc'
-    source_filename = "/tmp/tmp7la7jmx3/input0.cc"
+      attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpugf1o730/input0.cc'
+    source_filename = "/tmp/tmpugf1o730/input0.cc"
     target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
     target triple = "x86_64-pc-linux-gnu"
 
diff --git a/docs/_sources/tutorials/language/tuple_inputs.rst.txt b/docs/_sources/tutorials/language/tuple_inputs.rst.txt
index 8345eec..9289c63 100644
--- a/docs/_sources/tutorials/language/tuple_inputs.rst.txt
+++ b/docs/_sources/tutorials/language/tuple_inputs.rst.txt
@@ -65,14 +65,14 @@ together in the next schedule procedure.
     primfn(A0_1: handle, A1_1: handle, B.v0_1: handle, B.v1_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
       buffers = {B.v1: Buffer(B.v1_2: Pointer(float32), float32, [m: int32, n: int32], [stride: int32, stride_1: int32], type="auto"),
-                 B.v0: Buffer(B.v0_2: Pointer(float32), float32, [m, n], [stride_2: int32, stride_3: int32], type="auto"),
-                 A1: Buffer(A1_2: Pointer(float32), float32, [m, n], [stride_4: int32, stride_5: int32], type="auto"),
+                 A1: Buffer(A1_2: Pointer(float32), float32, [m, n], [stride_2: int32, stride_3: int32], type="auto"),
+                 B.v0: Buffer(B.v0_2: Pointer(float32), float32, [m, n], [stride_4: int32, stride_5: int32], type="auto"),
                  A0: Buffer(A0_2: Pointer(float32), float32, [m, n], [stride_6: int32, stride_7: int32], type="auto")}
       buffer_map = {A0_1: A0, A1_1: A1, B.v0_1: B.v0, B.v1_1: B.v1} {
       for (i: int32, 0, m) {
         for (j: int32, 0, n) {
-          B.v0_2[((i*stride_2) + (j*stride_3))] = ((float32*)A0_2[((i*stride_6) + (j*stride_7))] + 2f32)
-          B.v1_2[((i*stride) + (j*stride_1))] = ((float32*)A1_2[((i*stride_4) + (j*stride_5))]*3f32)
+          B.v0_2[((i*stride_4) + (j*stride_5))] = ((float32*)A0_2[((i*stride_6) + (j*stride_7))] + 2f32)
+          B.v1_2[((i*stride) + (j*stride_1))] = ((float32*)A1_2[((i*stride_2) + (j*stride_3))]*3f32)
         }
       }
     }
@@ -135,17 +135,17 @@ with :py:func:`te.comm_reducer` as below:
 
     primfn(idx_1: handle, val_1: handle, T.v0_1: handle, T.v1_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {T.v0: Buffer(T.v0_2: Pointer(int32), int32, [m: int32], [stride: int32], type="auto"),
-                 T.v1: Buffer(T.v1_2: Pointer(int32), int32, [m], [stride_1: int32], type="auto"),
+      buffers = {T.v1: Buffer(T.v1_2: Pointer(int32), int32, [m: int32], [stride: int32], type="auto"),
+                 T.v0: Buffer(T.v0_2: Pointer(int32), int32, [m], [stride_1: int32], type="auto"),
                  val: Buffer(val_2: Pointer(int32), int32, [m, n: int32], [stride_2: int32, stride_3: int32], type="auto"),
                  idx: Buffer(idx_2: Pointer(int32), int32, [m, n], [stride_4: int32, stride_5: int32], type="auto")}
       buffer_map = {idx_1: idx, val_1: val, T.v0_1: T.v0, T.v1_1: T.v1} {
       for (i: int32, 0, m) {
-        T.v0_2[(i*stride)] = -1
-        T.v1_2[(i*stride_1)] = -2147483648
+        T.v0_2[(i*stride_1)] = -1
+        T.v1_2[(i*stride)] = -2147483648
         for (k: int32, 0, n) {
-          T.v0_2[(i*stride)] = @tir.if_then_else(((int32*)val_2[((i*stride_2) + (k*stride_3))] <= (int32*)T.v1_2[(i*stride_1)]), (int32*)T.v0_2[(i*stride)], (int32*)idx_2[((i*stride_4) + (k*stride_5))], dtype=int32)
-          T.v1_2[(i*stride_1)] = @tir.if_then_else(((int32*)val_2[((i*stride_2) + (k*stride_3))] <= (int32*)T.v1_2[(i*stride_1)]), (int32*)T.v1_2[(i*stride_1)], (int32*)val_2[((i*stride_2) + (k*stride_3))], dtype=int32)
+          T.v0_2[(i*stride_1)] = @tir.if_then_else(((int32*)val_2[((i*stride_2) + (k*stride_3))] <= (int32*)T.v1_2[(i*stride)]), (int32*)T.v0_2[(i*stride_1)], (int32*)idx_2[((i*stride_4) + (k*stride_5))], dtype=int32)
+          T.v1_2[(i*stride)] = @tir.if_then_else(((int32*)val_2[((i*stride_2) + (k*stride_3))] <= (int32*)T.v1_2[(i*stride)]), (int32*)T.v1_2[(i*stride)], (int32*)val_2[((i*stride_2) + (k*stride_3))], dtype=int32)
         }
       }
     }
diff --git a/docs/_sources/tutorials/micro/micro_reference_vm.rst.txt b/docs/_sources/tutorials/micro/micro_reference_vm.rst.txt
index 2d4760c..12651e8 100644
--- a/docs/_sources/tutorials/micro/micro_reference_vm.rst.txt
+++ b/docs/_sources/tutorials/micro/micro_reference_vm.rst.txt
@@ -51,15 +51,17 @@ Installing prerequisites
 
 A minimal set of prerequisites are needed:
 
-
 1. `Vagrant <https://vagrantup.com>`__
-2. A supported Virtual Machine hypervisor.
-   `VirtualBox <https://www.virtualbox.org>`__ is one suggested free hypervisor, but please note
+2. A supported Virtual Machine hypervisor (**VirtualBox**, **Parallels**, or **VMWare Fusion/Workstation**).
+   `VirtualBox <https://www.virtualbox.org>`__ is a suggested free hypervisor, but please note
    that the `VirtualBox Extension Pack`_ is required for proper USB forwarding. If using VirtualBox,
    also consider installing the `vbguest <https://github.com/dotless-de/vagrant-vbguest>`_ plugin.
 
 .. _VirtualBox Extension Pack: https://www.virtualbox.org/wiki/Downloads#VirtualBox6.1.16OracleVMVirtualBoxExtensionPack
 
+3. If required for your hypervisor, the
+   `Vagrant provider plugin <https://github.com/hashicorp/vagrant/wiki/Available-Vagrant-Plugins#providers>`__ (or see `here <https://www.vagrantup.com/vmware>`__ for VMWare).
+
 First boot
 ----------
 
@@ -67,9 +69,9 @@ The first time you use a reference VM, you need to create the box locally and th
 
 .. code-block:: bash
 
-    # Replace zepyhr with the name of a different platform, if you are not using Zephyr.
+    # Replace zephyr with the name of a different platform, if you are not using Zephyr.
     ~/.../tvm $ cd apps/microtvm/reference-vm/zephyr
-    # Replace <provider_name> with the name of the hypervisor you wish to use (i.e. virtualbox).
+    # Replace <provider_name> with the name of the hypervisor you wish to use (i.e. virtualbox, parallels, vmware_desktop).
     ~/.../tvm/apps/microtvm/reference-vm/zephyr $ vagrant up --provider=<provider_name>
 
 
@@ -132,7 +134,20 @@ Once the VM has been provisioned, tests can executed using ``poetry``:
 
 .. code-block:: bash
 
-    $ poetry run python3 tests/micro/qemu/test_zephyr.py --microtvm-platforms=stm32f746xx
+    $ cd apps/microtvm/reference-vm/zephyr
+    $ poetry run python3 ../../../../tests/micro/qemu/test_zephyr.py --microtvm-platforms=stm32f746xx
+
+If you do not have physical hardware attached, but wish to run the tests using the
+local QEMU emulator running within the VM, run the following commands instead:
+
+.. code-block:: bash
+
+    $ cd /Users/yourusername/path/to/tvm
+    $ sudo ./docker/install/ubuntu_install_qemu.sh
+    $ cd apps/microtvm/reference-vm/zephyr/
+    $ poetry run pytest ../../../../tests/micro/qemu/test_zephyr.py --microtvm-platforms=host
+
+
 
 
 
diff --git a/docs/_sources/tutorials/micro/micro_tflite.rst.txt b/docs/_sources/tutorials/micro/micro_tflite.rst.txt
index 5f33141..2ce7306 100644
--- a/docs/_sources/tutorials/micro/micro_tflite.rst.txt
+++ b/docs/_sources/tutorials/micro/micro_tflite.rst.txt
@@ -116,6 +116,8 @@ directory into a buffer
 
     import os
     import numpy as np
+    import logging
+
     import tvm
     import tvm.micro as micro
     from tvm.contrib.download import download_testdata
@@ -250,7 +252,7 @@ Now, compile the model for the target:
 
 
     with tvm.transform.PassContext(
-        opt_level=3, config={"tir.disable_vectorize": True}, disabled_pass=["FuseOps"]
+        opt_level=3, config={"tir.disable_vectorize": True}, disabled_pass=["FuseOps", "AlterOpLayout"]
     ):
         graph, c_mod, c_params = relay.build(mod, target=TARGET, params=params)
 
@@ -262,7 +264,9 @@ Now, compile the model for the target:
     # First, compile a static microTVM runtime for the targeted device. In this case, the host simulated
     # device is used.
     compiler = tvm.micro.DefaultCompiler(target=TARGET)
-    opts = tvm.micro.default_options(os.path.join(tvm.micro.CRT_ROOT_DIR, "host"))
+    opts = tvm.micro.default_options(
+        os.path.join(tvm.micro.get_standalone_crt_dir(), "template", "host")
+    )
 
     # %%
     # Compiling for physical hardware
@@ -282,21 +286,20 @@ Now, compile the model for the target:
     #     )
     #
     #     opts = tvm.micro.default_options(f"{project_dir}/crt")
+    #
+    # enable printing memory usage statistics of the runtime image
+    # generated by Zephyr compiler for the physical hardware
+    # logging.basicConfig(level="INFO")
 
     workspace = tvm.micro.Workspace()
     micro_binary = tvm.micro.build_static_runtime(
-        # the x86 compiler *expects* you to give the exact same dictionary for both
-        # lib_opts and bin_opts. so the library compiler is mutating lib_opts and
-        # the binary compiler is expecting those mutations to be in bin_opts.
-        # TODO(weberlo) fix this very bizarre behavior
         workspace,
         compiler,
         c_mod,
-        lib_opts=opts["lib_opts"],
-        bin_opts=opts["bin_opts"],
+        opts,
         # Use the microTVM memory manager. If, in your main.cc, you change TVMPlatformMemoryAllocate and
         # TVMPlatformMemoryFree to use e.g. malloc() and free(), you can omit this extra library.
-        extra_libs=[os.path.join(tvm.micro.build.CRT_ROOT_DIR, "memory")],
+        extra_libs=[tvm.micro.get_standalone_crt_lib("memory")],
     )
 
 
diff --git a/docs/_sources/tutorials/micro/sg_execution_times.rst.txt b/docs/_sources/tutorials/micro/sg_execution_times.rst.txt
index 95d504b..ac2faac 100644
--- a/docs/_sources/tutorials/micro/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorials/micro/sg_execution_times.rst.txt
@@ -5,7 +5,7 @@
 
 Computation times
 =================
-**00:07.239** total execution time for **tutorials_micro** files:
+**00:07.466** total execution time for **tutorials_micro** files:
 
-- **00:07.039**: :ref:`sphx_glr_tutorials_micro_micro_tflite.py` (``micro_tflite.py``)
+- **00:07.266**: :ref:`sphx_glr_tutorials_micro_micro_tflite.py` (``micro_tflite.py``)
 - **00:00.200**: :ref:`sphx_glr_tutorials_micro_micro_reference_vm.py` (``micro_reference_vm.py``)
diff --git a/docs/_sources/tutorials/optimize/opt_conv_cuda.rst.txt b/docs/_sources/tutorials/optimize/opt_conv_cuda.rst.txt
index 0f7f306..b70af13 100644
--- a/docs/_sources/tutorials/optimize/opt_conv_cuda.rst.txt
+++ b/docs/_sources/tutorials/optimize/opt_conv_cuda.rst.txt
@@ -296,7 +296,7 @@ latency of convolution.
 
  .. code-block:: none
 
-    Convolution: 53.307043 ms
+    Convolution: 53.482543 ms
 
 
 
diff --git a/docs/_sources/tutorials/optimize/opt_conv_tensorcore.rst.txt b/docs/_sources/tutorials/optimize/opt_conv_tensorcore.rst.txt
index 0c5ad32..226dabb 100644
--- a/docs/_sources/tutorials/optimize/opt_conv_tensorcore.rst.txt
+++ b/docs/_sources/tutorials/optimize/opt_conv_tensorcore.rst.txt
@@ -624,7 +624,7 @@ be able to run on our build server
 
  .. code-block:: none
 
-    conv2d with tensor core: 12.251759 ms
+    conv2d with tensor core: 9.780257 ms
 
 
 
diff --git a/docs/_sources/tutorials/optimize/opt_gemm.rst.txt b/docs/_sources/tutorials/optimize/opt_gemm.rst.txt
index 7db6bae..6dafcbb 100644
--- a/docs/_sources/tutorials/optimize/opt_gemm.rst.txt
+++ b/docs/_sources/tutorials/optimize/opt_gemm.rst.txt
@@ -118,8 +118,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
 
  .. code-block:: none
 
-    Numpy running time: 0.011535
-    Baseline: 3.228378
+    Numpy running time: 0.007555
+    Baseline: 3.509955
 
 
 
@@ -206,7 +206,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
 
  .. code-block:: none
 
-    Opt1: 0.290805
+    Opt1: 0.294489
 
 
 
@@ -300,7 +300,7 @@ In this tutorial, we chose to vectorize the inner loop row data since it is cach
 
  .. code-block:: none
 
-    Opt2: 0.325919
+    Opt2: 0.323880
 
 
 
@@ -389,7 +389,7 @@ the access pattern for A matrix is more cache friendly.
 
  .. code-block:: none
 
-    Opt3: 0.111198
+    Opt3: 0.109823
 
 
 
@@ -499,7 +499,7 @@ the corresponding value from the packed array.
 
  .. code-block:: none
 
-    Opt4: 0.130241
+    Opt4: 0.106586
 
 
 
@@ -523,8 +523,8 @@ Here is the generated IR after array packing.
 
     primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], []),
-                 C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
+      buffers = {C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
+                 B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], []),
                  A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], [])}
       buffer_map = {A_1: A, B_1: B, C_1: C} {
       attr [packedB: Pointer(float32x32)] "storage_scope" = "global";
@@ -609,7 +609,7 @@ write to C when all the block results are ready.
 
  .. code-block:: none
 
-    Opt5: 0.097576
+    Opt5: 0.106497
 
 
 
@@ -725,7 +725,7 @@ Futhermore, we can also utilize multi-core processors to do the thread-level par
 
  .. code-block:: none
 
-    Opt6: 0.032168
+    Opt6: 0.034866
 
 
 
diff --git a/docs/_sources/tutorials/optimize/sg_execution_times.rst.txt b/docs/_sources/tutorials/optimize/sg_execution_times.rst.txt
index ab91cc9..463d839 100644
--- a/docs/_sources/tutorials/optimize/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorials/optimize/sg_execution_times.rst.txt
@@ -5,9 +5,9 @@
 
 Computation times
 =================
-**00:32.312** total execution time for **tutorials_optimize** files:
+**00:27.942** total execution time for **tutorials_optimize** files:
 
-- **00:25.068**: :ref:`sphx_glr_tutorials_optimize_opt_gemm.py` (``opt_gemm.py``)
-- **00:04.598**: :ref:`sphx_glr_tutorials_optimize_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``)
-- **00:02.435**: :ref:`sphx_glr_tutorials_optimize_opt_conv_cuda.py` (``opt_conv_cuda.py``)
-- **00:00.211**: :ref:`sphx_glr_tutorials_optimize_opt_matmul_auto_tensorcore.py` (``opt_matmul_auto_tensorcore.py``)
+- **00:25.450**: :ref:`sphx_glr_tutorials_optimize_opt_gemm.py` (``opt_gemm.py``)
+- **00:01.264**: :ref:`sphx_glr_tutorials_optimize_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``)
+- **00:01.010**: :ref:`sphx_glr_tutorials_optimize_opt_conv_cuda.py` (``opt_conv_cuda.py``)
+- **00:00.219**: :ref:`sphx_glr_tutorials_optimize_opt_matmul_auto_tensorcore.py` (``opt_matmul_auto_tensorcore.py``)
diff --git a/docs/_sources/tutorials/topi/intro_topi.rst.txt b/docs/_sources/tutorials/topi/intro_topi.rst.txt
index d1d5dfc..84a10c5 100644
--- a/docs/_sources/tutorials/topi/intro_topi.rst.txt
+++ b/docs/_sources/tutorials/topi/intro_topi.rst.txt
@@ -231,7 +231,7 @@ As you can see, scheduled stages of computation have been accumulated and we can
 
  .. code-block:: none
 
-    [stage(a, placeholder(a, 0x1962a8290)), stage(b, placeholder(b, 0x18fb693b0)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range( [...]
+    [stage(a, placeholder(a, 0x17cbc1cd0)), stage(b, placeholder(b, 0x152bfe630)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range( [...]
 
 
 
diff --git a/docs/_sources/tutorials/topi/sg_execution_times.rst.txt b/docs/_sources/tutorials/topi/sg_execution_times.rst.txt
index da8938b..21d6fa3 100644
--- a/docs/_sources/tutorials/topi/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorials/topi/sg_execution_times.rst.txt
@@ -5,6 +5,6 @@
 
 Computation times
 =================
-**00:00.651** total execution time for **tutorials_topi** files:
+**00:00.687** total execution time for **tutorials_topi** files:
 
-- **00:00.651**: :ref:`sphx_glr_tutorials_topi_intro_topi.py` (``intro_topi.py``)
+- **00:00.687**: :ref:`sphx_glr_tutorials_topi_intro_topi.py` (``intro_topi.py``)
diff --git a/docs/_sources/vta/tutorials/autotvm/sg_execution_times.rst.txt b/docs/_sources/vta/tutorials/autotvm/sg_execution_times.rst.txt
index a072155..22aa451 100644
--- a/docs/_sources/vta/tutorials/autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/vta/tutorials/autotvm/sg_execution_times.rst.txt
@@ -5,6 +5,6 @@
 
 Computation times
 =================
-**00:07.668** total execution time for **vta_tutorials_autotvm** files:
+**00:07.694** total execution time for **vta_tutorials_autotvm** files:
 
-- **00:07.668**: :ref:`sphx_glr_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``)
+- **00:07.694**: :ref:`sphx_glr_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``)
diff --git a/docs/_sources/vta/tutorials/autotvm/tune_relay_vta.rst.txt b/docs/_sources/vta/tutorials/autotvm/tune_relay_vta.rst.txt
index e7668a7..eaf4b4b 100644
--- a/docs/_sources/vta/tutorials/autotvm/tune_relay_vta.rst.txt
+++ b/docs/_sources/vta/tutorials/autotvm/tune_relay_vta.rst.txt
@@ -225,6 +225,7 @@ Here we use an Pynq-Z1 board as an example.
                 port=tracker_port,
                 number=5,
                 timeout=60,
+                module_loader=vta.module_loader(),
                 # check_correctness=True, # TODO: re-enable when check_correctness works again.
             ),
         ),
@@ -498,7 +499,7 @@ Finally, we launch tuning jobs and evaluate the end-to-end performance.
  .. code-block:: none
 
     Extract tasks...
-
    ...1%, 0.01 MB, 47 KB/s, 0 seconds passed
    ...2%, 0.02 MB, 94 KB/s, 0 seconds passed
    ...3%, 0.02 MB, 141 KB/s, 0 seconds passed
    ...4%, 0.03 MB, 188 KB/s, 0 seconds passed
    ...5%, 0.04 MB, 227 KB/s, 0 seconds passed
    ...6%, 0.05 MB, 272 KB/s, 0 seconds passed
    ...7%, 0.05 MB, 318 KB/s, 0 seconds passed
    ...8%, 0.06 MB, 363 KB/s, 0 seconds passed
    ...9%, 0.07 MB, 408 KB/s, 0 seconds passed
    ...10%, 0.08 MB, 452 KB/s, 0 seconds passed
    ...11%, 0.09 MB, 497 KB/s, 0 seconds passed
    ...13%, 0.09 MB, 540 KB/s, 0 seconds passed
    ...14%, 0.10 MB, 585 KB/s, 0 seconds passed
    ...15%, 0.11 MB, 627 KB/s, 0 seconds passed
    ...16%, 0.12 MB, 660 KB/s, 0 seconds passed
    ...17%, 0.12 MB, 703 KB/s, 0 seconds passed
    ...18%, 0.13 MB, 746 KB/s, 0 seconds passed
    ...19%, 0.14 MB, 790 KB/s, 0 seconds passed
    ...20%, 0.15 MB, 831 KB/s, 0 seconds passed
    ...21%, 0.16 MB, 875 KB/s, 0 seconds passed
    ...22%, 0.16 MB, 918 KB/s, 0 seconds passed
 
    ...23%, 0.17 MB, 961 KB/s, 0 seconds passed
    ...24%, 0.18 MB, 1004 KB/s, 0 seconds passed
    ...26%, 0.19 MB, 1044 KB/s, 0 seconds passed
    ...27%, 0.20 MB, 1084 KB/s, 0 seconds passed
    ...28%, 0.20 MB, 1127 KB/s, 0 seconds passed
    ...29%, 0.21 MB, 1170 KB/s, 0 seconds passed
    ...30%, 0.22 MB, 1213 KB/s, 0 seconds passed
    ...31%, 0.23 MB, 1254 KB/s, 0 seconds passed
    ...32%, 0.23 MB, 1297 KB/s, 0 seconds passed
    ...33%, 0.24 MB, 1318 KB/s, 0 seconds passed
    ...34%, 0.25 MB, 1360 KB/s, 0 seconds passed
    ...35%, 0.26 MB, 1402 KB/s, 0 seconds passed
    ...36%, 0.27 MB, 1444 KB/s, 0 seconds passed
    ...38%, 0.27 MB, 1486 KB/s, 0 seconds passed
    ...39%, 0.28 MB, 1528 KB/s, 0 seconds passed
    ...40%, 0.29 MB, 1569 KB/s, 0 seconds passed
    ...41%, 0.30 MB, 1611 KB/s, 0 seconds passed
    ...42%, 0.30 MB, 1650 KB/s, 0 seconds passed
    ...43%, 0.31 MB, 1692 KB/s, 0 seconds passed
    ...44%, 0.32 MB, 1733 KB/s, 0 seconds passed
    ...45%, 0.33 
 MB, 1775 KB/s, 0 seconds passed
    ...46%, 0.34 MB, 1817 KB/s, 0 seconds passed
    ...47%, 0.34 MB, 1858 KB/s, 0 seconds passed
    ...48%, 0.35 MB, 1900 KB/s, 0 seconds passed
    ...49%, 0.36 MB, 1941 KB/s, 0 seconds passed
    ...51%, 0.37 MB, 1973 KB/s, 0 seconds passed
    ...52%, 0.38 MB, 2014 KB/s, 0 seconds passed
    ...53%, 0.38 MB, 2055 KB/s, 0 seconds passed
    ...54%, 0.39 MB, 2096 KB/s, 0 seconds passed
    ...55%, 0.40 MB, 2136 KB/s, 0 seconds passed
    ...56%, 0.41 MB, 2177 KB/s, 0 seconds passed
    ...57%, 0.41 MB, 2218 KB/s, 0 seconds passed
    ...58%, 0.42 MB, 2259 KB/s, 0 seconds passed
    ...59%, 0.43 MB, 2300 KB/s, 0 seconds passed
    ...60%, 0.44 MB, 2341 KB/s, 0 seconds passed
    ...61%, 0.45 MB, 2382 KB/s, 0 seconds passed
    ...63%, 0.45 MB, 2423 KB/s, 0 seconds passed
    ...64%, 0.46 MB, 2464 KB/s, 0 seconds passed
    ...65%, 0.47 MB, 2505 KB/s, 0 seconds passed
    ...66%, 0.48 MB, 2546 KB/s, 0 seconds passed
    ...67%, 0.48 MB, 2586 KB/s, 0 
 seconds passed
    ...68%, 0.49 MB, 2627 KB/s, 0 seconds passed
    ...69%, 0.50 MB, 2667 KB/s, 0 seconds passed
    ...70%, 0.51 MB, 2708 KB/s, 0 seconds passed
    ...71%, 0.52 MB, 2749 KB/s, 0 seconds passed
    ...72%, 0.52 MB, 2790 KB/s, 0 seconds passed
    ...73%, 0.53 MB, 2830 KB/s, 0 seconds passed
    ...74%, 0.54 MB, 2871 KB/s, 0 seconds passed
    ...76%, 0.55 MB, 2883 KB/s, 0 seconds passed
    ...77%, 0.55 MB, 2923 KB/s, 0 seconds passed
    ...78%, 0.56 MB, 2962 KB/s, 0 seconds passed
    ...79%, 0.57 MB, 3002 KB/s, 0 seconds passed
    ...80%, 0.58 MB, 3041 KB/s, 0 seconds passed
    ...81%, 0.59 MB, 3081 KB/s, 0 seconds passed
    ...82%, 0.59 MB, 3121 KB/s, 0 seconds passed
    ...83%, 0.60 MB, 3161 KB/s, 0 seconds passed
    ...84%, 0.61 MB, 3201 KB/s, 0 seconds passed
    ...85%, 0.62 MB, 3241 KB/s, 0 seconds passed
    ...86%, 0.62 MB, 3279 KB/s, 0 seconds passed
    ...87%, 0.63 MB, 3318 KB/s, 0 seconds passed
    ...89%, 0.64 MB, 3358 KB/s, 0 seconds passed
  
   ...90%, 0.65 MB, 3396 KB/s, 0 seconds passed
    ...91%, 0.66 MB, 3436 KB/s, 0 seconds passed
    ...92%, 0.66 MB, 3475 KB/s, 0 seconds passed
    ...93%, 0.67 MB, 3515 KB/s, 0 seconds passed
    ...94%, 0.68 MB, 3554 KB/s, 0 seconds passed
    ...95%, 0.69 MB, 3594 KB/s, 0 seconds passed
    ...96%, 0.70 MB, 3634 KB/s, 0 seconds passed
    ...97%, 0.70 MB, 3673 KB/s, 0 seconds passed
    ...98%, 0.71 MB, 3713 KB/s, 0 seconds passed
    ...99%, 0.72 MB, 3752 KB/s, 0 seconds passed
    ...100%, 0.73 MB, 3790 KB/s, 0 seconds passed
+
    ...1%, 0.01 MB, 217 KB/s, 0 seconds passed
    ...2%, 0.02 MB, 432 KB/s, 0 seconds passed
    ...3%, 0.02 MB, 641 KB/s, 0 seconds passed
    ...4%, 0.03 MB, 834 KB/s, 0 seconds passed
    ...5%, 0.04 MB, 1017 KB/s, 0 seconds passed
    ...6%, 0.05 MB, 1083 KB/s, 0 seconds passed
    ...7%, 0.05 MB, 1261 KB/s, 0 seconds passed
    ...8%, 0.06 MB, 1416 KB/s, 0 seconds passed
    ...9%, 0.07 MB, 1590 KB/s, 0 seconds passed
    ...10%, 0.08 MB, 1734 KB/s, 0 seconds passed
    ...11%, 0.09 MB, 1903 KB/s, 0 seconds passed
    ...13%, 0.09 MB, 2034 KB/s, 0 seconds passed
    ...14%, 0.10 MB, 1962 KB/s, 0 seconds passed
    ...15%, 0.11 MB, 2109 KB/s, 0 seconds passed
    ...16%, 0.12 MB, 2224 KB/s, 0 seconds passed
    ...17%, 0.12 MB, 2368 KB/s, 0 seconds passed
    ...18%, 0.13 MB, 2511 KB/s, 0 seconds passed
    ...19%, 0.14 MB, 2654 KB/s, 0 seconds passed
    ...20%, 0.15 MB, 2767 KB/s, 0 seconds passed
    ...21%, 0.16 MB, 2907 KB/s, 0 seconds passed
    ...22%, 0.16 MB, 3047 KB/
 s, 0 seconds passed
    ...23%, 0.17 MB, 3186 KB/s, 0 seconds passed
    ...24%, 0.18 MB, 3310 KB/s, 0 seconds passed
    ...26%, 0.19 MB, 3447 KB/s, 0 seconds passed
    ...27%, 0.20 MB, 3584 KB/s, 0 seconds passed
    ...28%, 0.20 MB, 3485 KB/s, 0 seconds passed
    ...29%, 0.21 MB, 3547 KB/s, 0 seconds passed
    ...30%, 0.22 MB, 3669 KB/s, 0 seconds passed
    ...31%, 0.23 MB, 3792 KB/s, 0 seconds passed
    ...32%, 0.23 MB, 3917 KB/s, 0 seconds passed
    ...33%, 0.24 MB, 4011 KB/s, 0 seconds passed
    ...34%, 0.25 MB, 4134 KB/s, 0 seconds passed
    ...35%, 0.26 MB, 4259 KB/s, 0 seconds passed
    ...36%, 0.27 MB, 4373 KB/s, 0 seconds passed
    ...38%, 0.27 MB, 4497 KB/s, 0 seconds passed
    ...39%, 0.28 MB, 4619 KB/s, 0 seconds passed
    ...40%, 0.29 MB, 4743 KB/s, 0 seconds passed
    ...41%, 0.30 MB, 4833 KB/s, 0 seconds passed
    ...42%, 0.30 MB, 4955 KB/s, 0 seconds passed
    ...43%, 0.31 MB, 5076 KB/s, 0 seconds passed
    ...44%, 0.32 MB, 5198 KB/s, 0 seconds pass
 ed
    ...45%, 0.33 MB, 5288 KB/s, 0 seconds passed
    ...46%, 0.34 MB, 5409 KB/s, 0 seconds passed
    ...47%, 0.34 MB, 5528 KB/s, 0 seconds passed
    ...48%, 0.35 MB, 5648 KB/s, 0 seconds passed
    ...49%, 0.36 MB, 5747 KB/s, 0 seconds passed
    ...51%, 0.37 MB, 5866 KB/s, 0 seconds passed
    ...52%, 0.38 MB, 5983 KB/s, 0 seconds passed
    ...53%, 0.38 MB, 6102 KB/s, 0 seconds passed
    ...54%, 0.39 MB, 6218 KB/s, 0 seconds passed
    ...55%, 0.40 MB, 6336 KB/s, 0 seconds passed
    ...56%, 0.41 MB, 6401 KB/s, 0 seconds passed
    ...57%, 0.41 MB, 6518 KB/s, 0 seconds passed
    ...58%, 0.42 MB, 6633 KB/s, 0 seconds passed
    ...59%, 0.43 MB, 6749 KB/s, 0 seconds passed
    ...60%, 0.44 MB, 6641 KB/s, 0 seconds passed
    ...61%, 0.45 MB, 6749 KB/s, 0 seconds passed
    ...63%, 0.45 MB, 6740 KB/s, 0 seconds passed
    ...64%, 0.46 MB, 6849 KB/s, 0 seconds passed
    ...65%, 0.47 MB, 6887 KB/s, 0 seconds passed
    ...66%, 0.48 MB, 6995 KB/s, 0 seconds passed
    ...67%, 0.
 48 MB, 7102 KB/s, 0 seconds passed
    ...68%, 0.49 MB, 7210 KB/s, 0 seconds passed
    ...69%, 0.50 MB, 7316 KB/s, 0 seconds passed
    ...70%, 0.51 MB, 7424 KB/s, 0 seconds passed
    ...71%, 0.52 MB, 7530 KB/s, 0 seconds passed
    ...72%, 0.52 MB, 7637 KB/s, 0 seconds passed
    ...73%, 0.53 MB, 7742 KB/s, 0 seconds passed
    ...74%, 0.54 MB, 7849 KB/s, 0 seconds passed
    ...76%, 0.55 MB, 7954 KB/s, 0 seconds passed
    ...77%, 0.55 MB, 8061 KB/s, 0 seconds passed
    ...78%, 0.56 MB, 8166 KB/s, 0 seconds passed
    ...79%, 0.57 MB, 8272 KB/s, 0 seconds passed
    ...80%, 0.58 MB, 8376 KB/s, 0 seconds passed
    ...81%, 0.59 MB, 8482 KB/s, 0 seconds passed
    ...82%, 0.59 MB, 8575 KB/s, 0 seconds passed
    ...83%, 0.60 MB, 8679 KB/s, 0 seconds passed
    ...84%, 0.61 MB, 8783 KB/s, 0 seconds passed
    ...85%, 0.62 MB, 8888 KB/s, 0 seconds passed
    ...86%, 0.62 MB, 8944 KB/s, 0 seconds passed
    ...87%, 0.63 MB, 9047 KB/s, 0 seconds passed
    ...89%, 0.64 MB, 9149 KB/s,
  0 seconds passed
    ...90%, 0.65 MB, 9253 KB/s, 0 seconds passed
    ...91%, 0.66 MB, 9354 KB/s, 0 seconds passed
    ...92%, 0.66 MB, 9411 KB/s, 0 seconds passed
    ...93%, 0.67 MB, 9513 KB/s, 0 seconds passed
    ...94%, 0.68 MB, 9614 KB/s, 0 seconds passed
    ...95%, 0.69 MB, 9716 KB/s, 0 seconds passed
    ...96%, 0.70 MB, 9816 KB/s, 0 seconds passed
    ...97%, 0.70 MB, 9918 KB/s, 0 seconds passed
    ...98%, 0.71 MB, 9942 KB/s, 0 seconds passed
    ...99%, 0.72 MB, 10042 KB/s, 0 seconds passed
    ...100%, 0.73 MB, 10130 KB/s, 0 seconds passed
     Extracted 10 conv2d tasks:
     (1, 14, 14, 256, 512, 1, 1, 0, 0, 2, 2)
     (1, 28, 28, 128, 256, 1, 1, 0, 0, 2, 2)
diff --git a/docs/_sources/vta/tutorials/frontend/deploy_classification.rst.txt b/docs/_sources/vta/tutorials/frontend/deploy_classification.rst.txt
index 764bb1e..3926c56 100644
--- a/docs/_sources/vta/tutorials/frontend/deploy_classification.rst.txt
+++ b/docs/_sources/vta/tutorials/frontend/deploy_classification.rst.txt
@@ -243,8 +243,8 @@ The compilation steps are:
 
  .. code-block:: none
 
-
    ...12%, 0.01 MB, 38 KB/s, 0 seconds passed
    ...25%, 0.02 MB, 77 KB/s, 0 seconds passed
    ...38%, 0.02 MB, 115 KB/s, 0 seconds passed
    ...51%, 0.03 MB, 153 KB/s, 0 seconds passed
    ...64%, 0.04 MB, 188 KB/s, 0 seconds passed
    ...77%, 0.05 MB, 226 KB/s, 0 seconds passed
    ...89%, 0.05 MB, 262 KB/s, 0 seconds passed
    ...100%, 0.06 MB, 299 KB/s, 0 seconds passed
-    resnet18_v1 inference graph built in 8.25s!
+
    ...12%, 0.01 MB, 52 KB/s, 0 seconds passed
    ...25%, 0.02 MB, 103 KB/s, 0 seconds passed
    ...38%, 0.02 MB, 154 KB/s, 0 seconds passed
    ...51%, 0.03 MB, 206 KB/s, 0 seconds passed
    ...64%, 0.04 MB, 256 KB/s, 0 seconds passed
    ...77%, 0.05 MB, 300 KB/s, 0 seconds passed
    ...89%, 0.05 MB, 349 KB/s, 0 seconds passed
    ...100%, 0.06 MB, 399 KB/s, 0 seconds passed
+    resnet18_v1 inference graph built in 8.42s!
 
 
 
diff --git a/docs/_sources/vta/tutorials/frontend/sg_execution_times.rst.txt b/docs/_sources/vta/tutorials/frontend/sg_execution_times.rst.txt
index 1a11ac5..1edd179 100644
--- a/docs/_sources/vta/tutorials/frontend/sg_execution_times.rst.txt
+++ b/docs/_sources/vta/tutorials/frontend/sg_execution_times.rst.txt
@@ -5,6 +5,6 @@
 
 Computation times
 =================
-**00:29.337** total execution time for **vta_tutorials_frontend** files:
+**00:29.675** total execution time for **vta_tutorials_frontend** files:
 
-- **00:29.337**: :ref:`sphx_glr_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``)
+- **00:29.675**: :ref:`sphx_glr_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``)
diff --git a/docs/_sources/vta/tutorials/optimize/matrix_multiply_opt.rst.txt b/docs/_sources/vta/tutorials/optimize/matrix_multiply_opt.rst.txt
index 5b75d4d..cc82bc9 100644
--- a/docs/_sources/vta/tutorials/optimize/matrix_multiply_opt.rst.txt
+++ b/docs/_sources/vta/tutorials/optimize/matrix_multiply_opt.rst.txt
@@ -189,8 +189,8 @@ Those include:
 
     primfn(data_1: handle, weight_1: handle, res_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {weight: Buffer(weight_2: Pointer(int8), int8, [64, 64, 16, 16], []),
-                 res: Buffer(res_2: Pointer(int8), int8, [1, 64, 1, 16], []),
+      buffers = {res: Buffer(res_2: Pointer(int8), int8, [1, 64, 1, 16], []),
+                 weight: Buffer(weight_2: Pointer(int8), int8, [64, 64, 16, 16], []),
                  data: Buffer(data_2: Pointer(int8), int8, [1, 64, 1, 16], [])}
       buffer_map = {data_1: data, weight_1: weight, res_1: res} {
       attr [data_buf: Pointer(int8)] "storage_scope" = "global";
diff --git a/docs/_sources/vta/tutorials/optimize/sg_execution_times.rst.txt b/docs/_sources/vta/tutorials/optimize/sg_execution_times.rst.txt
index 62d2ac4..9c3a29b 100644
--- a/docs/_sources/vta/tutorials/optimize/sg_execution_times.rst.txt
+++ b/docs/_sources/vta/tutorials/optimize/sg_execution_times.rst.txt
@@ -5,7 +5,7 @@
 
 Computation times
 =================
-**00:03.816** total execution time for **vta_tutorials_optimize** files:
+**00:03.859** total execution time for **vta_tutorials_optimize** files:
 
-- **00:03.269**: :ref:`sphx_glr_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``)
-- **00:00.547**: :ref:`sphx_glr_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``)
+- **00:03.288**: :ref:`sphx_glr_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``)
+- **00:00.570**: :ref:`sphx_glr_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``)
diff --git a/docs/_sources/vta/tutorials/sg_execution_times.rst.txt b/docs/_sources/vta/tutorials/sg_execution_times.rst.txt
index 34df389..6eae28c 100644
--- a/docs/_sources/vta/tutorials/sg_execution_times.rst.txt
+++ b/docs/_sources/vta/tutorials/sg_execution_times.rst.txt
@@ -5,7 +5,7 @@
 
 Computation times
 =================
-**00:01.011** total execution time for **vta_tutorials** files:
+**00:01.051** total execution time for **vta_tutorials** files:
 
-- **00:00.516**: :ref:`sphx_glr_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``)
-- **00:00.495**: :ref:`sphx_glr_vta_tutorials_vta_get_started.py` (``vta_get_started.py``)
+- **00:00.536**: :ref:`sphx_glr_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``)
+- **00:00.516**: :ref:`sphx_glr_vta_tutorials_vta_get_started.py` (``vta_get_started.py``)
diff --git a/docs/_sources/vta/tutorials/vta_get_started.rst.txt b/docs/_sources/vta/tutorials/vta_get_started.rst.txt
index ec845bc..b90d68a 100644
--- a/docs/_sources/vta/tutorials/vta_get_started.rst.txt
+++ b/docs/_sources/vta/tutorials/vta_get_started.rst.txt
@@ -423,8 +423,8 @@ with an :code:`env.alu` pragma.
 
     primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {C: Buffer(C_2: Pointer(int8), int8, [1, 64, 1, 16], []),
-                 B: Buffer(B_2: Pointer(int32), int32, [1, 64, 1, 16], []),
+      buffers = {B: Buffer(B_2: Pointer(int32), int32, [1, 64, 1, 16], []),
+                 C: Buffer(C_2: Pointer(int8), int8, [1, 64, 1, 16], []),
                  A: Buffer(A_2: Pointer(int32), int32, [1, 64, 1, 16], [])}
       buffer_map = {A_1: A, B_1: B, C_1: C} {
       attr [A_buf: Pointer(int32)] "storage_scope" = "local.acc_buffer" {
diff --git a/docs/api/doxygen/algorithm_8h.html b/docs/api/doxygen/algorithm_8h.html
index 05857bd..ad6d0af 100644
--- a/docs/api/doxygen/algorithm_8h.html
+++ b/docs/api/doxygen/algorithm_8h.html
@@ -99,7 +99,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 </div><div class="textblock"><div class="dynheader">
 Include dependency graph for algorithm.h:</div>
 <div class="dyncontent">
-<div class="center"><iframe scrolling="no" frameborder="0" src="algorithm_8h__incl.svg" width="5030" height="1440"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
+<div class="center"><iframe scrolling="no" frameborder="0" src="algorithm_8h__incl.svg" width="4972" height="1515"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
 </div>
 </div>
 </div>
diff --git a/docs/api/doxygen/algorithm_8h__incl.svg b/docs/api/doxygen/algorithm_8h__incl.svg
index 0d2ac55..458b38e 100644
--- a/docs/api/doxygen/algorithm_8h__incl.svg
+++ b/docs/api/doxygen/algorithm_8h__incl.svg
@@ -4,1478 +4,1483 @@
 <!-- Generated by graphviz version 2.38.0 (20140413.2041)
  -->
 <!-- Title: include/tvm/relay/attrs/algorithm.h Pages: 1 -->
-<svg width="3772pt" height="1080pt"
- viewBox="0.00 0.00 3771.69 1080.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
-<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 1076)">
+<svg width="3729pt" height="1136pt"
+ viewBox="0.00 0.00 3729.00 1136.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 1132)">
 <title>include/tvm/relay/attrs/algorithm.h</title>
-<polygon fill="white" stroke="none" points="-4,4 -4,-1076 3767.69,-1076 3767.69,4 -4,4"/>
+<polygon fill="white" stroke="none" points="-4,4 -4,-1132 3725,-1132 3725,4 -4,4"/>
 <!-- Node1 -->
 <g id="node1" class="node"><title>Node1</title>
-<polygon fill="#bfbfbf" stroke="black" points="826,-1041.5 826,-1071.5 950,-1071.5 950,-1041.5 826,-1041.5"/>
-<text text-anchor="start" x="834" y="-1059.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/relay/attrs</text>
-<text text-anchor="middle" x="888" y="-1048.5" font-family="Helvetica,sans-Serif" font-size="10.00">/algorithm.h</text>
+<polygon fill="#bfbfbf" stroke="black" points="2053,-1097.5 2053,-1127.5 2177,-1127.5 2177,-1097.5 2053,-1097.5"/>
+<text text-anchor="start" x="2061" y="-1115.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/relay/attrs</text>
+<text text-anchor="middle" x="2115" y="-1104.5" font-family="Helvetica,sans-Serif" font-size="10.00">/algorithm.h</text>
 </g>
 <!-- Node2 -->
 <g id="node2" class="node"><title>Node2</title>
 <g id="a_node2"><a xlink:href="ir_2attrs_8h.html" target="_top" xlink:title="Helpers for attribute objects. ">
-<polygon fill="white" stroke="black" points="982.5,-705.5 982.5,-724.5 1061.5,-724.5 1061.5,-705.5 982.5,-705.5"/>
-<text text-anchor="middle" x="1022" y="-712.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/attrs.h</text>
+<polygon fill="white" stroke="black" points="2988.5,-761.5 2988.5,-780.5 3067.5,-780.5 3067.5,-761.5 2988.5,-761.5"/>
+<text text-anchor="middle" x="3028" y="-768.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/attrs.h</text>
 </a>
 </g>
 </g>
 <!-- Node1&#45;&gt;Node2 -->
 <g id="edge1" class="edge"><title>Node1&#45;&gt;Node2</title>
-<path fill="none" stroke="midnightblue" d="M825.875,-1047.56C764.584,-1035.83 680,-1007.43 680,-940 680,-940 680,-940 680,-826 680,-765.296 877.737,-733.308 972.5,-721.464"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="972.933,-724.938 982.434,-720.25 972.083,-717.989 972.933,-724.938"/>
+<path fill="none" stroke="midnightblue" d="M2177.12,-1108.66C2391.38,-1098.44 3098.36,-1061.17 3318,-1005 3378.16,-989.616 3412.01,-1000.97 3446,-949 3535.35,-812.382 3206.44,-780.967 3077.63,-773.952"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3077.79,-770.455 3067.62,-773.434 3077.42,-777.446 3077.79,-770.455"/>
 </g>
 <!-- Node13 -->
 <g id="node13" class="node"><title>Node13</title>
-<polygon fill="white" stroke="#bfbfbf" points="1818,-6 1818,-25 1862,-25 1862,-6 1818,-6"/>
-<text text-anchor="middle" x="1840" y="-13" font-family="Helvetica,sans-Serif" font-size="10.00">string</text>
+<polygon fill="white" stroke="#bfbfbf" points="665,-6 665,-25 709,-25 709,-6 665,-6"/>
+<text text-anchor="middle" x="687" y="-13" font-family="Helvetica,sans-Serif" font-size="10.00">string</text>
 </g>
 <!-- Node1&#45;&gt;Node13 -->
-<g id="edge222" class="edge"><title>Node1&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M825.842,-1053.59C629.893,-1046.76 38,-1019.54 38,-940 38,-940 38,-940 38,-372.5 38,-175.872 165.435,-106.769 358,-67 504.369,-36.772 1604.65,-19.8304 1807.8,-16.9438"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1807.88,-20.4431 1817.83,-16.8024 1807.79,-13.4438 1807.88,-20.4431"/>
+<g id="edge223" class="edge"><title>Node1&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M2052.85,-1111.44C1703.5,-1110.69 0,-1101.61 0,-996 0,-996 0,-996 0,-132 0,-95.8221 17.2842,-84.4053 49,-67 102.363,-37.7148 531.855,-21.5627 654.441,-17.5181"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="654.802,-21.0083 664.683,-17.185 654.575,-14.012 654.802,-21.0083"/>
 </g>
 <!-- Node34 -->
 <g id="node34" class="node"><title>Node34</title>
 <g id="a_node34"><a xlink:href="base_8h.html" target="_top" xlink:title="Base classes for the Relay IR. ">
-<polygon fill="white" stroke="black" points="2598.5,-873.5 2598.5,-892.5 2695.5,-892.5 2695.5,-873.5 2598.5,-873.5"/>
-<text text-anchor="middle" x="2647" y="-880.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/relay/base.h</text>
+<polygon fill="white" stroke="black" points="2000.5,-929.5 2000.5,-948.5 2097.5,-948.5 2097.5,-929.5 2000.5,-929.5"/>
+<text text-anchor="middle" x="2049" y="-936.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/relay/base.h</text>
 </a>
 </g>
 </g>
 <!-- Node1&#45;&gt;Node34 -->
-<g id="edge126" class="edge"><title>Node1&#45;&gt;Node34</title>
-<path fill="none" stroke="midnightblue" d="M950.261,-1053.35C1203.38,-1044.31 2155.77,-1007.04 2451,-949 2511.71,-937.065 2579.64,-911.682 2617.46,-896.406"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2618.92,-899.591 2626.86,-892.57 2616.28,-893.111 2618.92,-899.591"/>
+<g id="edge127" class="edge"><title>Node1&#45;&gt;Node34</title>
+<path fill="none" stroke="midnightblue" d="M2109.6,-1097.47C2097.71,-1066.57 2069.28,-992.69 2056.02,-958.251"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2059.15,-956.635 2052.29,-948.56 2052.62,-959.15 2059.15,-956.635"/>
 </g>
 <!-- Node38 -->
 <g id="node38" class="node"><title>Node38</title>
 <g id="a_node38"><a xlink:href="relay_2expr_8h.html" target="_top" xlink:title="Relay expression language. ">
-<polygon fill="white" stroke="black" points="1050,-985.5 1050,-1004.5 1144,-1004.5 1144,-985.5 1050,-985.5"/>
-<text text-anchor="middle" x="1097" y="-992.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/relay/expr.h</text>
+<polygon fill="white" stroke="black" points="2134,-1041.5 2134,-1060.5 2228,-1060.5 2228,-1041.5 2134,-1041.5"/>
+<text text-anchor="middle" x="2181" y="-1048.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/relay/expr.h</text>
 </a>
 </g>
 </g>
 <!-- Node1&#45;&gt;Node38 -->
-<g id="edge154" class="edge"><title>Node1&#45;&gt;Node38</title>
-<path fill="none" stroke="midnightblue" d="M937.265,-1041.47C973.952,-1031.03 1023.38,-1016.96 1057.49,-1007.25"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1058.46,-1010.61 1067.12,-1004.51 1056.54,-1003.88 1058.46,-1010.61"/>
+<g id="edge155" class="edge"><title>Node1&#45;&gt;Node38</title>
+<path fill="none" stroke="midnightblue" d="M2130.64,-1097.4C2140.63,-1088.4 2153.58,-1076.72 2163.85,-1067.47"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2166.31,-1069.95 2171.4,-1060.66 2161.63,-1064.75 2166.31,-1069.95"/>
 </g>
 <!-- Node3 -->
 <g id="node3" class="node"><title>Node3</title>
-<polygon fill="white" stroke="#bfbfbf" points="974.5,-649.5 974.5,-668.5 1069.5,-668.5 1069.5,-649.5 974.5,-649.5"/>
-<text text-anchor="middle" x="1022" y="-656.5" font-family="Helvetica,sans-Serif" font-size="10.00">dmlc/common.h</text>
+<polygon fill="white" stroke="#bfbfbf" points="2961.5,-705.5 2961.5,-724.5 3056.5,-724.5 3056.5,-705.5 2961.5,-705.5"/>
+<text text-anchor="middle" x="3009" y="-712.5" font-family="Helvetica,sans-Serif" font-size="10.00">dmlc/common.h</text>
 </g>
 <!-- Node2&#45;&gt;Node3 -->
 <g id="edge2" class="edge"><title>Node2&#45;&gt;Node3</title>
-<path fill="none" stroke="midnightblue" d="M1022,-705.083C1022,-698.006 1022,-687.861 1022,-678.986"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1025.5,-678.751 1022,-668.751 1018.5,-678.751 1025.5,-678.751"/>
+<path fill="none" stroke="midnightblue" d="M3024.86,-761.083C3022.32,-753.849 3018.64,-743.409 3015.47,-734.397"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3018.7,-733.023 3012.08,-724.751 3012.1,-735.346 3018.7,-733.023"/>
 </g>
 <!-- Node4 -->
 <g id="node4" class="node"><title>Node4</title>
 <g id="a_node4"><a xlink:href="ir_2expr_8h.html" target="_top" xlink:title="Base expr nodes in TVM. ">
-<polygon fill="white" stroke="black" points="2358.5,-649.5 2358.5,-668.5 2435.5,-668.5 2435.5,-649.5 2358.5,-649.5"/>
-<text text-anchor="middle" x="2397" y="-656.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/expr.h</text>
+<polygon fill="white" stroke="black" points="1172.5,-705.5 1172.5,-724.5 1249.5,-724.5 1249.5,-705.5 1172.5,-705.5"/>
+<text text-anchor="middle" x="1211" y="-712.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/expr.h</text>
 </a>
 </g>
 </g>
 <!-- Node2&#45;&gt;Node4 -->
 <g id="edge3" class="edge"><title>Node2&#45;&gt;Node4</title>
-<path fill="none" stroke="midnightblue" d="M1061.67,-712.442C1258.79,-704.701 2132.05,-670.405 2348.39,-661.909"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2348.63,-665.402 2358.48,-661.513 2348.35,-658.408 2348.63,-665.402"/>
+<path fill="none" stroke="midnightblue" d="M2988.36,-768.822C2751.23,-761.775 1521.84,-725.238 1260.06,-717.458"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1259.83,-713.95 1249.73,-717.151 1259.62,-720.947 1259.83,-713.95"/>
 </g>
 <!-- Node2&#45;&gt;Node13 -->
-<g id="edge121" class="edge"><title>Node2&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M1039,-705.37C1060.94,-693.105 1094.43,-669.808 1078,-649 1035.24,-594.853 986.753,-647.489 927,-613 764.884,-519.428 785.888,-422.915 643,-302 577.583,-246.643 521.357,-274.056 480,-199 475.71,-191.215 474.973,-186.331 480,-179 557.913,-65.3874 1609.18,-24.3073 1807.76,-17.5408"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1807.97,-21.0359 1817.85,-17.2019 1807.74,-14.0399 1807.97,-21.0359"/>
+<g id="edge122" class="edge"><title>Node2&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M2988.49,-768.777C2796.92,-762.109 1946.64,-724.025 1275,-557 1138.5,-523.055 1093.27,-527.015 979,-445 819.658,-330.638 720.039,-100.571 693.899,-34.5744"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="697.11,-33.174 690.219,-25.1261 690.587,-35.7146 697.11,-33.174"/>
 </g>
 <!-- Node14 -->
 <g id="node14" class="node"><title>Node14</title>
-<polygon fill="white" stroke="#bfbfbf" points="3217,-6 3217,-25 3285,-25 3285,-6 3217,-6"/>
-<text text-anchor="middle" x="3251" y="-13" font-family="Helvetica,sans-Serif" font-size="10.00">type_traits</text>
+<polygon fill="white" stroke="#bfbfbf" points="2460,-6 2460,-25 2528,-25 2528,-6 2460,-6"/>
+<text text-anchor="middle" x="2494" y="-13" font-family="Helvetica,sans-Serif" font-size="10.00">type_traits</text>
 </g>
 <!-- Node2&#45;&gt;Node14 -->
-<g id="edge122" class="edge"><title>Node2&#45;&gt;Node14</title>
-<path fill="none" stroke="midnightblue" d="M1061.88,-713.863C1303,-712.741 2568.18,-702.605 2944,-613 3152.07,-563.391 3268.31,-617.348 3395,-445 3446.93,-374.35 3542.62,-271.824 3396,-67 3373.22,-35.1774 3328.88,-23.2457 3295.2,-18.8566"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="3295.45,-15.3632 3285.13,-17.7229 3294.67,-22.3193 3295.45,-15.3632"/>
+<g id="edge123" class="edge"><title>Node2&#45;&gt;Node14</title>
+<path fill="none" stroke="midnightblue" d="M3051.13,-761.464C3130.81,-731.356 3389,-627.646 3389,-548 3389,-548 3389,-548 3389,-490 3389,-400.745 3442.21,-376.318 3416,-291 3395.14,-223.105 3364.57,-212.612 3302,-179 3038.09,-37.2351 2665.42,-18.6619 2538.4,-16.5996"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2538.32,-13.0983 2528.28,-16.4624 2538.23,-20.0977 2538.32,-13.0983"/>
 </g>
 <!-- Node15 -->
 <g id="node15" class="node"><title>Node15</title>
-<polygon fill="white" stroke="#bfbfbf" points="955,-6 955,-25 997,-25 997,-6 955,-6"/>
-<text text-anchor="middle" x="976" y="-13" font-family="Helvetica,sans-Serif" font-size="10.00">utility</text>
+<polygon fill="white" stroke="#bfbfbf" points="3343,-6 3343,-25 3385,-25 3385,-6 3343,-6"/>
+<text text-anchor="middle" x="3364" y="-13" font-family="Helvetica,sans-Serif" font-size="10.00">utility</text>
 </g>
 <!-- Node2&#45;&gt;Node15 -->
-<g id="edge124" class="edge"><title>Node2&#45;&gt;Node15</title>
-<path fill="none" stroke="midnightblue" d="M982.473,-705.735C857.341,-678.141 462.398,-579.962 195,-389 111.138,-329.11 76,-293.052 76,-190 76,-190 76,-190 76,-132 76,-66.2072 145.421,-83.9246 209,-67 351.378,-29.0992 818.788,-19.0094 944.584,-16.9518"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="944.753,-20.4497 954.696,-16.7919 944.642,-13.4505 944.753,-20.4497"/>
+<g id="edge125" class="edge"><title>Node2&#45;&gt;Node15</title>
+<path fill="none" stroke="midnightblue" d="M3066.13,-761.409C3200.65,-728.628 3645,-598.336 3645,-307.5 3645,-307.5 3645,-307.5 3645,-132 3645,-24.9194 3468.28,-15.0375 3395.13,-15.534"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3395.07,-12.0343 3385.12,-15.6722 3395.17,-19.0336 3395.07,-12.0343"/>
 </g>
 <!-- Node18 -->
 <g id="node18" class="node"><title>Node18</title>
-<polygon fill="white" stroke="#bfbfbf" points="903.5,-123.5 903.5,-142.5 996.5,-142.5 996.5,-123.5 903.5,-123.5"/>
-<text text-anchor="middle" x="950" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00">unordered_map</text>
+<polygon fill="white" stroke="#bfbfbf" points="2826.5,-123.5 2826.5,-142.5 2919.5,-142.5 2919.5,-123.5 2826.5,-123.5"/>
+<text text-anchor="middle" x="2873" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00">unordered_map</text>
 </g>
 <!-- Node2&#45;&gt;Node18 -->
-<g id="edge123" class="edge"><title>Node2&#45;&gt;Node18</title>
-<path fill="none" stroke="midnightblue" d="M1005.41,-705.461C991.2,-697.512 970.89,-684.566 957,-669 938.202,-647.932 933.873,-640.386 927,-613 924.836,-604.379 926.354,-601.865 927,-593 932.125,-522.664 950,-506.523 950,-436 950,-436 950,-436 950,-372.5 950,-295.138 939.114,-276.299 936,-199 935.642,-190.118 934.665,-187.788 936,-179 937.368,-170.001 940.19,-160.325 942.945,-152.26"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="946.265,-153.372 946.4,-142.778 939.688,-150.976 946.265,-153.372"/>
+<g id="edge124" class="edge"><title>Node2&#45;&gt;Node18</title>
+<path fill="none" stroke="midnightblue" d="M3002.2,-761.364C2975.95,-750.946 2940.61,-731.476 2952,-705 2961.19,-683.629 2980.81,-690.371 2990,-669 2993.51,-660.834 2990.32,-657.883 2990,-649 2988.33,-602.91 2975.13,-278.568 2960,-235 2947.33,-198.523 2914.22,-166.69 2892.69,-148.943"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2894.81,-146.151 2884.81,-142.643 2890.44,-151.62 2894.81,-146.151"/>
 </g>
 <!-- Node19 -->
 <g id="node19" class="node"><title>Node19</title>
-<polygon fill="white" stroke="#bfbfbf" points="736.5,-123.5 736.5,-142.5 783.5,-142.5 783.5,-123.5 736.5,-123.5"/>
-<text text-anchor="middle" x="760" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00">vector</text>
+<polygon fill="white" stroke="#bfbfbf" points="3093.5,-123.5 3093.5,-142.5 3140.5,-142.5 3140.5,-123.5 3093.5,-123.5"/>
+<text text-anchor="middle" x="3117" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00">vector</text>
 </g>
 <!-- Node2&#45;&gt;Node19 -->
-<g id="edge125" class="edge"><title>Node2&#45;&gt;Node19</title>
-<path fill="none" stroke="midnightblue" d="M994.685,-705.414C976.307,-698.343 952.816,-686.587 938,-669 799.113,-504.137 767.456,-227.017 761.32,-152.771"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="764.798,-152.348 760.542,-142.645 757.819,-152.884 764.798,-152.348"/>
+<g id="edge126" class="edge"><title>Node2&#45;&gt;Node19</title>
+<path fill="none" stroke="midnightblue" d="M3042.84,-761.267C3054.86,-753.367 3071.2,-740.599 3080,-725 3169.27,-566.719 3132.88,-503.001 3149,-322 3154.64,-258.694 3169.77,-239.403 3150,-179 3146.51,-168.34 3139.43,-158.117 3132.71,-150.117"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3135.3,-147.763 3126.02,-142.66 3130.1,-152.439 3135.3,-147.763"/>
 </g>
 <!-- Node20 -->
 <g id="node20" class="node"><title>Node20</title>
 <g id="a_node20"><a xlink:href="packed__func_8h.html" target="_top" xlink:title="Type&#45;erased function used across TVM API. ">
-<polygon fill="white" stroke="black" points="1412.5,-235.5 1412.5,-265.5 1525.5,-265.5 1525.5,-235.5 1412.5,-235.5"/>
-<text text-anchor="start" x="1420.5" y="-253.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/runtime/packed</text>
-<text text-anchor="middle" x="1469" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00">_func.h</text>
+<polygon fill="white" stroke="black" points="2244.5,-291.5 2244.5,-321.5 2357.5,-321.5 2357.5,-291.5 2244.5,-291.5"/>
+<text text-anchor="start" x="2252.5" y="-309.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/runtime/packed</text>
+<text text-anchor="middle" x="2301" y="-298.5" font-family="Helvetica,sans-Serif" font-size="10.00">_func.h</text>
 </a>
 </g>
 </g>
 <!-- Node2&#45;&gt;Node20 -->
-<g id="edge119" class="edge"><title>Node2&#45;&gt;Node20</title>
-<path fill="none" stroke="midnightblue" d="M1045.84,-705.456C1065.02,-697.829 1091.97,-685.342 1112,-669 1270.15,-539.999 1237.79,-440.169 1388,-302 1401.13,-289.922 1417.54,-279.072 1432.15,-270.552"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1433.99,-273.532 1440.97,-265.559 1430.55,-267.44 1433.99,-273.532"/>
+<g id="edge120" class="edge"><title>Node2&#45;&gt;Node20</title>
+<path fill="none" stroke="midnightblue" d="M3003.62,-761.379C2981.15,-753.098 2947.12,-739.702 2919,-725 2878.25,-703.695 2869.25,-695.96 2832,-669 2685.8,-563.193 2668.27,-510.87 2516,-414 2459.35,-377.963 2388.81,-345.089 2344.33,-325.679"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2345.54,-322.389 2334.97,-321.628 2342.76,-328.813 2345.54,-322.389"/>
 </g>
 <!-- Node24 -->
 <g id="node24" class="node"><title>Node24</title>
-<polygon fill="white" stroke="#bfbfbf" points="1054.5,-179.5 1054.5,-198.5 1117.5,-198.5 1117.5,-179.5 1054.5,-179.5"/>
-<text text-anchor="middle" x="1086" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00">functional</text>
+<polygon fill="white" stroke="#bfbfbf" points="2778.5,-235.5 2778.5,-254.5 2841.5,-254.5 2841.5,-235.5 2778.5,-235.5"/>
+<text text-anchor="middle" x="2810" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00">functional</text>
 </g>
 <!-- Node2&#45;&gt;Node24 -->
-<g id="edge120" class="edge"><title>Node2&#45;&gt;Node24</title>
-<path fill="none" stroke="midnightblue" d="M1004.54,-705.34C982.461,-693.227 949.142,-670.314 965,-649 998.238,-604.327 1046.92,-653.628 1085,-613 1159.39,-533.628 1140,-483.282 1140,-374.5 1140,-374.5 1140,-374.5 1140,-311 1140,-270.766 1114.79,-229.028 1098.82,-206.622"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1101.59,-204.482 1092.84,-198.516 1095.96,-208.64 1101.59,-204.482"/>
+<g id="edge121" class="edge"><title>Node2&#45;&gt;Node24</title>
+<path fill="none" stroke="midnightblue" d="M3038.44,-761.245C3047.42,-752.969 3059.75,-739.663 3065,-725 3068,-716.632 3067.23,-713.605 3065,-705 3058.57,-680.194 2866.66,-344.775 2820.1,-263.598"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2822.96,-261.553 2814.95,-254.621 2816.89,-265.037 2822.96,-261.553"/>
 </g>
 <!-- Node28 -->
 <g id="node28" class="node"><title>Node28</title>
 <g id="a_node28"><a xlink:href="structural__equal_8h.html" target="_top" xlink:title="Structural equality comparison. ">
-<polygon fill="white" stroke="black" points="2029.5,-358.5 2029.5,-388.5 2140.5,-388.5 2140.5,-358.5 2029.5,-358.5"/>
-<text text-anchor="start" x="2037.5" y="-376.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/node/structural</text>
-<text text-anchor="middle" x="2085" y="-365.5" font-family="Helvetica,sans-Serif" font-size="10.00">_equal.h</text>
+<polygon fill="white" stroke="black" points="1446.5,-414.5 1446.5,-444.5 1557.5,-444.5 1557.5,-414.5 1446.5,-414.5"/>
+<text text-anchor="start" x="1454.5" y="-432.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/node/structural</text>
+<text text-anchor="middle" x="1502" y="-421.5" font-family="Helvetica,sans-Serif" font-size="10.00">_equal.h</text>
 </a>
 </g>
 </g>
 <!-- Node2&#45;&gt;Node28 -->
-<g id="edge117" class="edge"><title>Node2&#45;&gt;Node28</title>
-<path fill="none" stroke="midnightblue" d="M1061.67,-714.214C1221.66,-714.032 1811,-702.851 1811,-548 1811,-548 1811,-548 1811,-490 1811,-400.023 1938.01,-379.027 2019.08,-374.79"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2019.39,-378.28 2029.22,-374.334 2019.07,-371.287 2019.39,-378.28"/>
+<g id="edge118" class="edge"><title>Node2&#45;&gt;Node28</title>
+<path fill="none" stroke="midnightblue" d="M2988.25,-765.677C2941.01,-759.787 2860.25,-747.327 2794,-725 2634.65,-671.3 2602.9,-637.013 2455,-557 2439.58,-548.658 2437.8,-542 2421,-537 2092.2,-439.173 1982.75,-584.408 1650,-501 1607.4,-490.322 1562.39,-466.904 1533.35,-449.983"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1534.89,-446.829 1524.5,-444.742 1531.32,-452.851 1534.89,-446.829"/>
 </g>
 <!-- Node30 -->
 <g id="node30" class="node"><title>Node30</title>
 <g id="a_node30"><a xlink:href="structural__hash_8h.html" target="_top" xlink:title="tvm/node/structural\l_hash.h">
-<polygon fill="white" stroke="black" points="1633.5,-358.5 1633.5,-388.5 1744.5,-388.5 1744.5,-358.5 1633.5,-358.5"/>
-<text text-anchor="start" x="1641.5" y="-376.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/node/structural</text>
-<text text-anchor="middle" x="1689" y="-365.5" font-family="Helvetica,sans-Serif" font-size="10.00">_hash.h</text>
+<polygon fill="white" stroke="black" points="1804.5,-414.5 1804.5,-444.5 1915.5,-444.5 1915.5,-414.5 1804.5,-414.5"/>
+<text text-anchor="start" x="1812.5" y="-432.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/node/structural</text>
+<text text-anchor="middle" x="1860" y="-421.5" font-family="Helvetica,sans-Serif" font-size="10.00">_hash.h</text>
 </a>
 </g>
 </g>
 <!-- Node2&#45;&gt;Node30 -->
-<g id="edge118" class="edge"><title>Node2&#45;&gt;Node30</title>
-<path fill="none" stroke="midnightblue" d="M1061.7,-713.241C1115.39,-710.915 1213.15,-702.033 1289,-669 1452.33,-597.872 1460.3,-526.063 1607,-425 1622.62,-414.239 1640.5,-403.098 1655.57,-394.026"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1657.75,-396.8 1664.54,-388.668 1654.16,-390.79 1657.75,-396.8"/>
+<g id="edge119" class="edge"><title>Node2&#45;&gt;Node30</title>
+<path fill="none" stroke="midnightblue" d="M2988.46,-764.458C2949.02,-758.067 2887.24,-745.69 2837,-725 2777.23,-700.388 2770.34,-678.832 2713,-649 2602.1,-591.308 2574.67,-573.145 2455,-537 2267.23,-480.284 2036.46,-449.614 1925.58,-437.191"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1925.88,-433.702 1915.56,-436.08 1925.11,-440.659 1925.88,-433.702"/>
 </g>
 <!-- Node5 -->
 <g id="node5" class="node"><title>Node5</title>
 <g id="a_node5"><a xlink:href="span_8h.html" target="_top" xlink:title="Span information for debugging purposes. ">
-<polygon fill="white" stroke="black" points="2379,-537.5 2379,-556.5 2459,-556.5 2459,-537.5 2379,-537.5"/>
-<text text-anchor="middle" x="2419" y="-544.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/span.h</text>
+<polygon fill="white" stroke="black" points="922,-593.5 922,-612.5 1002,-612.5 1002,-593.5 922,-593.5"/>
+<text text-anchor="middle" x="962" y="-600.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/span.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node5 -->
 <g id="edge4" class="edge"><title>Node4&#45;&gt;Node5</title>
-<path fill="none" stroke="midnightblue" d="M2398.73,-649.368C2402.28,-631.617 2410.47,-590.666 2415.28,-566.597"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2418.75,-567.084 2417.28,-556.591 2411.89,-565.711 2418.75,-567.084"/>
+<path fill="none" stroke="midnightblue" d="M1191.46,-705.368C1147.73,-686.051 1041.81,-639.259 990.631,-616.648"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="992.009,-613.431 981.448,-612.591 989.181,-619.834 992.009,-613.431"/>
 </g>
 <!-- Node6 -->
 <g id="node6" class="node"><title>Node6</title>
 <g id="a_node6"><a xlink:href="node_8h.html" target="_top" xlink:title="Definitions and helper macros for IR/AST nodes. ">
-<polygon fill="white" stroke="black" points="2220,-481.5 2220,-500.5 2318,-500.5 2318,-481.5 2220,-481.5"/>
-<text text-anchor="middle" x="2269" y="-488.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/node/node.h</text>
+<polygon fill="white" stroke="black" points="1419,-537.5 1419,-556.5 1517,-556.5 1517,-537.5 1419,-537.5"/>
+<text text-anchor="middle" x="1468" y="-544.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/node/node.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node6 -->
-<g id="edge111" class="edge"><title>Node4&#45;&gt;Node6</title>
-<path fill="none" stroke="midnightblue" d="M2390.12,-649.075C2369.62,-622.493 2308.74,-543.538 2281.93,-508.763"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2284.48,-506.35 2275.61,-500.568 2278.94,-510.624 2284.48,-506.35"/>
+<g id="edge112" class="edge"><title>Node4&#45;&gt;Node6</title>
+<path fill="none" stroke="midnightblue" d="M1224.46,-705.309C1266.35,-678.248 1394.73,-595.324 1446.26,-562.041"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1448.18,-564.971 1454.68,-556.605 1444.38,-559.091 1448.18,-564.971"/>
 </g>
 <!-- Node7 -->
 <g id="node7" class="node"><title>Node7</title>
 <g id="a_node7"><a xlink:href="node_2container_8h.html" target="_top" xlink:title="Array/Map container in the DSL graph. ">
-<polygon fill="white" stroke="black" points="1827.5,-302.5 1827.5,-321.5 1946.5,-321.5 1946.5,-302.5 1827.5,-302.5"/>
-<text text-anchor="middle" x="1887" y="-309.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/node/container.h</text>
+<polygon fill="white" stroke="black" points="2074.5,-358.5 2074.5,-377.5 2193.5,-377.5 2193.5,-358.5 2074.5,-358.5"/>
+<text text-anchor="middle" x="2134" y="-365.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/node/container.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node7 -->
-<g id="edge110" class="edge"><title>Node4&#45;&gt;Node7</title>
-<path fill="none" stroke="midnightblue" d="M2382.03,-649.384C2326.67,-617.33 2128.92,-501.041 1977,-389 1950.55,-369.493 1921.84,-344.528 1904.13,-328.644"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1906.15,-325.748 1896.38,-321.64 1901.46,-330.942 1906.15,-325.748"/>
+<g id="edge111" class="edge"><title>Node4&#45;&gt;Node7</title>
+<path fill="none" stroke="midnightblue" d="M1249.57,-708.944C1369.4,-692.201 1743.99,-632.022 2025,-501 2064.01,-482.813 2077.44,-478.866 2104,-445 2117.37,-427.95 2125.47,-404.189 2129.82,-387.795"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2133.29,-388.341 2132.26,-377.796 2126.49,-386.684 2133.29,-388.341"/>
 </g>
 <!-- Node11 -->
 <g id="node11" class="node"><title>Node11</title>
 <g id="a_node11"><a xlink:href="object_8h.html" target="_top" xlink:title="A managed object in the TVM runtime. ">
-<polygon fill="white" stroke="red" points="2315.5,-67.5 2315.5,-86.5 2432.5,-86.5 2432.5,-67.5 2315.5,-67.5"/>
-<text text-anchor="middle" x="2374" y="-74.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/runtime/object.h</text>
+<polygon fill="white" stroke="red" points="1647.5,-67.5 1647.5,-86.5 1764.5,-86.5 1764.5,-67.5 1647.5,-67.5"/>
+<text text-anchor="middle" x="1706" y="-74.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/runtime/object.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node11 -->
-<g id="edge112" class="edge"><title>Node4&#45;&gt;Node11</title>
-<path fill="none" stroke="midnightblue" d="M2435.59,-656.97C2534.17,-653.925 2793.13,-643.204 2873,-613 3017,-558.548 3178.64,-345.782 3209,-266 3213.62,-253.87 3214.76,-188.295 3208,-179 3162.94,-117.053 3120.09,-138.139 3045,-123 2931.34,-100.086 2590.17,-85.641 2442.91,-80.316"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2442.77,-76.8087 2432.65,-79.9484 2442.52,-83.8042 2442.77,-76.8087"/>
+<g id="edge113" class="edge"><title>Node4&#45;&gt;Node11</title>
+<path fill="none" stroke="midnightblue" d="M1210.49,-705.463C1208.94,-673.949 1206.53,-561.797 1244,-481 1333.67,-287.669 1384.03,-238.703 1563,-123 1588.4,-106.577 1620.22,-95.7906 1647.38,-88.9081"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1648.26,-92.2964 1657.16,-86.5458 1646.62,-85.4922 1648.26,-92.2964"/>
 </g>
 <!-- Node4&#45;&gt;Node13 -->
-<g id="edge115" class="edge"><title>Node4&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M2435.55,-651.424C2557.69,-628.586 2923.63,-543.747 2847,-358 2767.87,-166.213 2679.64,-130.105 2482,-67 2366.57,-30.1437 1986.35,-19.4963 1872.49,-17.098"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1872.4,-13.5953 1862.33,-16.8911 1872.25,-20.5939 1872.4,-13.5953"/>
+<g id="edge116" class="edge"><title>Node4&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M1172.39,-711.235C1102.55,-705.733 958.361,-691.925 914,-669 859.34,-640.753 763.073,-504.473 731,-445 684.562,-358.889 662.382,-332.799 665,-235 667.001,-160.253 662.624,-140.903 674,-67 675.643,-56.3242 678.579,-44.6302 681.245,-35.186"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="684.68,-35.9107 684.157,-25.3287 677.967,-33.9271 684.68,-35.9107"/>
 </g>
 <!-- Node4&#45;&gt;Node14 -->
-<g id="edge116" class="edge"><title>Node4&#45;&gt;Node14</title>
-<path fill="none" stroke="midnightblue" d="M2435.65,-656.965C2539.13,-653.819 2820.51,-642.703 2908,-613 2951.9,-598.097 2959.11,-585.08 2996,-557 3193.3,-406.805 3269.91,-369.867 3370,-143 3383.69,-111.965 3396.01,-95.0952 3377,-67 3358.83,-40.1413 3323.79,-27.5659 3295.26,-21.6789"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="3295.56,-18.1749 3285.09,-19.7986 3294.29,-25.0583 3295.56,-18.1749"/>
+<g id="edge117" class="edge"><title>Node4&#45;&gt;Node14</title>
+<path fill="none" stroke="midnightblue" d="M1249.73,-710.128C1311.47,-703.636 1435.87,-689.247 1540,-669 1859.4,-606.893 1941.66,-581.562 2237,-445 2412.56,-363.822 2518.41,-376.175 2596,-199 2619.56,-145.205 2625.49,-113.794 2590,-67 2576.21,-48.8177 2553.97,-36.5338 2534.36,-28.6381"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2535.59,-25.3606 2525,-25.1079 2533.12,-31.9106 2535.59,-25.3606"/>
 </g>
 <!-- Node16 -->
 <g id="node16" class="node"><title>Node16</title>
-<polygon fill="white" stroke="#bfbfbf" points="2744,-123.5 2744,-142.5 2806,-142.5 2806,-123.5 2744,-123.5"/>
-<text text-anchor="middle" x="2775" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00">algorithm</text>
+<polygon fill="white" stroke="#bfbfbf" points="1578,-123.5 1578,-142.5 1640,-142.5 1640,-123.5 1578,-123.5"/>
+<text text-anchor="middle" x="1609" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00">algorithm</text>
 </g>
 <!-- Node4&#45;&gt;Node16 -->
-<g id="edge113" class="edge"><title>Node4&#45;&gt;Node16</title>
-<path fill="none" stroke="midnightblue" d="M2435.56,-656.155C2527.92,-651.393 2759.59,-637.331 2833,-613 2879.16,-597.7 2893.16,-591.927 2927,-557 2991.21,-490.717 3159.6,-248.601 3099,-179 3089.78,-168.413 2901.47,-147.33 2816.26,-138.285"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2816.46,-134.787 2806.15,-137.217 2815.73,-141.748 2816.46,-134.787"/>
+<g id="edge114" class="edge"><title>Node4&#45;&gt;Node16</title>
+<path fill="none" stroke="midnightblue" d="M1172.33,-706.769C1098.72,-692.119 943.117,-656.597 913,-613 907.948,-605.687 910.075,-601.394 913,-593 946.417,-497.107 976.299,-478.173 1055,-414 1227.05,-273.712 1480.92,-177.898 1573.54,-145.819"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1574.69,-149.125 1583.01,-142.567 1572.41,-142.504 1574.69,-149.125"/>
 </g>
 <!-- Node25 -->
 <g id="node25" class="node"><title>Node25</title>
-<polygon fill="white" stroke="#bfbfbf" points="3048,-179.5 3048,-198.5 3090,-198.5 3090,-179.5 3048,-179.5"/>
-<text text-anchor="middle" x="3069" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00">limits</text>
+<polygon fill="white" stroke="#bfbfbf" points="674,-235.5 674,-254.5 716,-254.5 716,-235.5 674,-235.5"/>
+<text text-anchor="middle" x="695" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00">limits</text>
 </g>
 <!-- Node4&#45;&gt;Node25 -->
-<g id="edge114" class="edge"><title>Node4&#45;&gt;Node25</title>
-<path fill="none" stroke="midnightblue" d="M2435.69,-657.571C2506.26,-655.899 2660.03,-648.138 2784,-613 2840.62,-596.951 2862.27,-598.505 2904,-557 2943.04,-518.164 2940.66,-497.59 2957,-445 2985.05,-354.709 2937.3,-312.819 2991,-235 3002.36,-218.543 3021.83,-207.297 3038.48,-200.164"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="3039.88,-203.377 3047.88,-196.431 3037.29,-196.872 3039.88,-203.377"/>
+<g id="edge115" class="edge"><title>Node4&#45;&gt;Node25</title>
+<path fill="none" stroke="midnightblue" d="M1172.29,-712.642C1120.92,-709.637 1028.31,-699.987 956,-669 917.435,-652.474 907.886,-644.349 880,-613 779.3,-499.794 718.372,-321.887 700.52,-264.502"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="703.791,-263.23 697.525,-254.687 697.096,-265.273 703.791,-263.23"/>
 </g>
 <!-- Node33 -->
 <g id="node33" class="node"><title>Node33</title>
 <g id="a_node33"><a xlink:href="ir_2type_8h.html" target="_top" xlink:title="IR/AST nodes for the unified type system in TVM. ">
-<polygon fill="white" stroke="black" points="2099.5,-593.5 2099.5,-612.5 2176.5,-612.5 2176.5,-593.5 2099.5,-593.5"/>
-<text text-anchor="middle" x="2138" y="-600.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/type.h</text>
+<polygon fill="white" stroke="black" points="712.5,-649.5 712.5,-668.5 789.5,-668.5 789.5,-649.5 712.5,-649.5"/>
+<text text-anchor="middle" x="751" y="-656.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/type.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node33 -->
-<g id="edge103" class="edge"><title>Node4&#45;&gt;Node33</title>
-<path fill="none" stroke="midnightblue" d="M2358.23,-649.916C2312.41,-640.364 2236.16,-624.466 2186.49,-614.109"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2187.12,-610.666 2176.62,-612.051 2185.69,-617.519 2187.12,-610.666"/>
+<g id="edge104" class="edge"><title>Node4&#45;&gt;Node33</title>
+<path fill="none" stroke="midnightblue" d="M1172.18,-709.443C1088.54,-699.625 891.518,-676.496 799.692,-665.716"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="799.897,-662.216 789.557,-664.526 799.08,-669.168 799.897,-662.216"/>
 </g>
 <!-- Node5&#45;&gt;Node6 -->
 <g id="edge5" class="edge"><title>Node5&#45;&gt;Node6</title>
-<path fill="none" stroke="midnightblue" d="M2395.23,-537.444C2370.11,-528.399 2330.29,-514.064 2302.1,-503.916"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2303.25,-500.611 2292.66,-500.516 2300.88,-507.197 2303.25,-500.611"/>
+<path fill="none" stroke="midnightblue" d="M1002.06,-595.021C1006.75,-594.294 1011.48,-593.601 1016,-593 1156.28,-574.347 1322,-559.733 1408.56,-552.661"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1409.03,-556.134 1418.72,-551.836 1408.47,-549.157 1409.03,-556.134"/>
 </g>
 <!-- Node5&#45;&gt;Node11 -->
-<g id="edge101" class="edge"><title>Node5&#45;&gt;Node11</title>
-<path fill="none" stroke="midnightblue" d="M2459.04,-538.045C2505.39,-526.462 2579.61,-500.024 2613,-445 2633.06,-411.943 2620.42,-395.948 2613,-358 2590.84,-244.636 2589.97,-197.86 2502,-123 2482.61,-106.503 2456.79,-95.9152 2433.41,-89.1872"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2434.14,-85.7575 2423.57,-86.5342 2432.31,-92.5163 2434.14,-85.7575"/>
+<g id="edge102" class="edge"><title>Node5&#45;&gt;Node11</title>
+<path fill="none" stroke="midnightblue" d="M964.92,-593.446C982.119,-544.393 1075.11,-296.917 1239,-179 1289.07,-142.972 1306.6,-139.659 1366,-123 1457.61,-97.3057 1567.33,-86.155 1636.85,-81.4028"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1637.49,-84.8689 1647.24,-80.7204 1637.03,-77.8839 1637.49,-84.8689"/>
 </g>
 <!-- Node5&#45;&gt;Node13 -->
-<g id="edge102" class="edge"><title>Node5&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M2459.12,-541.21C2493.51,-535.678 2543.33,-524.172 2581,-501 2665.14,-449.234 2861.85,-209.976 2815,-123 2793.54,-83.1717 2772.95,-81.2144 2730,-67 2566.22,-12.7952 2012.72,-14.7011 1872.5,-16.0972"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1872.36,-12.5984 1862.39,-16.2043 1872.43,-19.598 1872.36,-12.5984"/>
+<g id="edge103" class="edge"><title>Node5&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M953.642,-593.344C931.817,-570.458 872.037,-505.921 832,-445 774.47,-357.462 784.715,-321.062 725,-235 712.627,-217.167 700.145,-219.119 692,-199 669.346,-143.041 677.634,-69.3257 683.362,-35.253"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="686.826,-35.7632 685.158,-25.3006 679.937,-34.5201 686.826,-35.7632"/>
 </g>
 <!-- Node6&#45;&gt;Node7 -->
 <g id="edge6" class="edge"><title>Node6&#45;&gt;Node7</title>
-<path fill="none" stroke="midnightblue" d="M2231.28,-481.469C2199.99,-473.773 2154.68,-461.184 2117,-445 2071.26,-425.354 2061.8,-416.035 2020,-389 2000.22,-376.205 1997.16,-370.196 1977,-358 1957.2,-346.02 1933.79,-334.389 1915.81,-325.954"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1916.99,-322.639 1906.44,-321.614 1914.04,-328.992 1916.99,-322.639"/>
+<path fill="none" stroke="midnightblue" d="M1517.04,-544.877C1624.68,-541.87 1878.38,-531.658 1958,-501 1993.14,-487.469 2078.57,-416.464 2115.97,-384.555"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2118.67,-386.849 2123.98,-377.684 2114.11,-381.534 2118.67,-386.849"/>
 </g>
 <!-- Node8 -->
 <g id="node8" class="node"><title>Node8</title>
 <g id="a_node8"><a xlink:href="runtime_2container_8h.html" target="_top" xlink:title="Common POD(plain old data) container types. ">
-<polygon fill="white" stroke="red" points="1680,-179.5 1680,-198.5 1812,-198.5 1812,-179.5 1680,-179.5"/>
-<text text-anchor="middle" x="1746" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/runtime/container.h</text>
+<polygon fill="white" stroke="red" points="2410,-179.5 2410,-198.5 2542,-198.5 2542,-179.5 2410,-179.5"/>
+<text text-anchor="middle" x="2476" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/runtime/container.h</text>
 </a>
 </g>
 </g>
 <!-- Node6&#45;&gt;Node8 -->
-<g id="edge94" class="edge"><title>Node6&#45;&gt;Node8</title>
-<path fill="none" stroke="midnightblue" d="M2219.81,-490.33C2082.26,-490.082 1698.82,-481.002 1624,-389 1572.48,-325.645 1677.66,-238.727 1724.77,-204.597"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1726.98,-207.324 1733.1,-198.677 1722.92,-201.619 1726.98,-207.324"/>
+<g id="edge95" class="edge"><title>Node6&#45;&gt;Node8</title>
+<path fill="none" stroke="midnightblue" d="M1517.32,-546.825C1610.79,-547.279 1819.58,-542.966 1989,-501 2187.55,-451.819 2242.07,-434.352 2413,-322 2448.99,-298.346 2466.09,-294.604 2483,-255 2489.33,-240.175 2486.48,-221.792 2482.69,-208.363"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2486.01,-207.256 2479.61,-198.817 2479.35,-209.409 2486.01,-207.256"/>
 </g>
 <!-- Node10 -->
 <g id="node10" class="node"><title>Node10</title>
 <g id="a_node10"><a xlink:href="runtime_2memory_8h.html" target="_top" xlink:title="Runtime memory management. ">
-<polygon fill="white" stroke="red" points="2020.5,-123.5 2020.5,-142.5 2147.5,-142.5 2147.5,-123.5 2020.5,-123.5"/>
-<text text-anchor="middle" x="2084" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/runtime/memory.h</text>
+<polygon fill="white" stroke="red" points="2104.5,-123.5 2104.5,-142.5 2231.5,-142.5 2231.5,-123.5 2104.5,-123.5"/>
+<text text-anchor="middle" x="2168" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/runtime/memory.h</text>
 </a>
 </g>
 </g>
 <!-- Node6&#45;&gt;Node10 -->
-<g id="edge95" class="edge"><title>Node6&#45;&gt;Node10</title>
-<path fill="none" stroke="midnightblue" d="M2275.37,-481.208C2290.06,-459.801 2323.8,-403.403 2307,-358 2269.44,-256.465 2157.33,-178.393 2107.88,-147.921"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2109.51,-144.813 2099.14,-142.629 2105.88,-150.802 2109.51,-144.813"/>
+<g id="edge96" class="edge"><title>Node6&#45;&gt;Node10</title>
+<path fill="none" stroke="midnightblue" d="M1517.12,-544.93C1585.4,-542.262 1712.19,-533.003 1815,-501 1867,-484.812 1882,-479.671 1924,-445 1981.15,-397.823 2115.54,-208.891 2156.16,-150.969"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2159.18,-152.76 2162.05,-142.56 2153.45,-148.747 2159.18,-152.76"/>
 </g>
 <!-- Node6&#45;&gt;Node11 -->
-<g id="edge96" class="edge"><title>Node6&#45;&gt;Node11</title>
-<path fill="none" stroke="midnightblue" d="M2318.35,-490.183C2368.44,-488.263 2446.59,-479.31 2504,-445 2613.92,-379.306 2629.43,-324.677 2654,-199 2671.98,-107.045 2531.36,-84.3989 2442.78,-79.182"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2442.95,-75.6863 2432.78,-78.6535 2442.58,-82.6766 2442.95,-75.6863"/>
+<g id="edge97" class="edge"><title>Node6&#45;&gt;Node11</title>
+<path fill="none" stroke="midnightblue" d="M1418.87,-542.09C1364.64,-536.645 1282.42,-524.659 1263,-501 1172.19,-390.36 1170.58,-293.1 1257,-179 1270.87,-160.69 1433.62,-128.152 1456,-123 1517.72,-108.796 1588.86,-96.3675 1639.44,-88.1919"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1640.27,-91.6035 1649.59,-86.5636 1639.16,-84.6919 1640.27,-91.6035"/>
 </g>
 <!-- Node12 -->
 <g id="node12" class="node"><title>Node12</title>
 <g id="a_node12"><a xlink:href="c__runtime__api_8h.html" target="_top" xlink:title="tvm/runtime/c_runtime\l_api.h">
-<polygon fill="white" stroke="red" points="2710.5,-0.5 2710.5,-30.5 2837.5,-30.5 2837.5,-0.5 2710.5,-0.5"/>
-<text text-anchor="start" x="2718.5" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/runtime/c_runtime</text>
-<text text-anchor="middle" x="2774" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00">_api.h</text>
+<polygon fill="white" stroke="red" points="1446.5,-0.5 1446.5,-30.5 1573.5,-30.5 1573.5,-0.5 1446.5,-0.5"/>
+<text text-anchor="start" x="1454.5" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/runtime/c_runtime</text>
+<text text-anchor="middle" x="1510" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00">_api.h</text>
 </a>
 </g>
 </g>
 <!-- Node6&#45;&gt;Node12 -->
-<g id="edge93" class="edge"><title>Node6&#45;&gt;Node12</title>
-<path fill="none" stroke="midnightblue" d="M2318.35,-481.753C2371.59,-472.826 2458.84,-458.106 2534,-445 2568.8,-438.931 2814.62,-403.133 2847,-389 2975.56,-332.89 3036.43,-324.538 3099,-199 3122.45,-151.95 3118.71,-122.599 3047,-67 3016.55,-43.3896 2917.64,-29.5727 2847.67,-22.5258"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2847.86,-19.0279 2837.57,-21.5349 2847.18,-25.9944 2847.86,-19.0279"/>
+<g id="edge94" class="edge"><title>Node6&#45;&gt;Node12</title>
+<path fill="none" stroke="midnightblue" d="M1418.86,-545.249C1371.16,-542.51 1298.21,-532.849 1244,-501 1179.07,-462.848 1011.88,-306.99 1034,-235 1083.95,-72.4191 1315.03,-30.7716 1435.88,-20.1312"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1436.48,-23.5933 1446.16,-19.2777 1435.9,-16.6173 1436.48,-23.5933"/>
 </g>
 <!-- Node6&#45;&gt;Node13 -->
-<g id="edge97" class="edge"><title>Node6&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M2278.42,-481.23C2329.52,-433.205 2569.5,-201.397 2527,-123 2505.26,-82.9026 2484.14,-81.8021 2441,-67 2334.25,-30.3739 1981.12,-19.6226 1872.25,-17.1402"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1872.27,-13.6401 1862.2,-16.9191 1872.12,-20.6384 1872.27,-13.6401"/>
+<g id="edge98" class="edge"><title>Node6&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M1418.87,-539.873C1366.94,-532.815 1282.9,-519.732 1212,-501 1194.09,-496.269 1069.77,-456.177 1055,-445 948.418,-364.34 956.011,-310.302 882,-199 843.229,-140.694 848.935,-112.832 796,-67 773.475,-47.4974 742.146,-34.0351 718.853,-25.9449"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="719.689,-22.5344 709.095,-22.704 717.482,-29.1776 719.689,-22.5344"/>
 </g>
 <!-- Node6&#45;&gt;Node14 -->
-<g id="edge98" class="edge"><title>Node6&#45;&gt;Node14</title>
-<path fill="none" stroke="midnightblue" d="M2318.09,-487.752C2425.82,-481.615 2689.95,-459.96 2898,-389 3076.34,-328.173 3129.77,-316.587 3277,-199 3330.78,-156.046 3394.85,-125.137 3358,-67 3344.1,-45.0631 3317.83,-32.4767 3294.8,-25.3555"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="3295.74,-21.9868 3285.17,-22.6269 3293.84,-28.7215 3295.74,-21.9868"/>
+<g id="edge99" class="edge"><title>Node6&#45;&gt;Node14</title>
+<path fill="none" stroke="midnightblue" d="M1463.32,-537.386C1452.48,-516.377 1427.3,-460.793 1437,-414 1473.16,-239.48 1470.59,-128.138 1638,-67 1714.45,-39.0791 2285,-22.0078 2449.64,-17.6257"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2449.99,-21.1179 2459.89,-17.3554 2449.8,-14.1204 2449.99,-21.1179"/>
 </g>
 <!-- Node6&#45;&gt;Node15 -->
-<g id="edge99" class="edge"><title>Node6&#45;&gt;Node15</title>
-<path fill="none" stroke="midnightblue" d="M2219.61,-488.131C2096.6,-483.176 1779.06,-468.452 1676,-445 1639.31,-436.652 1071.47,-225.741 1045,-199 1021.1,-174.852 992.286,-76.4833 980.936,-35.0441"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="984.259,-33.9239 978.272,-25.1829 977.502,-35.7496 984.259,-33.9239"/>
+<g id="edge100" class="edge"><title>Node6&#45;&gt;Node15</title>
+<path fill="none" stroke="midnightblue" d="M1517.15,-546.56C1760.77,-548.514 2838.33,-547.315 3117,-378 3256.72,-293.108 3336.02,-95.6816 3357.8,-34.7444"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3361.1,-35.893 3361.1,-25.2982 3354.49,-33.5836 3361.1,-35.893"/>
 </g>
 <!-- Node6&#45;&gt;Node19 -->
-<g id="edge100" class="edge"><title>Node6&#45;&gt;Node19</title>
-<path fill="none" stroke="midnightblue" d="M2219.62,-489.929C2113.56,-489.014 1856.45,-482.714 1645,-445 1381.09,-397.928 1323.02,-351.663 1069,-266 983.359,-237.119 959.472,-235.971 877,-199 842.896,-183.712 805.462,-161.976 782.344,-147.93"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="784.004,-144.842 773.65,-142.599 780.345,-150.81 784.004,-144.842"/>
+<g id="edge101" class="edge"><title>Node6&#45;&gt;Node19</title>
+<path fill="none" stroke="midnightblue" d="M1517.09,-544.371C1616.7,-540.58 1849.54,-529.241 2043,-501 2492.91,-435.323 2669.3,-523.812 3036,-255 3072.91,-227.94 3098.13,-178.633 3109.73,-152.082"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3113.1,-153.096 3113.74,-142.521 3106.65,-150.386 3113.1,-153.096"/>
 </g>
 <!-- Node27 -->
 <g id="node27" class="node"><title>Node27</title>
 <g id="a_node27"><a xlink:href="reflection_8h.html" target="_top" xlink:title="Reflection and serialization of compiler IR/AST nodes. ">
-<polygon fill="white" stroke="black" points="2151.5,-425.5 2151.5,-444.5 2270.5,-444.5 2270.5,-425.5 2151.5,-425.5"/>
-<text text-anchor="middle" x="2211" y="-432.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/node/reflection.h</text>
+<polygon fill="white" stroke="black" points="1659.5,-481.5 1659.5,-500.5 1778.5,-500.5 1778.5,-481.5 1659.5,-481.5"/>
+<text text-anchor="middle" x="1719" y="-488.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/node/reflection.h</text>
 </a>
 </g>
 </g>
 <!-- Node6&#45;&gt;Node27 -->
-<g id="edge62" class="edge"><title>Node6&#45;&gt;Node27</title>
-<path fill="none" stroke="midnightblue" d="M2259.68,-481.324C2251.11,-473.341 2238.19,-461.313 2227.81,-451.653"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2230.19,-449.086 2220.49,-444.834 2225.42,-454.21 2230.19,-449.086"/>
+<g id="edge63" class="edge"><title>Node6&#45;&gt;Node27</title>
+<path fill="none" stroke="midnightblue" d="M1507.77,-537.444C1551.56,-528.023 1622.01,-512.866 1669.38,-502.675"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1670.37,-506.041 1679.41,-500.516 1668.9,-499.198 1670.37,-506.041"/>
 </g>
 <!-- Node6&#45;&gt;Node28 -->
-<g id="edge91" class="edge"><title>Node6&#45;&gt;Node28</title>
-<path fill="none" stroke="midnightblue" d="M2227.69,-481.491C2201.44,-474.669 2167.68,-463.17 2142,-445 2124.06,-432.31 2108.74,-412.495 2098.51,-397.1"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2101.43,-395.173 2093.1,-388.634 2095.54,-398.944 2101.43,-395.173"/>
+<g id="edge92" class="edge"><title>Node6&#45;&gt;Node28</title>
+<path fill="none" stroke="midnightblue" d="M1470.56,-537.319C1475.69,-519.889 1487.4,-480.104 1494.99,-454.313"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1498.4,-455.127 1497.87,-444.545 1491.68,-453.15 1498.4,-455.127"/>
 </g>
 <!-- Node6&#45;&gt;Node30 -->
-<g id="edge92" class="edge"><title>Node6&#45;&gt;Node30</title>
-<path fill="none" stroke="midnightblue" d="M2219.9,-483.29C2163.48,-475.32 2068.22,-461.065 1987,-445 1905.61,-428.901 1812.71,-406.23 1752.57,-390.983"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1753.4,-387.582 1742.84,-388.508 1751.67,-394.366 1753.4,-387.582"/>
+<g id="edge93" class="edge"><title>Node6&#45;&gt;Node30</title>
+<path fill="none" stroke="midnightblue" d="M1517.27,-542.772C1594.62,-537.087 1740.51,-523.692 1787,-501 1809.92,-489.813 1830.26,-468.733 1843.61,-452.527"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1846.45,-454.581 1849.93,-444.575 1840.97,-450.226 1846.45,-454.581"/>
 </g>
 <!-- Node31 -->
 <g id="node31" class="node"><title>Node31</title>
 <g id="a_node31"><a xlink:href="repr__printer_8h.html" target="_top" xlink:title="Printer class to print repr string of each AST/IR nodes. ">
-<polygon fill="white" stroke="black" points="2365,-425.5 2365,-444.5 2495,-444.5 2495,-425.5 2365,-425.5"/>
-<text text-anchor="middle" x="2430" y="-432.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/node/repr_printer.h</text>
+<polygon fill="white" stroke="black" points="1272,-481.5 1272,-500.5 1402,-500.5 1402,-481.5 1272,-481.5"/>
+<text text-anchor="middle" x="1337" y="-488.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/node/repr_printer.h</text>
 </a>
 </g>
 </g>
 <!-- Node6&#45;&gt;Node31 -->
-<g id="edge88" class="edge"><title>Node6&#45;&gt;Node31</title>
-<path fill="none" stroke="midnightblue" d="M2294.51,-481.444C2321.6,-472.359 2364.59,-457.939 2394.87,-447.783"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2396.24,-451.015 2404.61,-444.516 2394.01,-444.378 2396.24,-451.015"/>
+<g id="edge89" class="edge"><title>Node6&#45;&gt;Node31</title>
+<path fill="none" stroke="midnightblue" d="M1447.24,-537.444C1425.68,-528.557 1391.74,-514.563 1367.21,-504.451"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1368.24,-501.092 1357.66,-500.516 1365.57,-507.563 1368.24,-501.092"/>
 </g>
 <!-- Node7&#45;&gt;Node8 -->
 <g id="edge7" class="edge"><title>Node7&#45;&gt;Node8</title>
-<path fill="none" stroke="midnightblue" d="M1876.86,-302.298C1853.18,-281.978 1793.99,-231.186 1764.01,-205.457"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1765.92,-202.479 1756.05,-198.623 1761.36,-207.792 1765.92,-202.479"/>
+<path fill="none" stroke="midnightblue" d="M2193.6,-362.17C2241.65,-356.709 2310.16,-345.323 2366,-322 2414.69,-301.666 2435.23,-299.225 2464,-255 2472.9,-241.324 2475.61,-222.788 2476.27,-209.009"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2479.77,-208.818 2476.43,-198.765 2472.77,-208.711 2479.77,-208.818"/>
 </g>
 <!-- Node7&#45;&gt;Node10 -->
 <g id="edge25" class="edge"><title>Node7&#45;&gt;Node10</title>
-<path fill="none" stroke="midnightblue" d="M1919.69,-302.402C1942.07,-295.325 1971.47,-283.568 1993,-266 2032.59,-233.702 2062.19,-179.86 2075.77,-151.972"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2079.07,-153.19 2080.18,-142.653 2072.74,-150.196 2079.07,-153.19"/>
+<path fill="none" stroke="midnightblue" d="M2132.62,-358.261C2129.52,-336.973 2122.87,-280.817 2131,-235 2136.36,-204.763 2150.1,-171.697 2159.25,-151.912"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2162.53,-153.161 2163.65,-142.626 2156.2,-150.159 2162.53,-153.161"/>
 </g>
 <!-- Node7&#45;&gt;Node11 -->
 <g id="edge26" class="edge"><title>Node7&#45;&gt;Node11</title>
-<path fill="none" stroke="midnightblue" d="M1946.65,-304.84C2059.74,-291.905 2298.97,-258.12 2354,-199 2379.91,-171.17 2379.4,-123.289 2376.73,-96.8724"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2380.17,-96.1878 2375.48,-86.6894 2373.22,-97.0417 2380.17,-96.1878"/>
+<path fill="none" stroke="midnightblue" d="M2090.68,-358.458C2014.47,-341.298 1855.74,-295.708 1763,-199 1734.44,-169.216 1718.06,-122.478 1710.72,-96.6743"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1714.02,-95.4649 1708.04,-86.717 1707.26,-97.2828 1714.02,-95.4649"/>
 </g>
 <!-- Node7&#45;&gt;Node13 -->
-<g id="edge60" class="edge"><title>Node7&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M1901.79,-302.448C1933.62,-282.833 2004.17,-232.18 1984,-179 1959.09,-113.323 1894.14,-57.1575 1860.65,-31.4566"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1862.57,-28.5223 1852.47,-25.3114 1858.36,-34.1178 1862.57,-28.5223"/>
+<g id="edge61" class="edge"><title>Node7&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M2074.49,-363.396C2005.28,-358.248 1888.01,-346.55 1790,-322 1615.36,-278.255 1580.99,-238.303 1411,-179 1257.19,-125.341 1220.9,-102.9 1062,-67 936.963,-38.7505 784.659,-24.2255 719.308,-18.9158"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="719.467,-15.4175 709.221,-18.1139 718.913,-22.3955 719.467,-15.4175"/>
 </g>
 <!-- Node7&#45;&gt;Node15 -->
-<g id="edge61" class="edge"><title>Node7&#45;&gt;Node15</title>
-<path fill="none" stroke="midnightblue" d="M1827.22,-308.436C1715.84,-303.165 1480.84,-289.586 1403,-266 1229.07,-213.295 1050.83,-77.1672 994.327,-31.6085"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="996.289,-28.6935 986.322,-25.1009 991.874,-34.125 996.289,-28.6935"/>
+<g id="edge62" class="edge"><title>Node7&#45;&gt;Node15</title>
+<path fill="none" stroke="midnightblue" d="M2193.95,-365.801C2311.49,-362.795 2567.92,-352.815 2650,-322 2711.34,-298.973 2711.24,-265.927 2769,-235 2972.66,-125.946 3242.44,-48.8113 3332.92,-24.5843"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3334.05,-27.9072 3342.81,-21.9567 3332.25,-21.1419 3334.05,-27.9072"/>
 </g>
 <!-- Node7&#45;&gt;Node16 -->
-<g id="edge59" class="edge"><title>Node7&#45;&gt;Node16</title>
-<path fill="none" stroke="midnightblue" d="M1946.71,-304.778C2013.76,-297.369 2125.66,-283.736 2221,-266 2233.91,-263.598 2607.2,-174.211 2733.61,-143.92"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2734.77,-147.243 2743.67,-141.508 2733.13,-140.435 2734.77,-147.243"/>
+<g id="edge60" class="edge"><title>Node7&#45;&gt;Node16</title>
+<path fill="none" stroke="midnightblue" d="M2074.4,-360.944C1990.04,-350.242 1833.26,-322.368 1718,-255 1685.37,-235.928 1679.17,-227.196 1654,-199 1640.76,-184.169 1628.1,-165.313 1619.6,-151.734"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1622.4,-149.611 1614.19,-142.912 1616.43,-153.268 1622.4,-149.611"/>
 </g>
 <!-- Node7&#45;&gt;Node20 -->
 <g id="edge27" class="edge"><title>Node7&#45;&gt;Node20</title>
-<path fill="none" stroke="midnightblue" d="M1827.11,-302.475C1749.93,-291.489 1616.16,-272.447 1535.77,-261.005"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1536.16,-257.524 1525.76,-259.58 1535.17,-264.454 1536.16,-257.524"/>
+<path fill="none" stroke="midnightblue" d="M2157.93,-358.475C2182.34,-349.776 2220.93,-336.026 2251.89,-324.998"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2253.4,-328.176 2261.64,-321.523 2251.05,-321.582 2253.4,-328.176"/>
 </g>
 <!-- Node9 -->
 <g id="node9" class="node"><title>Node9</title>
-<polygon fill="white" stroke="#bfbfbf" points="1726,-123.5 1726,-142.5 1812,-142.5 1812,-123.5 1726,-123.5"/>
-<text text-anchor="middle" x="1769" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00">dmlc/logging.h</text>
+<polygon fill="white" stroke="#bfbfbf" points="1924,-123.5 1924,-142.5 2010,-142.5 2010,-123.5 1924,-123.5"/>
+<text text-anchor="middle" x="1967" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00">dmlc/logging.h</text>
 </g>
 <!-- Node8&#45;&gt;Node9 -->
 <g id="edge8" class="edge"><title>Node8&#45;&gt;Node9</title>
-<path fill="none" stroke="midnightblue" d="M1749.8,-179.083C1752.91,-171.77 1757.42,-161.181 1761.29,-152.103"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1764.57,-153.323 1765.27,-142.751 1758.13,-150.58 1764.57,-153.323"/>
+<path fill="none" stroke="midnightblue" d="M2409.98,-180.996C2307.96,-170.172 2114,-149.595 2020.3,-139.654"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2020.51,-136.157 2010.19,-138.582 2019.77,-143.118 2020.51,-136.157"/>
 </g>
 <!-- Node8&#45;&gt;Node10 -->
 <g id="edge9" class="edge"><title>Node8&#45;&gt;Node10</title>
-<path fill="none" stroke="midnightblue" d="M1799.56,-179.444C1859.63,-169.846 1956.99,-154.292 2020.76,-144.104"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2021.37,-147.55 2030.69,-142.516 2020.27,-140.638 2021.37,-147.55"/>
+<path fill="none" stroke="midnightblue" d="M2427.2,-179.444C2372.79,-169.905 2284.83,-154.483 2226.71,-144.293"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2227.03,-140.796 2216.58,-142.516 2225.82,-147.691 2227.03,-140.796"/>
 </g>
 <!-- Node8&#45;&gt;Node11 -->
 <g id="edge17" class="edge"><title>Node8&#45;&gt;Node11</title>
-<path fill="none" stroke="midnightblue" d="M1775.96,-179.433C1824.8,-165.74 1924.81,-138.856 2011,-123 2112.7,-104.29 2231.9,-91.1747 2305.18,-84.1072"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2305.59,-87.5844 2315.21,-83.1501 2304.92,-80.616 2305.59,-87.5844"/>
+<path fill="none" stroke="midnightblue" d="M2477.99,-179.324C2480.66,-165.227 2483.42,-137.472 2468,-123 2442.93,-99.4696 1957.53,-84.5121 1774.79,-79.6954"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1774.84,-76.1957 1764.75,-79.4331 1774.66,-83.1934 1774.84,-76.1957"/>
 </g>
 <!-- Node8&#45;&gt;Node13 -->
 <g id="edge20" class="edge"><title>Node8&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M1707.24,-179.462C1672.41,-169.774 1629.13,-151.511 1646,-123 1681.13,-63.642 1762.39,-35.1625 1807.8,-23.4434"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1808.85,-26.7902 1817.73,-21.0033 1807.18,-19.9927 1808.85,-26.7902"/>
+<path fill="none" stroke="midnightblue" d="M2481.65,-179.209C2489.7,-165.448 2502.04,-138.767 2488,-123 2344.26,38.3689 2220.47,-83.5938 2005,-67 1493.26,-27.5892 867.265,-18.478 719.381,-16.8206"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="719.223,-13.3187 709.186,-16.7098 719.147,-20.3183 719.223,-13.3187"/>
 </g>
 <!-- Node8&#45;&gt;Node14 -->
 <g id="edge22" class="edge"><title>Node8&#45;&gt;Node14</title>
-<path fill="none" stroke="midnightblue" d="M1736.81,-179.49C1723.46,-166.114 1701.85,-139.95 1717,-123 1754.19,-81.407 2160.39,-71.4728 2216,-67 2413.16,-51.1427 3035.26,-25.267 3206.81,-18.2838"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="3207.14,-21.7736 3216.99,-17.8703 3206.85,-14.7793 3207.14,-21.7736"/>
+<path fill="none" stroke="midnightblue" d="M2497.48,-179.36C2512.39,-172.09 2531.11,-160.117 2540,-143 2559.56,-105.34 2528.22,-57.458 2508.36,-32.8043"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2511.01,-30.5262 2501.91,-25.1076 2505.65,-35.0211 2511.01,-30.5262"/>
 </g>
 <!-- Node8&#45;&gt;Node15 -->
 <g id="edge23" class="edge"><title>Node8&#45;&gt;Node15</title>
-<path fill="none" stroke="midnightblue" d="M1711.09,-179.4C1655.91,-166.019 1545.04,-139.988 1450,-123 1335.16,-102.474 1303.64,-113.358 1190,-87 1123.29,-71.5271 1047.74,-44.2472 1006.64,-28.5445"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1007.79,-25.2368 997.201,-24.9096 1005.28,-31.7693 1007.79,-25.2368"/>
+<path fill="none" stroke="midnightblue" d="M2492.81,-179.385C2538.3,-156.518 2668.23,-94.1802 2784,-67 2989.21,-18.8222 3245,-15.626 3332.76,-16.0962"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3332.82,-19.5967 3342.85,-16.1712 3332.88,-12.5969 3332.82,-19.5967"/>
 </g>
 <!-- Node8&#45;&gt;Node16 -->
 <g id="edge18" class="edge"><title>Node8&#45;&gt;Node16</title>
-<path fill="none" stroke="midnightblue" d="M1812,-182.404C1827.42,-181.191 1843.78,-179.98 1859,-179 2196.3,-157.288 2604.06,-140.641 2733.49,-135.589"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2733.89,-139.077 2743.74,-135.191 2733.61,-132.082 2733.89,-139.077"/>
+<path fill="none" stroke="midnightblue" d="M2409.99,-182.521C2394.57,-181.303 2378.22,-180.06 2363,-179 2091.21,-160.068 1763.95,-142.226 1650.27,-136.174"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1650.43,-132.678 1640.26,-135.642 1650.06,-139.668 1650.43,-132.678"/>
 </g>
 <!-- Node17 -->
 <g id="node17" class="node"><title>Node17</title>
-<polygon fill="white" stroke="#bfbfbf" points="1384.5,-123.5 1384.5,-142.5 1441.5,-142.5 1441.5,-123.5 1384.5,-123.5"/>
-<text text-anchor="middle" x="1413" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00">memory</text>
+<polygon fill="white" stroke="#bfbfbf" points="2402.5,-123.5 2402.5,-142.5 2459.5,-142.5 2459.5,-123.5 2402.5,-123.5"/>
+<text text-anchor="middle" x="2431" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00">memory</text>
 </g>
 <!-- Node8&#45;&gt;Node17 -->
 <g id="edge19" class="edge"><title>Node8&#45;&gt;Node17</title>
-<path fill="none" stroke="midnightblue" d="M1693.24,-179.444C1626.11,-168.558 1511.74,-150.012 1451.98,-140.321"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1452.26,-136.821 1441.83,-138.675 1451.14,-143.731 1452.26,-136.821"/>
+<path fill="none" stroke="midnightblue" d="M2468.57,-179.083C2462.15,-171.377 2452.69,-160.033 2444.88,-150.653"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2447.38,-148.193 2438.29,-142.751 2442.01,-152.674 2447.38,-148.193"/>
 </g>
 <!-- Node8&#45;&gt;Node18 -->
 <g id="edge21" class="edge"><title>Node8&#45;&gt;Node18</title>
-<path fill="none" stroke="midnightblue" d="M1679.84,-183.512C1526.8,-173.129 1150.46,-147.599 1006.9,-137.86"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1006.78,-134.344 996.566,-137.159 1006.31,-141.328 1006.78,-134.344"/>
+<path fill="none" stroke="midnightblue" d="M2538.91,-179.444C2615.23,-169.062 2742.78,-151.712 2816.23,-141.722"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2816.79,-145.179 2826.22,-140.363 2815.84,-138.243 2816.79,-145.179"/>
 </g>
 <!-- Node8&#45;&gt;Node19 -->
 <g id="edge24" class="edge"><title>Node8&#45;&gt;Node19</title>
-<path fill="none" stroke="midnightblue" d="M1679.97,-184.835C1536.77,-177.899 1186.94,-160.573 894,-143 859.892,-140.954 820.923,-138.301 793.835,-136.407"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="793.936,-132.905 783.715,-135.696 793.445,-139.888 793.936,-132.905"/>
+<path fill="none" stroke="midnightblue" d="M2542.18,-182.424C2677.5,-171.025 2981.61,-145.406 3083.22,-136.846"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3083.56,-140.33 3093.23,-136.003 3082.97,-133.355 3083.56,-140.33"/>
 </g>
 <!-- Node10&#45;&gt;Node11 -->
 <g id="edge10" class="edge"><title>Node10&#45;&gt;Node11</title>
-<path fill="none" stroke="midnightblue" d="M2129.95,-123.444C2181.07,-113.925 2263.65,-98.5472 2318.38,-88.3563"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2319.07,-91.788 2328.26,-86.5164 2317.79,-84.9063 2319.07,-91.788"/>
+<path fill="none" stroke="midnightblue" d="M2104.25,-124.548C2017.72,-114.435 1863.73,-96.4365 1774.72,-86.0319"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1775.06,-82.5483 1764.72,-84.8636 1774.25,-89.501 1775.06,-82.5483"/>
 </g>
 <!-- Node10&#45;&gt;Node14 -->
 <g id="edge15" class="edge"><title>Node10&#45;&gt;Node14</title>
-<path fill="none" stroke="midnightblue" d="M2106.6,-123.483C2146.29,-108.96 2231.38,-79.7409 2306,-67 2395.27,-51.7572 3031.82,-25.3082 3206.66,-18.265"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="3206.94,-21.7568 3216.79,-17.858 3206.66,-14.7625 3206.94,-21.7568"/>
+<path fill="none" stroke="midnightblue" d="M2231.97,-126.442C2278.42,-120.715 2341.98,-109.321 2394,-87 2425.99,-73.2723 2458.01,-48.2249 2476.91,-31.9791"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2479.41,-34.4352 2484.62,-25.2064 2474.79,-29.1756 2479.41,-34.4352"/>
 </g>
 <!-- Node10&#45;&gt;Node15 -->
 <g id="edge16" class="edge"><title>Node10&#45;&gt;Node15</title>
-<path fill="none" stroke="midnightblue" d="M2020.45,-125.375C1812.74,-103.724 1158.62,-35.5362 1007.41,-19.7742"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1007.33,-16.2467 997.018,-18.6909 1006.6,-23.2089 1007.33,-16.2467"/>
+<path fill="none" stroke="midnightblue" d="M2231.55,-124.52C2306.67,-115.658 2435.45,-100.404 2546,-87 2618.02,-78.2676 2635.81,-74.2305 2708,-67 2947.95,-42.9657 3238.27,-24.2283 3332.7,-18.3965"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3332.96,-21.8869 3342.73,-17.7801 3332.54,-14.9001 3332.96,-21.8869"/>
 </g>
 <!-- Node11&#45;&gt;Node12 -->
 <g id="edge11" class="edge"><title>Node11&#45;&gt;Node12</title>
-<path fill="none" stroke="midnightblue" d="M2431.31,-67.4751C2502.02,-56.9577 2622.37,-39.0556 2699.99,-27.5084"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2700.81,-30.926 2710.18,-25.9927 2699.78,-24.0022 2700.81,-30.926"/>
+<path fill="none" stroke="midnightblue" d="M1677.92,-67.4751C1648.89,-58.6635 1602.79,-44.6684 1566.23,-33.5698"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1566.78,-30.0788 1556.19,-30.523 1564.75,-36.7769 1566.78,-30.0788"/>
 </g>
 <!-- Node11&#45;&gt;Node13 -->
 <g id="edge12" class="edge"><title>Node11&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M2315.33,-69.4623C2202.71,-56.914 1959.66,-29.8334 1872.19,-20.0868"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1872.4,-16.589 1862.08,-18.96 1871.63,-23.5459 1872.4,-16.589"/>
+<path fill="none" stroke="midnightblue" d="M1647.18,-72.5656C1457.19,-61.4716 863.828,-26.8251 719.084,-18.3734"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="719.26,-14.8778 709.073,-17.7889 718.852,-21.8659 719.26,-14.8778"/>
 </g>
 <!-- Node11&#45;&gt;Node14 -->
 <g id="edge13" class="edge"><title>Node11&#45;&gt;Node14</title>
-<path fill="none" stroke="midnightblue" d="M2432.56,-72.0268C2596.91,-60.8764 3059.97,-29.4602 3206.18,-19.5409"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="3206.81,-23.0062 3216.55,-18.8373 3206.34,-16.0223 3206.81,-23.0062"/>
+<path fill="none" stroke="midnightblue" d="M1764.58,-71.5764C1915.65,-60.1699 2315.91,-29.9472 2449.72,-19.8433"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2450.29,-23.3105 2460,-19.0674 2449.76,-16.3304 2450.29,-23.3105"/>
 </g>
 <!-- Node11&#45;&gt;Node15 -->
 <g id="edge14" class="edge"><title>Node11&#45;&gt;Node15</title>
-<path fill="none" stroke="midnightblue" d="M2315.17,-73.013C2279.05,-71.2064 2231.85,-68.8941 2190,-67 1719.64,-45.7138 1146.6,-23.1561 1007.51,-17.7259"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1007.3,-14.2153 997.171,-17.3228 1007.03,-21.21 1007.3,-14.2153"/>
+<path fill="none" stroke="midnightblue" d="M1764.52,-73.8998C2031.87,-64.3056 3132.66,-24.8022 3332.73,-17.6222"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3332.98,-21.1156 3342.85,-17.2591 3332.73,-14.1201 3332.98,-21.1156"/>
 </g>
 <!-- Node20&#45;&gt;Node8 -->
 <g id="edge30" class="edge"><title>Node20&#45;&gt;Node8</title>
-<path fill="none" stroke="midnightblue" d="M1525.72,-237.316C1575.97,-226.524 1648.58,-210.927 1696.51,-200.631"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1697.3,-204.04 1706.34,-198.518 1695.83,-197.196 1697.3,-204.04"/>
+<path fill="none" stroke="midnightblue" d="M2321.59,-291.266C2342.48,-276.891 2375.81,-254.136 2405,-235 2421.22,-224.367 2439.73,-212.647 2453.79,-203.834"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2455.69,-206.773 2462.31,-198.503 2451.97,-200.838 2455.69,-206.773"/>
 </g>
 <!-- Node20&#45;&gt;Node9 -->
 <g id="edge28" class="edge"><title>Node20&#45;&gt;Node9</title>
-<path fill="none" stroke="midnightblue" d="M1518.99,-235.412C1550.59,-226.066 1592.11,-212.982 1628,-199 1647.64,-191.35 1651.87,-187.842 1671,-179 1694.55,-168.115 1721.32,-155.828 1740.93,-146.846"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1742.57,-149.946 1750.2,-142.6 1739.65,-143.581 1742.57,-149.946"/>
+<path fill="none" stroke="midnightblue" d="M2244.13,-302.693C2200.55,-298.028 2140.83,-285.945 2098,-255 2065.62,-231.604 2079.17,-206.295 2050,-179 2035.29,-165.239 2015.58,-154.32 1999.06,-146.686"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2000.32,-143.418 1989.76,-142.574 1997.49,-149.819 2000.32,-143.418"/>
 </g>
 <!-- Node20&#45;&gt;Node11 -->
-<g id="edge50" class="edge"><title>Node20&#45;&gt;Node11</title>
-<path fill="none" stroke="midnightblue" d="M1506.42,-235.492C1530.65,-226.032 1562.67,-212.793 1590,-199 1648.73,-169.367 1654.23,-142.679 1717,-123 1825.13,-89.0965 2159.78,-80.7165 2305.27,-78.6609"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2305.47,-82.1587 2315.42,-78.5236 2305.37,-75.1594 2305.47,-82.1587"/>
+<g id="edge51" class="edge"><title>Node20&#45;&gt;Node11</title>
+<path fill="none" stroke="midnightblue" d="M2244.39,-297.735C2209.37,-290.839 2164.8,-278.112 2131,-255 2067.49,-211.575 2087.32,-158.382 2019,-123 2018.66,-122.824 1866.93,-101.06 1774.88,-87.868"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1775.19,-84.3761 1764.79,-86.4221 1774.19,-91.3053 1775.19,-84.3761"/>
 </g>
 <!-- Node20&#45;&gt;Node12 -->
 <g id="edge29" class="edge"><title>Node20&#45;&gt;Node12</title>
-<path fill="none" stroke="midnightblue" d="M1525.56,-248.823C1694.85,-246.395 2195.56,-236.179 2354,-199 2505.91,-163.353 2671.76,-75.0937 2740.89,-35.848"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2743.04,-38.6475 2749.99,-30.6462 2739.57,-32.5703 2743.04,-38.6475"/>
+<path fill="none" stroke="midnightblue" d="M2244.2,-298.032C2197.95,-290.703 2131.62,-277.311 2077,-255 1929.94,-194.927 1921.01,-124.71 1773,-67 1711.65,-43.0799 1637.69,-30.1704 1583.85,-23.4096"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1584.05,-19.9081 1573.7,-22.1778 1583.2,-26.8571 1584.05,-19.9081"/>
 </g>
 <!-- Node20&#45;&gt;Node13 -->
-<g id="edge54" class="edge"><title>Node20&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M1499.04,-235.321C1516.8,-226.217 1539.21,-213.419 1557,-199 1618.13,-149.449 1608.72,-109.409 1675,-67 1716.51,-40.4437 1772.79,-27.1653 1807.73,-21.0655"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1808.55,-24.4775 1817.85,-19.3961 1807.41,-17.5709 1808.55,-24.4775"/>
+<g id="edge55" class="edge"><title>Node20&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M2244.35,-295.327C2189.48,-285.382 2103.94,-269.642 2030,-255 1911.06,-231.448 1870.54,-255.006 1763,-199 1719.22,-176.197 1725.28,-146.752 1682,-123 1593.93,-74.6624 1562.22,-82.7742 1463,-67 1177.15,-21.5547 825.844,-16.8089 719.187,-16.4669"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="719.084,-12.9668 709.077,-16.4457 719.069,-19.9667 719.084,-12.9668"/>
 </g>
 <!-- Node20&#45;&gt;Node14 -->
-<g id="edge56" class="edge"><title>Node20&#45;&gt;Node14</title>
-<path fill="none" stroke="midnightblue" d="M1525.55,-249.567C1772.31,-249.269 2755.26,-241.785 3045,-143 3120.85,-117.14 3198.15,-59.4846 3232.54,-31.8485"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="3235.16,-34.2314 3240.7,-25.203 3230.74,-28.8034 3235.16,-34.2314"/>
+<g id="edge57" class="edge"><title>Node20&#45;&gt;Node14</title>
+<path fill="none" stroke="midnightblue" d="M2357.54,-299.852C2417.81,-290.416 2510.06,-265.348 2551,-199 2581.8,-149.071 2577.9,-119.138 2551,-67 2543.34,-52.1559 2529.36,-39.6641 2517.16,-30.818"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2518.94,-27.7943 2508.71,-25.0395 2514.99,-33.5728 2518.94,-27.7943"/>
 </g>
 <!-- Node20&#45;&gt;Node15 -->
-<g id="edge57" class="edge"><title>Node20&#45;&gt;Node15</title>
-<path fill="none" stroke="midnightblue" d="M1434.11,-235.491C1409.83,-225.606 1376.78,-211.881 1348,-199 1215.83,-139.837 1061.52,-60.8789 1001.6,-29.8349"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1002.98,-26.6073 992.487,-25.1074 999.751,-32.8204 1002.98,-26.6073"/>
+<g id="edge58" class="edge"><title>Node20&#45;&gt;Node15</title>
+<path fill="none" stroke="midnightblue" d="M2357.63,-295.54C2402.34,-286.938 2465.97,-273.051 2520,-255 2588.5,-232.114 2749.99,-149.955 2817,-123 2884.95,-95.6638 2900.81,-84.2266 2972,-67 3103.72,-35.1268 3265.52,-22.3757 3332.59,-18.2126"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3333.11,-21.6875 3342.88,-17.5954 3332.69,-14.7001 3333.11,-21.6875"/>
 </g>
 <!-- Node20&#45;&gt;Node17 -->
-<g id="edge53" class="edge"><title>Node20&#45;&gt;Node17</title>
-<path fill="none" stroke="midnightblue" d="M1412.43,-238.333C1390.65,-230.978 1367.93,-218.867 1355,-199 1342.35,-179.565 1365.66,-160.096 1386.39,-147.604"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1388.33,-150.524 1395.29,-142.539 1384.87,-144.439 1388.33,-150.524"/>
+<g id="edge54" class="edge"><title>Node20&#45;&gt;Node17</title>
+<path fill="none" stroke="midnightblue" d="M2311.64,-291.465C2335.36,-260.169 2392.52,-184.764 2418.17,-150.929"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2421.26,-152.643 2424.51,-142.56 2415.68,-148.415 2421.26,-152.643"/>
 </g>
 <!-- Node20&#45;&gt;Node19 -->
-<g id="edge58" class="edge"><title>Node20&#45;&gt;Node19</title>
-<path fill="none" stroke="midnightblue" d="M1412.21,-236.367C1344.39,-221.024 1227.39,-195.625 1126,-179 1004.22,-159.031 858.313,-143.62 793.667,-137.225"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="793.938,-133.735 783.644,-136.242 793.254,-140.701 793.938,-133.735"/>
+<g id="edge59" class="edge"><title>Node20&#45;&gt;Node19</title>
+<path fill="none" stroke="midnightblue" d="M2357.58,-295.652C2412.88,-285.882 2499.38,-270.211 2574,-255 2767.51,-215.556 2998.31,-161.929 3083.38,-141.939"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3084.32,-145.314 3093.25,-139.617 3082.71,-138.5 3084.32,-145.314"/>
 </g>
 <!-- Node21 -->
 <g id="node21" class="node"><title>Node21</title>
 <g id="a_node21"><a xlink:href="data__type_8h.html" target="_top" xlink:title="tvm/runtime/data_type.h">
-<polygon fill="white" stroke="red" points="2900,-123.5 2900,-142.5 3036,-142.5 3036,-123.5 2900,-123.5"/>
-<text text-anchor="middle" x="2968" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/runtime/data_type.h</text>
+<polygon fill="white" stroke="red" points="1266,-179.5 1266,-198.5 1402,-198.5 1402,-179.5 1266,-179.5"/>
+<text text-anchor="middle" x="1334" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/runtime/data_type.h</text>
 </a>
 </g>
 </g>
 <!-- Node20&#45;&gt;Node21 -->
 <g id="edge31" class="edge"><title>Node20&#45;&gt;Node21</title>
-<path fill="none" stroke="midnightblue" d="M1525.73,-247.895C1673.95,-243.402 2085,-229.01 2426,-199 2498.27,-192.64 2515.98,-187.71 2588,-179 2692.3,-166.386 2812.76,-152.183 2889.6,-143.169"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2890.16,-146.627 2899.69,-141.986 2889.35,-139.675 2890.16,-146.627"/>
+<path fill="none" stroke="midnightblue" d="M2244.17,-298.915C2161.71,-289.344 2004.09,-270.987 1870,-255 1707.18,-235.589 1516.61,-212.357 1412.24,-199.588"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1412.45,-196.088 1402.1,-198.348 1411.6,-203.037 1412.45,-196.088"/>
 </g>
 <!-- Node22 -->
 <g id="node22" class="node"><title>Node22</title>
 <g id="a_node22"><a xlink:href="runtime_2module_8h.html" target="_top" xlink:title="Runtime container of the functions generated by TVM, This is used to support dynamically link...">
-<polygon fill="white" stroke="red" points="1424,-179.5 1424,-198.5 1548,-198.5 1548,-179.5 1424,-179.5"/>
-<text text-anchor="middle" x="1486" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/runtime/module.h</text>
+<polygon fill="white" stroke="red" points="2230,-179.5 2230,-198.5 2354,-198.5 2354,-179.5 2230,-179.5"/>
+<text text-anchor="middle" x="2292" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/runtime/module.h</text>
 </a>
 </g>
 </g>
 <!-- Node20&#45;&gt;Node22 -->
 <g id="edge35" class="edge"><title>Node20&#45;&gt;Node22</title>
-<path fill="none" stroke="midnightblue" d="M1467.19,-235.399C1468.48,-227.199 1471.32,-216.781 1474.6,-207.994"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1477.88,-209.228 1478.6,-198.658 1471.44,-206.474 1477.88,-209.228"/>
+<path fill="none" stroke="midnightblue" d="M2295.28,-291.337C2291.01,-270.622 2287.82,-232.016 2288.32,-208.899"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2291.83,-208.852 2288.9,-198.669 2284.84,-208.455 2291.83,-208.852"/>
 </g>
 <!-- Node23 -->
 <g id="node23" class="node"><title>Node23</title>
 <g id="a_node23"><a xlink:href="ndarray_8h.html" target="_top" xlink:title="A device&#45;independent managed NDArray abstraction. ">
-<polygon fill="white" stroke="red" points="2222.5,-179.5 2222.5,-198.5 2345.5,-198.5 2345.5,-179.5 2222.5,-179.5"/>
-<text text-anchor="middle" x="2284" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/runtime/ndarray.h</text>
+<polygon fill="white" stroke="red" points="2140.5,-235.5 2140.5,-254.5 2263.5,-254.5 2263.5,-235.5 2140.5,-235.5"/>
+<text text-anchor="middle" x="2202" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/runtime/ndarray.h</text>
 </a>
 </g>
 </g>
 <!-- Node20&#45;&gt;Node23 -->
 <g id="edge44" class="edge"><title>Node20&#45;&gt;Node23</title>
-<path fill="none" stroke="midnightblue" d="M1525.66,-245.363C1670.22,-234.81 2052.56,-206.897 2212.29,-195.235"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2212.77,-198.71 2222.48,-194.491 2212.26,-191.729 2212.77,-198.71"/>
+<path fill="none" stroke="midnightblue" d="M2277.54,-291.399C2261.67,-281.862 2240.81,-269.325 2225.03,-259.842"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2226.78,-256.809 2216.41,-254.658 2223.17,-262.809 2226.78,-256.809"/>
 </g>
 <!-- Node20&#45;&gt;Node24 -->
-<g id="edge51" class="edge"><title>Node20&#45;&gt;Node24</title>
-<path fill="none" stroke="midnightblue" d="M1412.23,-241.778C1345.15,-232.473 1230.23,-215.961 1132,-199 1130.55,-198.749 1129.07,-198.489 1127.58,-198.221"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1128.11,-194.76 1117.64,-196.382 1126.84,-201.644 1128.11,-194.76"/>
+<g id="edge52" class="edge"><title>Node20&#45;&gt;Node24</title>
+<path fill="none" stroke="midnightblue" d="M2357.91,-298.848C2461.22,-286.771 2677.32,-261.51 2768.26,-250.88"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2768.93,-254.325 2778.46,-249.687 2768.12,-247.372 2768.93,-254.325"/>
 </g>
 <!-- Node20&#45;&gt;Node25 -->
-<g id="edge52" class="edge"><title>Node20&#45;&gt;Node25</title>
-<path fill="none" stroke="midnightblue" d="M1525.55,-247.73C1716.46,-241.702 2354.7,-221.111 2882,-199 2936.83,-196.701 3000.67,-193.52 3037.79,-191.62"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="3038.16,-195.106 3047.97,-191.097 3037.8,-188.115 3038.16,-195.106"/>
+<g id="edge53" class="edge"><title>Node20&#45;&gt;Node25</title>
+<path fill="none" stroke="midnightblue" d="M2244.31,-303.4C1985.75,-293.82 922.398,-254.425 726.199,-247.156"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="726.136,-243.651 716.013,-246.778 725.876,-250.646 726.136,-243.651"/>
 </g>
 <!-- Node26 -->
 <g id="node26" class="node"><title>Node26</title>
-<polygon fill="white" stroke="#bfbfbf" points="1364.5,-179.5 1364.5,-198.5 1405.5,-198.5 1405.5,-179.5 1364.5,-179.5"/>
-<text text-anchor="middle" x="1385" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00">tuple</text>
+<polygon fill="white" stroke="#bfbfbf" points="2414.5,-235.5 2414.5,-254.5 2455.5,-254.5 2455.5,-235.5 2414.5,-235.5"/>
+<text text-anchor="middle" x="2435" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00">tuple</text>
 </g>
 <!-- Node20&#45;&gt;Node26 -->
-<g id="edge55" class="edge"><title>Node20&#45;&gt;Node26</title>
-<path fill="none" stroke="midnightblue" d="M1449.09,-235.399C1436.01,-226.129 1418.92,-214.025 1405.68,-204.647"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1407.41,-201.582 1397.22,-198.658 1403.36,-207.294 1407.41,-201.582"/>
+<g id="edge56" class="edge"><title>Node20&#45;&gt;Node26</title>
+<path fill="none" stroke="midnightblue" d="M2332.76,-291.399C2355.04,-281.505 2384.59,-268.383 2406.19,-258.792"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2407.78,-261.915 2415.5,-254.658 2404.94,-255.517 2407.78,-261.915"/>
 </g>
 <!-- Node21&#45;&gt;Node12 -->
 <g id="edge32" class="edge"><title>Node21&#45;&gt;Node12</title>
-<path fill="none" stroke="midnightblue" d="M2954.81,-123.344C2935.36,-110.653 2897.75,-86.3982 2865,-67 2846.9,-56.2776 2826.44,-44.8619 2809.51,-35.6017"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2810.96,-32.4069 2800.51,-30.6971 2807.61,-38.5541 2810.96,-32.4069"/>
+<path fill="none" stroke="midnightblue" d="M1342.97,-179.26C1369.7,-153.215 1449.35,-75.5954 1488.21,-37.7372"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1490.68,-40.2084 1495.4,-30.7228 1485.8,-35.195 1490.68,-40.2084"/>
 </g>
 <!-- Node21&#45;&gt;Node13 -->
 <g id="edge33" class="edge"><title>Node21&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M2947.28,-123.47C2926.34,-114.725 2893.14,-100.53 2865,-87 2847.8,-78.7297 2845.4,-72.0501 2827,-67 2641.16,-16.0078 2021.53,-15.621 1872.46,-16.292"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1872.15,-12.7934 1862.17,-16.3435 1872.18,-19.7933 1872.15,-12.7934"/>
+<path fill="none" stroke="midnightblue" d="M1318.94,-179.442C1278.69,-156.987 1164.68,-96.1061 1062,-67 938.371,-31.9562 784.727,-20.9819 719.119,-17.7545"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="719.152,-14.2523 709.001,-17.2846 718.828,-21.2448 719.152,-14.2523"/>
 </g>
 <!-- Node21&#45;&gt;Node14 -->
 <g id="edge34" class="edge"><title>Node21&#45;&gt;Node14</title>
-<path fill="none" stroke="midnightblue" d="M2989.27,-123.319C3038.97,-103.035 3162.86,-52.4732 3220.55,-28.9284"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="3221.93,-32.1437 3229.87,-25.1245 3219.29,-25.6627 3221.93,-32.1437"/>
+<path fill="none" stroke="midnightblue" d="M1347.42,-179.45C1387.65,-153.989 1507.18,-79.6533 1551,-67 1722.86,-17.3805 2286.88,-15.6595 2449.65,-16.2326"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2449.77,-19.7331 2459.79,-16.2732 2449.8,-12.7331 2449.77,-19.7331"/>
 </g>
 <!-- Node22&#45;&gt;Node10 -->
 <g id="edge37" class="edge"><title>Node22&#45;&gt;Node10</title>
-<path fill="none" stroke="midnightblue" d="M1548.02,-182.399C1659.07,-172.372 1891.33,-151.398 2010.31,-140.655"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2010.77,-144.127 2020.41,-139.742 2010.14,-137.156 2010.77,-144.127"/>
+<path fill="none" stroke="midnightblue" d="M2272.35,-179.444C2252.04,-170.596 2220.1,-156.687 2196.9,-146.586"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2198.12,-143.3 2187.56,-142.516 2195.33,-149.718 2198.12,-143.3"/>
 </g>
 <!-- Node22&#45;&gt;Node11 -->
 <g id="edge38" class="edge"><title>Node22&#45;&gt;Node11</title>
-<path fill="none" stroke="midnightblue" d="M1506.35,-179.413C1541.74,-164.921 1617.22,-135.923 1684,-123 1801.59,-100.244 2155.08,-85.6227 2305.28,-80.2804"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2305.43,-83.7772 2315.3,-79.9268 2305.19,-76.7815 2305.43,-83.7772"/>
+<path fill="none" stroke="midnightblue" d="M2288.02,-179.385C2280.78,-164.848 2263.97,-135.788 2240,-123 2161.26,-80.9923 1900.61,-76.6744 1774.94,-77.1478"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1774.65,-73.6491 1764.67,-77.1984 1774.69,-80.649 1774.65,-73.6491"/>
 </g>
 <!-- Node22&#45;&gt;Node12 -->
 <g id="edge36" class="edge"><title>Node22&#45;&gt;Node12</title>
-<path fill="none" stroke="midnightblue" d="M1498.9,-179.307C1520.49,-165.185 1565.75,-137.394 1608,-123 1811.73,-53.5992 2476.26,-26.1414 2699.83,-18.7243"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2700.26,-22.2121 2710.14,-18.3861 2700.03,-15.2159 2700.26,-22.2121"/>
+<path fill="none" stroke="midnightblue" d="M2310.39,-179.471C2324.1,-171.997 2341.81,-159.71 2350,-143 2366.12,-110.087 2349.11,-86.3757 2318,-67 2256.61,-28.7628 1771.57,-19.3667 1583.84,-17.1526"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1583.82,-13.6523 1573.78,-17.0376 1583.74,-20.6519 1583.82,-13.6523"/>
 </g>
 <!-- Node22&#45;&gt;Node13 -->
 <g id="edge40" class="edge"><title>Node22&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M1441.09,-179.485C1402.01,-169.953 1354.6,-151.929 1375,-123 1425.54,-51.3148 1710.77,-25.3459 1807.73,-18.5166"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1808.09,-22.0009 1817.82,-17.8278 1807.61,-15.0171 1808.09,-22.0009"/>
+<path fill="none" stroke="midnightblue" d="M2291.26,-179.154C2289.55,-164.812 2283.97,-136.712 2266,-123 2090.67,10.7536 1992.82,-84.5759 1773,-67 1357.59,-33.7853 850.908,-20.2993 719.4,-17.2181"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="719.287,-13.7146 709.209,-16.9827 719.126,-20.7128 719.287,-13.7146"/>
 </g>
 <!-- Node22&#45;&gt;Node17 -->
 <g id="edge39" class="edge"><title>Node22&#45;&gt;Node17</title>
-<path fill="none" stroke="midnightblue" d="M1474.27,-179.324C1463.16,-171.107 1446.26,-158.602 1433.02,-148.807"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1435.06,-145.967 1424.94,-142.834 1430.9,-151.595 1435.06,-145.967"/>
+<path fill="none" stroke="midnightblue" d="M2314.02,-179.444C2337.1,-170.478 2373.56,-156.314 2399.64,-146.183"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2401.02,-149.4 2409.08,-142.516 2398.49,-142.875 2401.02,-149.4"/>
 </g>
 <!-- Node22&#45;&gt;Node18 -->
 <g id="edge41" class="edge"><title>Node22&#45;&gt;Node18</title>
-<path fill="none" stroke="midnightblue" d="M1423.96,-180.134C1420.59,-179.742 1417.26,-179.361 1414,-179 1267.37,-162.729 1094.31,-146.814 1006.79,-138.998"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1007.08,-135.509 996.805,-138.108 1006.46,-142.482 1007.08,-135.509"/>
+<path fill="none" stroke="midnightblue" d="M2354.18,-182.221C2467.61,-171.678 2706.53,-149.473 2816.01,-139.297"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2816.56,-142.761 2826.19,-138.351 2815.91,-135.791 2816.56,-142.761"/>
 </g>
 <!-- Node22&#45;&gt;Node19 -->
 <g id="edge42" class="edge"><title>Node22&#45;&gt;Node19</title>
-<path fill="none" stroke="midnightblue" d="M1423.97,-180.024C1420.6,-179.66 1417.26,-179.316 1414,-179 1382.87,-175.986 922.995,-144.969 793.63,-136.262"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="793.733,-132.761 783.521,-135.581 793.263,-139.745 793.733,-132.761"/>
+<path fill="none" stroke="midnightblue" d="M2354.24,-182.649C2369.52,-181.393 2385.85,-180.101 2401,-179 2663.45,-159.919 2980.37,-141.683 3083.1,-135.894"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3083.41,-139.382 3093.19,-135.326 3083.01,-132.393 3083.41,-139.382"/>
 </g>
 <!-- Node22&#45;&gt;Node20 -->
 <g id="edge43" class="edge"><title>Node22&#45;&gt;Node20</title>
-<path fill="none" stroke="midnightblue" d="M1488.46,-198.658C1487.95,-205.805 1485.52,-216.297 1482.36,-225.897"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1479.04,-224.806 1478.87,-235.399 1485.61,-227.222 1479.04,-224.806"/>
+<path fill="none" stroke="midnightblue" d="M2296.45,-198.669C2301.01,-215.987 2304.69,-255.408 2304.73,-281.215"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2301.23,-281.265 2304.51,-291.337 2308.22,-281.415 2301.23,-281.265"/>
+</g>
+<!-- Node23&#45;&gt;Node8 -->
+<g id="edge46" class="edge"><title>Node23&#45;&gt;Node8</title>
+<path fill="none" stroke="midnightblue" d="M2245.42,-235.444C2293.52,-225.964 2371.09,-210.675 2422.81,-200.483"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2423.65,-203.884 2432.79,-198.516 2422.3,-197.016 2423.65,-203.884"/>
 </g>
 <!-- Node23&#45;&gt;Node11 -->
-<g id="edge47" class="edge"><title>Node23&#45;&gt;Node11</title>
-<path fill="none" stroke="midnightblue" d="M2287.88,-179.433C2294,-166.432 2306.8,-141.299 2322,-123 2331.3,-111.808 2343.53,-101.073 2353.79,-92.8972"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2356.07,-95.5606 2361.84,-86.6772 2351.79,-90.0212 2356.07,-95.5606"/>
+<g id="edge48" class="edge"><title>Node23&#45;&gt;Node11</title>
+<path fill="none" stroke="midnightblue" d="M2194.66,-235.282C2174.61,-212.158 2116.1,-149.24 2052,-123 2003.25,-103.042 1861.03,-89.4311 1774.85,-82.7556"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1774.87,-79.2468 1764.63,-81.976 1774.33,-86.2265 1774.87,-79.2468"/>
 </g>
 <!-- Node23&#45;&gt;Node12 -->
 <g id="edge45" class="edge"><title>Node23&#45;&gt;Node12</title>
-<path fill="none" stroke="midnightblue" d="M2308.3,-179.495C2384.63,-152.781 2620.64,-70.1764 2724.3,-33.8933"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2725.63,-37.1371 2733.91,-30.53 2723.32,-30.5301 2725.63,-37.1371"/>
+<path fill="none" stroke="midnightblue" d="M2203.93,-235.401C2206.89,-222.78 2213.05,-198.613 2221,-179 2227.79,-162.233 2235.65,-160.561 2240,-143 2242.14,-134.372 2245.74,-129.791 2240,-123 2156.68,-24.3543 1751.76,-15.0325 1583.52,-15.5224"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1583.5,-12.0224 1573.51,-15.564 1583.52,-19.0223 1583.5,-12.0224"/>
 </g>
 <!-- Node23&#45;&gt;Node15 -->
-<g id="edge48" class="edge"><title>Node23&#45;&gt;Node15</title>
-<path fill="none" stroke="midnightblue" d="M2271.23,-179.436C2248.94,-164.978 2201.01,-136.033 2156,-123 1930.99,-57.8449 1172.29,-24.3242 1007.68,-17.7229"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1007.42,-14.2099 997.289,-17.3105 1007.14,-21.2044 1007.42,-14.2099"/>
+<g id="edge49" class="edge"><title>Node23&#45;&gt;Node15</title>
+<path fill="none" stroke="midnightblue" d="M2263.78,-242.07C2334.52,-238.486 2453.26,-228.247 2551,-199 2679.35,-160.594 2692.99,-103.133 2822,-67 2919.35,-39.7335 3232.56,-22.7696 3332.82,-17.9327"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3333.09,-21.424 3342.91,-17.4524 3332.75,-14.4319 3333.09,-21.424"/>
 </g>
 <!-- Node23&#45;&gt;Node19 -->
-<g id="edge49" class="edge"><title>Node23&#45;&gt;Node19</title>
-<path fill="none" stroke="midnightblue" d="M2222.35,-186.13C1970.59,-178.479 1027.54,-149.644 894,-143 859.873,-141.302 820.906,-138.599 793.824,-136.598"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="793.94,-133.097 783.707,-135.842 793.418,-140.077 793.94,-133.097"/>
+<g id="edge50" class="edge"><title>Node23&#45;&gt;Node19</title>
+<path fill="none" stroke="midnightblue" d="M2263.54,-236.829C2339.04,-228.013 2470.95,-212.552 2584,-199 2772.84,-176.363 2999.22,-148.523 3083.36,-138.151"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3083.87,-141.615 3093.36,-136.918 3083.01,-134.668 3083.87,-141.615"/>
 </g>
 <!-- Node23&#45;&gt;Node21 -->
-<g id="edge46" class="edge"><title>Node23&#45;&gt;Node21</title>
-<path fill="none" stroke="midnightblue" d="M2345.6,-183.137C2470.31,-173.292 2752.41,-151.02 2889.83,-140.171"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2890.18,-143.655 2899.87,-139.379 2889.63,-136.677 2890.18,-143.655"/>
+<g id="edge47" class="edge"><title>Node23&#45;&gt;Node21</title>
+<path fill="none" stroke="midnightblue" d="M2140.27,-240.16C1985.89,-230.555 1583.2,-205.503 1412.41,-194.878"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1412.43,-191.373 1402.23,-194.245 1412,-198.359 1412.43,-191.373"/>
 </g>
 <!-- Node27&#45;&gt;Node10 -->
-<g id="edge81" class="edge"><title>Node27&#45;&gt;Node10</title>
-<path fill="none" stroke="midnightblue" d="M2210.1,-425.405C2207.66,-403.975 2199.95,-346.685 2183,-302 2160.47,-242.589 2117.76,-179.99 2096.57,-150.812"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2099.25,-148.541 2090.5,-142.56 2093.61,-152.689 2099.25,-148.541"/>
+<g id="edge82" class="edge"><title>Node27&#45;&gt;Node10</title>
+<path fill="none" stroke="midnightblue" d="M1717.27,-481.221C1714.8,-466.702 1711.45,-437.007 1720,-414 1741.35,-356.53 1804.82,-249.677 1932,-179 1981.78,-151.335 2045.59,-140.38 2094.31,-136.179"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2094.77,-139.654 2104.47,-135.382 2094.22,-132.675 2094.77,-139.654"/>
 </g>
 <!-- Node27&#45;&gt;Node11 -->
-<g id="edge83" class="edge"><title>Node27&#45;&gt;Node11</title>
-<path fill="none" stroke="midnightblue" d="M2229.35,-425.479C2244.49,-417.708 2265.72,-405.01 2280,-389 2342.84,-318.538 2340.05,-286.705 2375,-199 2387.81,-166.846 2398.55,-156.988 2392,-123 2390.21,-113.705 2386.52,-103.889 2382.94,-95.8048"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2386.06,-94.2089 2378.62,-86.6611 2379.73,-97.1981 2386.06,-94.2089"/>
+<g id="edge84" class="edge"><title>Node27&#45;&gt;Node11</title>
+<path fill="none" stroke="midnightblue" d="M1713.64,-481.333C1690.47,-443.255 1598.65,-286.321 1569,-143 1567.2,-134.295 1563.79,-130.2 1569,-123 1584.88,-101.064 1611.81,-89.6317 1637.6,-83.7429"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1638.31,-87.1712 1647.41,-81.747 1636.91,-80.3118 1638.31,-87.1712"/>
 </g>
 <!-- Node27&#45;&gt;Node12 -->
-<g id="edge79" class="edge"><title>Node27&#45;&gt;Node12</title>
-<path fill="none" stroke="midnightblue" d="M2255.74,-425.481C2415.03,-394.152 2950.06,-279.994 3045,-143 3097.84,-66.7479 2944.91,-35.3135 2847.7,-23.2809"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2847.94,-19.7853 2837.6,-22.0751 2847.11,-26.736 2847.94,-19.7853"/>
+<g id="edge80" class="edge"><title>Node27&#45;&gt;Node12</title>
+<path fill="none" stroke="midnightblue" d="M1659.39,-485.618C1588.06,-479.498 1473.67,-466.64 1437,-445 1313.2,-371.93 1238,-333.76 1238,-190 1238,-190 1238,-190 1238,-132 1238,-45.6557 1355.56,-22.9213 1436.11,-17.4779"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1436.46,-20.9635 1446.23,-16.8708 1436.04,-13.9761 1436.46,-20.9635"/>
 </g>
 <!-- Node27&#45;&gt;Node13 -->
-<g id="edge85" class="edge"><title>Node27&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M2151.29,-432.134C2022.51,-427.67 1722.38,-414.777 1624,-389 1498.78,-356.191 1442.56,-365.697 1360,-266 1303.07,-197.256 1311.45,-114.533 1387,-67 1457.23,-22.8145 1716.35,-17.1322 1807.79,-16.5221"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1807.86,-20.0219 1817.85,-16.4714 1807.83,-13.022 1807.86,-20.0219"/>
+<g id="edge86" class="edge"><title>Node27&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M1659.24,-485.851C1576.41,-479.448 1431.44,-465.691 1383,-445 1352.07,-431.787 940.312,-81.5848 910,-67 846.584,-36.4867 764.053,-24.0165 719.296,-19.2327"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="719.59,-15.7447 709.29,-18.225 718.889,-22.7094 719.59,-15.7447"/>
 </g>
 <!-- Node27&#45;&gt;Node14 -->
-<g id="edge86" class="edge"><title>Node27&#45;&gt;Node14</title>
-<path fill="none" stroke="midnightblue" d="M2270.72,-427.199C2330.05,-419.88 2423.42,-406.842 2503,-389 2889.3,-302.39 3127.81,-420.586 3341,-87 3357.61,-61.0067 3320.88,-40.5897 3289.45,-28.5589"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="3290.53,-25.226 3279.94,-25.0976 3288.14,-31.8046 3290.53,-25.226"/>
+<g id="edge87" class="edge"><title>Node27&#45;&gt;Node14</title>
+<path fill="none" stroke="midnightblue" d="M1727.17,-481.362C1745.1,-462.464 1786.97,-418.748 1795,-414 1871.89,-368.548 1900.27,-374.752 1988,-358 2224.56,-312.829 2324.16,-395.174 2520,-255 2573.5,-216.707 2596.66,-188.078 2587,-123 2583.18,-97.2715 2585.52,-87.8703 2570,-67 2557.91,-50.7427 2538.93,-38.1248 2522.96,-29.5677"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2524.48,-26.4153 2513.98,-25.0019 2521.31,-32.6548 2524.48,-26.4153"/>
 </g>
 <!-- Node27&#45;&gt;Node19 -->
-<g id="edge87" class="edge"><title>Node27&#45;&gt;Node19</title>
-<path fill="none" stroke="midnightblue" d="M2151.45,-434.113C2040.3,-433.347 1794.14,-427.036 1591,-389 1270.23,-328.94 898.419,-188.391 790.684,-146.191"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="791.963,-142.933 781.376,-142.532 789.402,-149.448 791.963,-142.933"/>
+<g id="edge88" class="edge"><title>Node27&#45;&gt;Node19</title>
+<path fill="none" stroke="midnightblue" d="M1778.6,-488.064C1959.14,-480.217 2512.66,-441.456 2922,-255 2989.9,-224.072 3062.81,-173.645 3097.06,-148.796"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3099.28,-151.507 3105.29,-142.779 3095.15,-145.857 3099.28,-151.507"/>
 </g>
 <!-- Node27&#45;&gt;Node20 -->
-<g id="edge84" class="edge"><title>Node27&#45;&gt;Node20</title>
-<path fill="none" stroke="midnightblue" d="M2206.29,-425.256C2197.29,-409.409 2176.14,-375.926 2149,-358 2048.79,-291.807 1686.72,-263.903 1535.79,-254.969"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1535.91,-251.47 1525.72,-254.383 1535.5,-258.458 1535.91,-251.47"/>
+<g id="edge85" class="edge"><title>Node27&#45;&gt;Node20</title>
+<path fill="none" stroke="midnightblue" d="M1721.95,-481.421C1727.58,-466.132 1741.19,-433.915 1762,-414 1801.64,-376.071 1819.49,-373.899 1872,-358 1996.15,-320.406 2148.38,-310.56 2234.07,-308.127"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2234.27,-311.623 2244.18,-307.866 2234.09,-304.625 2234.27,-311.623"/>
 </g>
 <!-- Node27&#45;&gt;Node21 -->
-<g id="edge80" class="edge"><title>Node27&#45;&gt;Node21</title>
-<path fill="none" stroke="midnightblue" d="M2235.32,-425.431C2260.38,-416.54 2300.54,-402.125 2335,-389 2547.87,-307.915 2599.99,-284.791 2811,-199 2855.26,-181.004 2906.36,-159.76 2937.93,-146.582"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2939.58,-149.685 2947.46,-142.601 2936.88,-143.226 2939.58,-149.685"/>
+<g id="edge81" class="edge"><title>Node27&#45;&gt;Node21</title>
+<path fill="none" stroke="midnightblue" d="M1690.8,-481.443C1667.97,-473.808 1635.53,-461.318 1610,-445 1551.42,-407.56 1399.07,-255.708 1349.7,-205.905"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1351.96,-203.22 1342.44,-198.573 1346.99,-208.143 1351.96,-203.22"/>
 </g>
 <!-- Node27&#45;&gt;Node23 -->
-<g id="edge82" class="edge"><title>Node27&#45;&gt;Node23</title>
-<path fill="none" stroke="midnightblue" d="M2224.43,-425.413C2235.67,-417.433 2251.12,-404.47 2259,-389 2289.67,-328.812 2288.21,-245.562 2285.71,-208.77"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2289.18,-208.237 2284.9,-198.545 2282.2,-208.791 2289.18,-208.237"/>
+<g id="edge83" class="edge"><title>Node27&#45;&gt;Node23</title>
+<path fill="none" stroke="midnightblue" d="M1719.07,-481.297C1719.56,-466.369 1722.32,-435.424 1736,-414 1758.01,-379.535 1772.47,-376.373 1809,-358 1918.56,-302.898 2059.34,-271.145 2139.23,-256.331"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2139.93,-259.761 2149.14,-254.523 2138.68,-252.875 2139.93,-259.761"/>
 </g>
 <!-- Node27&#45;&gt;Node28 -->
-<g id="edge63" class="edge"><title>Node27&#45;&gt;Node28</title>
-<path fill="none" stroke="midnightblue" d="M2192.95,-425.475C2175.01,-417.002 2146.91,-403.737 2123.89,-392.863"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2125.23,-389.628 2114.7,-388.523 2122.24,-395.958 2125.23,-389.628"/>
+<g id="edge64" class="edge"><title>Node27&#45;&gt;Node28</title>
+<path fill="none" stroke="midnightblue" d="M1687.91,-481.475C1655.43,-472.569 1603.64,-458.369 1562.96,-447.215"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1563.71,-443.792 1553.14,-444.523 1561.86,-450.543 1563.71,-443.792"/>
 </g>
 <!-- Node27&#45;&gt;Node30 -->
-<g id="edge73" class="edge"><title>Node27&#45;&gt;Node30</title>
-<path fill="none" stroke="midnightblue" d="M2151.37,-427.204C2052.97,-415.987 1857.28,-393.681 1754.9,-382.012"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1755.02,-378.503 1744.69,-380.848 1754.23,-385.458 1755.02,-378.503"/>
+<g id="edge74" class="edge"><title>Node27&#45;&gt;Node30</title>
+<path fill="none" stroke="midnightblue" d="M1739.2,-481.475C1759.55,-472.889 1791.56,-459.383 1817.51,-448.429"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1818.92,-451.636 1826.77,-444.523 1816.2,-445.186 1818.92,-451.636"/>
 </g>
 <!-- Node28&#45;&gt;Node7 -->
-<g id="edge64" class="edge"><title>Node28&#45;&gt;Node7</title>
-<path fill="none" stroke="midnightblue" d="M2038.33,-358.475C2003.87,-348.12 1957.54,-334.199 1925.26,-324.498"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1925.89,-321.032 1915.31,-321.506 1923.88,-327.736 1925.89,-321.032"/>
+<g id="edge65" class="edge"><title>Node28&#45;&gt;Node7</title>
+<path fill="none" stroke="midnightblue" d="M1557.53,-423.272C1672.79,-412.421 1937.74,-387.477 2064.27,-375.565"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2064.61,-379.048 2074.24,-374.626 2063.95,-372.079 2064.61,-379.048"/>
 </g>
 <!-- Node28&#45;&gt;Node13 -->
-<g id="edge72" class="edge"><title>Node28&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M2081.75,-358.239C2069.98,-309.854 2025.04,-150.342 1927,-67 1909.15,-51.8254 1886.11,-38.6866 1868.23,-29.6172"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1869.41,-26.2955 1858.89,-25.0103 1866.31,-32.5732 1869.41,-26.2955"/>
+<g id="edge73" class="edge"><title>Node28&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M1471.1,-414.395C1429.14,-394.942 1352.14,-358.167 1289,-322 1168.85,-253.176 1141.72,-230.002 1034,-143 994.317,-110.95 993.764,-89.5279 948,-67 871.844,-29.5113 770.4,-19.8234 719.311,-17.34"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="719.34,-13.8382 709.201,-16.912 719.044,-20.8319 719.34,-13.8382"/>
 </g>
 <!-- Node28&#45;&gt;Node21 -->
-<g id="edge71" class="edge"><title>Node28&#45;&gt;Node21</title>
-<path fill="none" stroke="midnightblue" d="M2140.92,-362.651C2191.79,-353.348 2268.81,-338.406 2335,-322 2565.29,-264.919 2836.94,-177.26 2932.56,-145.77"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2933.91,-149.013 2942.31,-142.556 2931.71,-142.365 2933.91,-149.013"/>
+<g id="edge72" class="edge"><title>Node28&#45;&gt;Node21</title>
+<path fill="none" stroke="midnightblue" d="M1488.53,-414.382C1463.48,-388.476 1407.49,-332.227 1355,-291 1332.02,-272.954 1313.89,-280.7 1300,-255 1291.07,-238.486 1303.7,-219.439 1316.02,-206.251"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1318.82,-208.401 1323.45,-198.872 1313.89,-203.432 1318.82,-208.401"/>
 </g>
 <!-- Node29 -->
 <g id="node29" class="node"><title>Node29</title>
 <g id="a_node29"><a xlink:href="functor_8h.html" target="_top" xlink:title="Defines the Functor data structures. ">
-<polygon fill="white" stroke="black" points="1868.5,-179.5 1868.5,-198.5 1975.5,-198.5 1975.5,-179.5 1868.5,-179.5"/>
-<text text-anchor="middle" x="1922" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/node/functor.h</text>
+<polygon fill="white" stroke="black" points="1772.5,-179.5 1772.5,-198.5 1879.5,-198.5 1879.5,-179.5 1772.5,-179.5"/>
+<text text-anchor="middle" x="1826" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/node/functor.h</text>
 </a>
 </g>
 </g>
 <!-- Node28&#45;&gt;Node29 -->
-<g id="edge65" class="edge"><title>Node28&#45;&gt;Node29</title>
-<path fill="none" stroke="midnightblue" d="M2072.48,-358.483C2042.8,-325.256 1968.31,-241.846 1936.63,-206.375"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1939.15,-203.946 1929.88,-198.819 1933.93,-208.609 1939.15,-203.946"/>
+<g id="edge66" class="edge"><title>Node28&#45;&gt;Node29</title>
+<path fill="none" stroke="midnightblue" d="M1515.94,-414.21C1541.9,-388.026 1600,-331.341 1655,-291 1703.67,-255.307 1765.73,-221.204 1800.11,-203.205"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1801.78,-206.282 1809.04,-198.567 1798.56,-200.07 1801.78,-206.282"/>
 </g>
 <!-- Node29&#45;&gt;Node9 -->
-<g id="edge66" class="edge"><title>Node29&#45;&gt;Node9</title>
-<path fill="none" stroke="midnightblue" d="M1897.76,-179.444C1872.13,-170.399 1831.51,-156.064 1802.76,-145.916"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1803.72,-142.544 1793.13,-142.516 1801.39,-149.145 1803.72,-142.544"/>
+<g id="edge67" class="edge"><title>Node29&#45;&gt;Node9</title>
+<path fill="none" stroke="midnightblue" d="M1848.34,-179.444C1871.75,-170.478 1908.74,-156.314 1935.19,-146.183"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1936.68,-149.361 1944.76,-142.516 1934.17,-142.824 1936.68,-149.361"/>
 </g>
 <!-- Node29&#45;&gt;Node11 -->
-<g id="edge67" class="edge"><title>Node29&#45;&gt;Node11</title>
-<path fill="none" stroke="midnightblue" d="M1974.37,-179.47C2022.01,-171.346 2094.12,-158.136 2156,-143 2221.17,-127.06 2295.85,-103.652 2338.71,-89.7068"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2339.97,-92.978 2348.39,-86.5441 2337.8,-86.3245 2339.97,-92.978"/>
+<g id="edge68" class="edge"><title>Node29&#45;&gt;Node11</title>
+<path fill="none" stroke="midnightblue" d="M1817.33,-179.445C1804.53,-166.88 1779.75,-142.781 1758,-123 1746.96,-112.958 1734.3,-101.994 1724.24,-93.4023"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1726.38,-90.6249 1716.49,-86.8157 1721.84,-95.9587 1726.38,-90.6249"/>
 </g>
 <!-- Node29&#45;&gt;Node14 -->
-<g id="edge68" class="edge"><title>Node29&#45;&gt;Node14</title>
-<path fill="none" stroke="midnightblue" d="M1975.74,-181.065C2193.41,-152.976 3007.34,-47.9435 3206.71,-22.2155"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="3207.34,-25.6627 3216.81,-20.9115 3206.45,-18.7202 3207.34,-25.6627"/>
+<g id="edge69" class="edge"><title>Node29&#45;&gt;Node14</title>
+<path fill="none" stroke="midnightblue" d="M1825.67,-179.341C1825.63,-165.269 1827.79,-137.549 1844,-123 1933.89,-42.3049 2318.14,-22.2657 2449.65,-17.727"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2449.89,-21.2211 2459.77,-17.3914 2449.66,-14.2249 2449.89,-21.2211"/>
 </g>
 <!-- Node29&#45;&gt;Node15 -->
-<g id="edge69" class="edge"><title>Node29&#45;&gt;Node15</title>
-<path fill="none" stroke="midnightblue" d="M1911.93,-179.282C1894.65,-164.863 1857.65,-136.32 1821,-123 1666.25,-66.7548 1141.19,-27.7668 1007.29,-18.5801"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1007.5,-15.0865 997.282,-17.8996 1007.02,-22.0704 1007.5,-15.0865"/>
+<g id="edge70" class="edge"><title>Node29&#45;&gt;Node15</title>
+<path fill="none" stroke="midnightblue" d="M1834.38,-179.358C1849.11,-164.781 1881.33,-135.66 1915,-123 2187.58,-20.5068 3147.24,-16.2619 3332.8,-16.4148"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3332.96,-19.9149 3342.96,-16.4284 3332.97,-12.9149 3332.96,-19.9149"/>
 </g>
 <!-- Node29&#45;&gt;Node19 -->
-<g id="edge70" class="edge"><title>Node29&#45;&gt;Node19</title>
-<path fill="none" stroke="midnightblue" d="M1868.31,-182.713C1853.04,-181.345 1836.37,-179.983 1821,-179 1409.53,-152.694 1305.73,-164.817 894,-143 859.879,-141.192 820.911,-138.505 793.828,-136.537"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="793.938,-133.036 783.709,-135.796 793.427,-140.017 793.938,-133.036"/>
+<g id="edge71" class="edge"><title>Node29&#45;&gt;Node19</title>
+<path fill="none" stroke="midnightblue" d="M1879.62,-185.939C2039.2,-179.778 2525.09,-160.819 2928,-143 2982.21,-140.602 3045.16,-137.552 3082.99,-135.689"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3083.62,-139.162 3093.43,-135.174 3083.27,-132.171 3083.62,-139.162"/>
 </g>
 <!-- Node30&#45;&gt;Node7 -->
-<g id="edge74" class="edge"><title>Node30&#45;&gt;Node7</title>
-<path fill="none" stroke="midnightblue" d="M1735.67,-358.475C1770.13,-348.12 1816.46,-334.199 1848.74,-324.498"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1850.12,-327.736 1858.69,-321.506 1848.11,-321.032 1850.12,-327.736"/>
+<g id="edge75" class="edge"><title>Node30&#45;&gt;Node7</title>
+<path fill="none" stroke="midnightblue" d="M1915.78,-416.388C1965.34,-405.625 2037.1,-390.042 2084.63,-379.722"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2085.65,-383.082 2094.68,-377.539 2084.16,-376.241 2085.65,-383.082"/>
 </g>
 <!-- Node30&#45;&gt;Node13 -->
-<g id="edge78" class="edge"><title>Node30&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M1647.61,-358.431C1573.44,-333.021 1423.65,-280.738 1403,-266 1365.77,-239.422 1340.55,-224.135 1348,-179 1357.75,-119.892 1369.13,-93.2172 1423,-67 1490.91,-33.9468 1721.94,-21.3201 1807.65,-17.7044"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1807.98,-21.194 1817.83,-17.289 1807.69,-14.1999 1807.98,-21.194"/>
+<g id="edge79" class="edge"><title>Node30&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M1805.71,-414.475C1769.81,-404.906 1721.84,-391.552 1680,-378 1611.51,-355.814 1595.16,-347.949 1528,-322 1392.05,-269.467 1355.96,-260.88 1224,-199 1111.66,-146.321 1095.3,-107.46 978,-67 887.34,-35.728 773.826,-23.195 719.242,-18.7118"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="719.379,-15.2119 709.136,-17.9193 718.831,-22.1904 719.379,-15.2119"/>
 </g>
 <!-- Node30&#45;&gt;Node21 -->
-<g id="edge76" class="edge"><title>Node30&#45;&gt;Node21</title>
-<path fill="none" stroke="midnightblue" d="M1744.58,-363.216C1851.89,-345.199 2095.01,-303.927 2299,-266 2529.81,-223.086 2804.48,-167.448 2916.55,-144.551"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2917.47,-147.936 2926.57,-142.504 2916.07,-141.078 2917.47,-147.936"/>
+<g id="edge77" class="edge"><title>Node30&#45;&gt;Node21</title>
+<path fill="none" stroke="midnightblue" d="M1823.43,-414.394C1751.35,-386.28 1586.28,-320.326 1452,-255 1418.54,-238.723 1381.1,-217.575 1357.55,-203.896"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1359.05,-200.72 1348.65,-198.699 1355.52,-206.765 1359.05,-200.72"/>
 </g>
 <!-- Node30&#45;&gt;Node24 -->
-<g id="edge77" class="edge"><title>Node30&#45;&gt;Node24</title>
-<path fill="none" stroke="midnightblue" d="M1634.4,-358.462C1597.01,-348.675 1546.35,-335.077 1502,-322 1423.67,-298.904 1404.31,-292.295 1327,-266 1288.53,-252.918 1279.64,-247.559 1241,-235 1202.77,-222.573 1158.7,-209.941 1127.35,-201.237"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1128.22,-197.844 1117.65,-198.554 1126.35,-204.591 1128.22,-197.844"/>
+<g id="edge78" class="edge"><title>Node30&#45;&gt;Node24</title>
+<path fill="none" stroke="midnightblue" d="M1891.92,-414.497C1931.62,-397.793 2001.88,-370.555 2065,-358 2320.49,-307.181 2397.77,-387.066 2650,-322 2702.13,-308.553 2757.78,-277.96 2787.76,-259.957"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2789.84,-262.787 2796.56,-254.592 2786.2,-256.811 2789.84,-262.787"/>
 </g>
 <!-- Node30&#45;&gt;Node29 -->
-<g id="edge75" class="edge"><title>Node30&#45;&gt;Node29</title>
-<path fill="none" stroke="midnightblue" d="M1706.9,-358.483C1749.95,-324.757 1859.02,-239.33 1903.09,-204.81"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1905.29,-207.537 1911,-198.615 1900.97,-202.026 1905.29,-207.537"/>
+<g id="edge76" class="edge"><title>Node30&#45;&gt;Node29</title>
+<path fill="none" stroke="midnightblue" d="M1857.97,-414.295C1852.15,-373.433 1835.34,-255.551 1828.73,-209.12"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1832.15,-208.352 1827.28,-198.946 1825.22,-209.34 1832.15,-208.352"/>
 </g>
 <!-- Node31&#45;&gt;Node29 -->
-<g id="edge89" class="edge"><title>Node31&#45;&gt;Node29</title>
-<path fill="none" stroke="midnightblue" d="M2412.22,-425.459C2336.48,-389.081 2040.5,-246.917 1949.06,-202.998"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1950.43,-199.773 1939.9,-198.598 1947.4,-206.083 1950.43,-199.773"/>
+<g id="edge90" class="edge"><title>Node31&#45;&gt;Node29</title>
+<path fill="none" stroke="midnightblue" d="M1346.86,-481.425C1388.01,-446.037 1551.18,-309.709 1707,-235 1734.01,-222.049 1765.8,-210.19 1789.73,-201.901"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1791.13,-205.12 1799.46,-198.575 1788.87,-198.496 1791.13,-205.12"/>
 </g>
 <!-- Node32 -->
 <g id="node32" class="node"><title>Node32</title>
-<polygon fill="white" stroke="#bfbfbf" points="2779.5,-364 2779.5,-383 2838.5,-383 2838.5,-364 2779.5,-364"/>
-<text text-anchor="middle" x="2809" y="-371" font-family="Helvetica,sans-Serif" font-size="10.00">iostream</text>
+<polygon fill="white" stroke="#bfbfbf" points="1064.5,-420 1064.5,-439 1123.5,-439 1123.5,-420 1064.5,-420"/>
+<text text-anchor="middle" x="1094" y="-427" font-family="Helvetica,sans-Serif" font-size="10.00">iostream</text>
 </g>
 <!-- Node31&#45;&gt;Node32 -->
-<g id="edge90" class="edge"><title>Node31&#45;&gt;Node32</title>
-<path fill="none" stroke="midnightblue" d="M2489.24,-425.464C2549.23,-416.715 2644.66,-402.516 2727,-389 2740.87,-386.723 2756.03,-384.085 2769.5,-381.687"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2770.2,-385.118 2779.43,-379.911 2768.96,-378.228 2770.2,-385.118"/>
+<g id="edge91" class="edge"><title>Node31&#45;&gt;Node32</title>
+<path fill="none" stroke="midnightblue" d="M1302.18,-481.475C1257.56,-470.548 1180.38,-451.65 1133.57,-440.189"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1134.33,-436.771 1123.78,-437.793 1132.66,-443.571 1134.33,-436.771"/>
 </g>
 <!-- Node33&#45;&gt;Node5 -->
-<g id="edge104" class="edge"><title>Node33&#45;&gt;Node5</title>
-<path fill="none" stroke="midnightblue" d="M2176.78,-594.548C2226.81,-584.933 2313.94,-568.19 2368.93,-557.622"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2369.65,-561.049 2378.81,-555.724 2368.32,-554.174 2369.65,-561.049"/>
+<g id="edge105" class="edge"><title>Node33&#45;&gt;Node5</title>
+<path fill="none" stroke="midnightblue" d="M784.434,-649.444C820.856,-640.122 879.23,-625.183 919.029,-614.997"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="919.903,-618.387 928.723,-612.516 918.167,-611.605 919.903,-618.387"/>
 </g>
 <!-- Node33&#45;&gt;Node6 -->
-<g id="edge106" class="edge"><title>Node33&#45;&gt;Node6</title>
-<path fill="none" stroke="midnightblue" d="M2148.28,-593.368C2170.47,-574.733 2223.11,-530.532 2250.98,-507.134"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2253.36,-509.702 2258.77,-500.591 2248.86,-504.342 2253.36,-509.702"/>
+<g id="edge107" class="edge"><title>Node33&#45;&gt;Node6</title>
+<path fill="none" stroke="midnightblue" d="M789.638,-654.528C841.087,-649.184 934.493,-636.949 1011,-613 1031.11,-606.704 1033.7,-598.659 1054,-593 1177.06,-558.703 1327.47,-550.313 1408.85,-548.406"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1408.98,-551.904 1418.91,-548.195 1408.83,-544.905 1408.98,-551.904"/>
 </g>
 <!-- Node33&#45;&gt;Node7 -->
-<g id="edge105" class="edge"><title>Node33&#45;&gt;Node7</title>
-<path fill="none" stroke="midnightblue" d="M2099.43,-599.242C2038.05,-592.526 1925,-569.62 1925,-492 1925,-492 1925,-492 1925,-434 1925,-395.812 1907.56,-353.83 1896.31,-330.722"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1899.29,-328.865 1891.66,-321.515 1893.04,-332.02 1899.29,-328.865"/>
+<g id="edge106" class="edge"><title>Node33&#45;&gt;Node7</title>
+<path fill="none" stroke="midnightblue" d="M789.756,-655.403C973.723,-642.724 1759.45,-584.539 1989,-501 2032.21,-485.274 2042.21,-476.19 2076,-445 2095.17,-427.307 2112.66,-402.696 2123.31,-386.328"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2126.35,-388.076 2128.75,-377.758 2120.44,-384.324 2126.35,-388.076"/>
 </g>
 <!-- Node33&#45;&gt;Node11 -->
-<g id="edge108" class="edge"><title>Node33&#45;&gt;Node11</title>
-<path fill="none" stroke="midnightblue" d="M2176.81,-599.167C2311.56,-588.821 2751,-550.589 2751,-492 2751,-492 2751,-492 2751,-434 2751,-292.881 2909.59,-227.728 2815,-123 2790.57,-95.9453 2560.71,-84.2877 2442.91,-80.056"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2443.02,-76.5578 2432.9,-79.7048 2442.77,-83.5535 2443.02,-76.5578"/>
+<g id="edge109" class="edge"><title>Node33&#45;&gt;Node11</title>
+<path fill="none" stroke="midnightblue" d="M748.438,-649.249C743.251,-630.738 732,-586.179 732,-548 732,-548 732,-548 732,-490 732,-308.692 830.543,-253.139 996,-179 1110.05,-127.894 1481.88,-94.8294 1637.03,-82.9397"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1637.67,-86.4013 1647.37,-82.1537 1637.14,-79.4214 1637.67,-86.4013"/>
 </g>
 <!-- Node33&#45;&gt;Node13 -->
-<g id="edge109" class="edge"><title>Node33&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M2137.68,-593.493C2136.84,-567.912 2135.05,-489.482 2142,-425 2143.75,-408.794 2147.21,-405.201 2149,-389 2162.01,-271.455 2224.99,-219.055 2156,-123 2089.62,-30.5759 1938.88,-17.2947 1872.46,-16.1102"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1872.22,-12.6078 1862.18,-16.0132 1872.15,-19.6075 1872.22,-12.6078"/>
+<g id="edge110" class="edge"><title>Node33&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M739.286,-649.286C727.962,-640.626 710.574,-626.694 697,-613 624.63,-539.986 455.062,-344.225 404,-255 388.704,-228.272 380,-220.795 380,-190 380,-190 380,-190 380,-132 380,-101.205 380.384,-86.7643 404,-67 441.614,-35.5204 588.303,-22.6102 654.396,-18.3121"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="654.889,-21.7882 664.652,-17.6726 654.453,-14.8018 654.889,-21.7882"/>
 </g>
 <!-- Node33&#45;&gt;Node21 -->
-<g id="edge107" class="edge"><title>Node33&#45;&gt;Node21</title>
-<path fill="none" stroke="midnightblue" d="M2176.9,-601.442C2306.23,-599.281 2716.34,-589.878 2765,-557 2912.07,-457.636 2955.85,-220.974 2965.62,-152.722"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2969.11,-153.006 2966.99,-142.626 2962.18,-152.065 2969.11,-153.006"/>
+<g id="edge108" class="edge"><title>Node33&#45;&gt;Node21</title>
+<path fill="none" stroke="midnightblue" d="M743.344,-649.495C728.826,-632.92 697.211,-594.617 680,-557 620.394,-426.722 543.003,-310.114 665,-235 717.055,-202.949 1153.02,-203.286 1214,-199 1227.5,-198.051 1241.85,-197.006 1255.76,-195.975"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1256.08,-199.461 1265.79,-195.227 1255.56,-192.48 1256.08,-199.461"/>
 </g>
 <!-- Node34&#45;&gt;Node5 -->
-<g id="edge127" class="edge"><title>Node34&#45;&gt;Node5</title>
-<path fill="none" stroke="midnightblue" d="M2695.68,-881.034C2856.86,-876.927 3358.77,-856.415 3284,-761 3181.16,-629.763 2637.96,-568.498 2469.12,-552.433"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2469.3,-548.934 2459.01,-551.483 2468.64,-555.903 2469.3,-548.934"/>
+<g id="edge128" class="edge"><title>Node34&#45;&gt;Node5</title>
+<path fill="none" stroke="midnightblue" d="M2000.45,-937.859C1795.44,-936.379 1002.07,-921.519 805,-781 744.633,-737.956 655.272,-705.736 703,-649 729.164,-617.898 844.288,-608.292 911.924,-605.325"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="912.096,-608.821 921.946,-604.918 911.812,-601.827 912.096,-608.821"/>
 </g>
 <!-- Node34&#45;&gt;Node6 -->
-<g id="edge128" class="edge"><title>Node34&#45;&gt;Node6</title>
-<path fill="none" stroke="midnightblue" d="M2598.36,-878.192C2489.75,-866.767 2237,-822.7 2237,-660 2237,-660 2237,-660 2237,-602 2237,-568.329 2250.99,-531.085 2260.47,-509.719"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2263.67,-511.138 2264.68,-500.592 2257.32,-508.207 2263.67,-511.138"/>
+<g id="edge129" class="edge"><title>Node34&#45;&gt;Node6</title>
+<path fill="none" stroke="midnightblue" d="M2058.9,-929.353C2079.68,-910.111 2124.8,-861.9 2111,-817 2081.16,-719.889 2045.57,-696.942 1956,-649 1814.88,-573.464 1622.86,-554.356 1527.21,-549.567"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1527.3,-546.067 1517.15,-549.098 1526.97,-553.06 1527.3,-546.067"/>
 </g>
 <!-- Node34&#45;&gt;Node13 -->
-<g id="edge152" class="edge"><title>Node34&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M2695.52,-881.625C2891.2,-879.484 3612,-865.268 3612,-772 3612,-772 3612,-772 3612,-311 3612,-138.097 3479.54,-95.5127 3309,-67 3163.03,-42.5946 2075.33,-20.9519 1872.51,-17.1057"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1872.29,-13.601 1862.23,-16.9115 1872.16,-20.5997 1872.29,-13.601"/>
+<g id="edge153" class="edge"><title>Node34&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M2000.3,-938.214C1819.93,-938.629 1172.98,-936.778 642,-893 411.233,-873.974 152,-947.55 152,-716 152,-716 152,-716 152,-132 152,-97.8442 163.97,-84.9972 193,-67 270.599,-18.8919 557.963,-15.821 654.859,-16.1943"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="654.914,-19.6946 664.932,-16.2486 654.951,-12.6947 654.914,-19.6946"/>
 </g>
 <!-- Node34&#45;&gt;Node19 -->
-<g id="edge153" class="edge"><title>Node34&#45;&gt;Node19</title>
-<path fill="none" stroke="midnightblue" d="M2598.37,-882.071C2285.25,-881.903 566,-871.359 566,-660 566,-660 566,-660 566,-546 566,-433.999 463.276,-406.72 503,-302 528.906,-233.705 550.521,-218.442 612,-179 647.336,-156.33 694.705,-144.566 726.234,-138.843"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="727.012,-142.261 736.279,-137.126 725.833,-135.361 727.012,-142.261"/>
+<g id="edge154" class="edge"><title>Node34&#45;&gt;Node19</title>
+<path fill="none" stroke="midnightblue" d="M2097.77,-936.776C2293.22,-931.07 3018.41,-901.974 3207,-781 3367.52,-678.034 3321.14,-543.99 3279,-358 3259.28,-270.96 3251.31,-241.903 3188,-179 3175.09,-166.175 3157.91,-155.167 3143.69,-147.25"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3145.31,-144.147 3134.84,-142.505 3142,-150.316 3145.31,-144.147"/>
 </g>
 <!-- Node35 -->
 <g id="node35" class="node"><title>Node35</title>
 <g id="a_node35"><a xlink:href="tir_2expr_8h.html" target="_top" xlink:title="TIR expressions. ">
-<polygon fill="white" stroke="black" points="2804,-817.5 2804,-836.5 2884,-836.5 2884,-817.5 2804,-817.5"/>
-<text text-anchor="middle" x="2844" y="-824.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/tir/expr.h</text>
+<polygon fill="white" stroke="black" points="651,-873.5 651,-892.5 731,-892.5 731,-873.5 651,-873.5"/>
+<text text-anchor="middle" x="691" y="-880.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/tir/expr.h</text>
 </a>
 </g>
 </g>
 <!-- Node34&#45;&gt;Node35 -->
-<g id="edge129" class="edge"><title>Node34&#45;&gt;Node35</title>
-<path fill="none" stroke="midnightblue" d="M2678.22,-873.444C2711.93,-864.201 2765.8,-849.436 2802.93,-839.257"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2804.21,-842.536 2812.93,-836.516 2802.36,-835.785 2804.21,-842.536"/>
+<g id="edge130" class="edge"><title>Node34&#45;&gt;Node35</title>
+<path fill="none" stroke="midnightblue" d="M2000.47,-937.135C1795.19,-933.35 997.299,-917.343 747,-893 745.248,-892.83 743.468,-892.639 741.672,-892.432"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="741.664,-888.902 731.298,-891.093 740.768,-895.845 741.664,-888.902"/>
 </g>
 <!-- Node35&#45;&gt;Node4 -->
-<g id="edge130" class="edge"><title>Node35&#45;&gt;Node4</title>
-<path fill="none" stroke="midnightblue" d="M2820.91,-817.425C2746.88,-789.932 2515.3,-703.932 2429.45,-672.05"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2430.55,-668.724 2419.95,-668.524 2428.11,-675.287 2430.55,-668.724"/>
+<g id="edge131" class="edge"><title>Node35&#45;&gt;Node4</title>
+<path fill="none" stroke="midnightblue" d="M717.862,-873.425C804.422,-845.792 1076.12,-759.057 1174.76,-727.568"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1175.84,-730.899 1184.3,-724.524 1173.71,-724.231 1175.84,-730.899"/>
 </g>
 <!-- Node35&#45;&gt;Node6 -->
-<g id="edge133" class="edge"><title>Node35&#45;&gt;Node6</title>
-<path fill="none" stroke="midnightblue" d="M2839.14,-817.473C2829.32,-799.386 2808,-755.585 2808,-716 2808,-716 2808,-716 2808,-658 2808,-558.536 2468.52,-512.393 2328.4,-497.584"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2328.57,-494.083 2318.26,-496.53 2327.85,-501.045 2328.57,-494.083"/>
+<g id="edge134" class="edge"><title>Node35&#45;&gt;Node6</title>
+<path fill="none" stroke="midnightblue" d="M731.084,-880.738C949.159,-873.613 1983,-836.44 1983,-772 1983,-772 1983,-772 1983,-714 1983,-619.241 1663.16,-570.741 1527.59,-554.439"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1527.72,-550.93 1517.38,-553.231 1526.9,-557.882 1527.72,-550.93"/>
 </g>
 <!-- Node35&#45;&gt;Node7 -->
-<g id="edge131" class="edge"><title>Node35&#45;&gt;Node7</title>
-<path fill="none" stroke="midnightblue" d="M2803.87,-823.174C2623.24,-810.293 1892.73,-756.415 1856,-725 1771.21,-652.482 1773.75,-590.12 1797,-481 1798.68,-473.103 1851.58,-360.372 1853,-358 1858.98,-348.028 1866.73,-337.625 1873.32,-329.336"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1876.04,-331.534 1879.64,-321.57 1870.61,-327.114 1876.04,-331.534"/>
+<g id="edge132" class="edge"><title>Node35&#45;&gt;Node7</title>
+<path fill="none" stroke="midnightblue" d="M731.338,-882.056C957.044,-882.205 2055.28,-880.677 2111,-837 2239.79,-736.042 2215.92,-644.28 2205,-481 2203,-451.022 2211.4,-439.797 2196,-414 2187.99,-400.584 2174.27,-390.004 2161.78,-382.483"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2163.42,-379.394 2152.99,-377.55 2160,-385.498 2163.42,-379.394"/>
 </g>
 <!-- Node35&#45;&gt;Node12 -->
-<g id="edge134" class="edge"><title>Node35&#45;&gt;Node12</title>
-<path fill="none" stroke="midnightblue" d="M2884.41,-825.489C3020.86,-823.463 3458.94,-814.393 3512,-781 3734.14,-641.204 3809.36,-487.293 3737,-235 3709.49,-139.065 3681.8,-101.081 3588,-67 3519.52,-42.1189 3034.91,-24.6009 2847.62,-18.691"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2847.7,-15.1917 2837.59,-18.3767 2847.48,-22.1883 2847.7,-15.1917"/>
+<g id="edge135" class="edge"><title>Node35&#45;&gt;Node12</title>
+<path fill="none" stroke="midnightblue" d="M650.634,-880.144C544.547,-874.174 266,-850.979 266,-772 266,-772 266,-772 266,-658 266,-582.314 304,-567.686 304,-492 304,-492 304,-492 304,-428.5 304,-187.918 1174.29,-58.7669 1436.27,-25.34"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1436.95,-28.7807 1446.44,-24.0515 1436.07,-21.8362 1436.95,-28.7807"/>
 </g>
 <!-- Node35&#45;&gt;Node13 -->
-<g id="edge149" class="edge"><title>Node35&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M2884.04,-825.132C3004.33,-821.646 3358.86,-804.678 3435,-725 3495.64,-661.542 3536,-415.326 3536,-313 3536,-313 3536,-313 3536,-249.5 3536,-100.583 3398.31,-107.471 3255,-67 3117.95,-28.2973 2069.54,-18.2826 1872.03,-16.7319"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1872.03,-13.2319 1862,-16.6548 1871.98,-20.2317 1872.03,-13.2319"/>
+<g id="edge150" class="edge"><title>Node35&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M650.927,-882.139C555.687,-881.806 318.764,-876.873 252,-837 200.121,-806.017 190,-776.426 190,-716 190,-716 190,-716 190,-132 190,-101.205 189.705,-85.9231 214,-67 248.732,-39.9481 553.761,-22.9253 654.713,-17.9925"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="655.092,-21.4783 664.912,-17.5015 654.755,-14.4864 655.092,-21.4783"/>
 </g>
 <!-- Node35&#45;&gt;Node15 -->
-<g id="edge151" class="edge"><title>Node35&#45;&gt;Node15</title>
-<path fill="none" stroke="midnightblue" d="M2803.97,-826.375C2633.88,-827.628 1954.43,-829.173 1399,-781 1336.38,-775.569 1321.29,-769.37 1259,-761 1132.03,-743.94 1098.68,-749.865 973,-725 737.233,-678.355 680.434,-650.429 459,-557 230.298,-460.505 38,-438.225 38,-190 38,-190 38,-190 38,-132 38,-92.6849 62.4533,-83.7962 98,-67 175.776,-30.25 797.026,-19.068 944.622,-16.9177"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="944.847,-20.415 954.796,-16.773 944.748,-13.4157 944.847,-20.415"/>
+<g id="edge152" class="edge"><title>Node35&#45;&gt;Node15</title>
+<path fill="none" stroke="midnightblue" d="M731.163,-881.835C979.719,-880.703 2310.92,-873.022 2717,-837 2877.85,-822.731 2919.44,-820.589 3076,-781 3303.85,-723.385 3385.43,-719.727 3555,-557 3627.93,-487.009 3683,-470.084 3683,-369 3683,-369 3683,-369 3683,-132 3683,-96.6087 3666.82,-86.0661 3637,-67 3597.17,-41.5332 3458.1,-25.4445 3395.25,-19.314"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3395.43,-15.8152 3385.15,-18.3501 3394.77,-22.7836 3395.43,-15.8152"/>
 </g>
 <!-- Node35&#45;&gt;Node16 -->
-<g id="edge146" class="edge"><title>Node35&#45;&gt;Node16</title>
-<path fill="none" stroke="midnightblue" d="M2884.31,-821.916C2920.09,-816.795 2972.65,-805.496 3012,-781 3130.09,-707.485 3224.9,-675.068 3208,-537 3188.07,-374.261 3241.33,-288.165 3119,-179 3107.79,-168.994 2904.94,-147.236 2816.1,-138.134"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2816.41,-134.647 2806.11,-137.113 2815.7,-141.611 2816.41,-134.647"/>
+<g id="edge147" class="edge"><title>Node35&#45;&gt;Node16</title>
+<path fill="none" stroke="midnightblue" d="M650.843,-878.753C594.3,-873.387 495.184,-860.813 469,-837 449.758,-819.499 458.651,-806.146 452,-781 410.09,-622.555 349.707,-563.888 416,-414 459.401,-315.87 482.163,-283.253 578,-235 667.775,-189.8 1386.93,-146.524 1567.82,-136.279"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1568.2,-139.763 1577.98,-135.706 1567.8,-132.774 1568.2,-139.763"/>
 </g>
 <!-- Node35&#45;&gt;Node18 -->
-<g id="edge150" class="edge"><title>Node35&#45;&gt;Node18</title>
-<path fill="none" stroke="midnightblue" d="M2803.93,-826.199C2588.88,-827.069 1576.38,-828.593 1446,-781 1143.85,-670.703 987.271,-246.346 955.925,-152.46"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="959.164,-151.104 952.722,-142.692 952.512,-153.285 959.164,-151.104"/>
+<g id="edge151" class="edge"><title>Node35&#45;&gt;Node18</title>
+<path fill="none" stroke="midnightblue" d="M731.007,-882.282C950.374,-883.628 2002.24,-887.391 2137,-837 2184.89,-819.091 2823.84,-298.934 2850,-255 2869.17,-222.802 2872.87,-177.984 2873.3,-152.913"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2876.8,-152.617 2873.31,-142.616 2869.8,-152.614 2876.8,-152.617"/>
 </g>
 <!-- Node35&#45;&gt;Node21 -->
-<g id="edge135" class="edge"><title>Node35&#45;&gt;Node21</title>
-<path fill="none" stroke="midnightblue" d="M2884.18,-822.624C3015.26,-811.08 3422,-770.841 3422,-716 3422,-716 3422,-716 3422,-602 3422,-495.003 3278.11,-242.513 3192,-179 3179.19,-169.549 3094.33,-154.251 3032.84,-144.159"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="3033.24,-140.678 3022.81,-142.524 3032.11,-147.586 3033.24,-140.678"/>
+<g id="edge136" class="edge"><title>Node35&#45;&gt;Node21</title>
+<path fill="none" stroke="midnightblue" d="M650.86,-879.629C581.865,-874.919 444.306,-862.555 404,-837 353.195,-804.788 345.668,-782.503 328,-725 263.884,-516.326 432.412,-469.559 558,-291 577.026,-263.949 577.302,-249.552 607,-235 667.671,-205.272 1146.59,-203.527 1214,-199 1227.5,-198.093 1241.86,-197.069 1255.76,-196.045"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1256.08,-199.531 1265.79,-195.301 1255.56,-192.551 1256.08,-199.531"/>
 </g>
 <!-- Node35&#45;&gt;Node25 -->
-<g id="edge148" class="edge"><title>Node35&#45;&gt;Node25</title>
-<path fill="none" stroke="midnightblue" d="M2884.32,-821.999C2915.89,-817.016 2959.29,-805.87 2989,-781 3010.49,-763.009 3007.02,-750.789 3018,-725 3082.55,-573.365 3071.98,-522.8 3071,-358 3070.68,-303.944 3069.77,-240.11 3069.3,-208.984"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="3072.8,-208.689 3069.14,-198.744 3065.8,-208.798 3072.8,-208.689"/>
+<g id="edge149" class="edge"><title>Node35&#45;&gt;Node25</title>
+<path fill="none" stroke="midnightblue" d="M650.869,-880.818C583.75,-876.47 456,-857.026 456,-772 456,-772 456,-772 456,-658 456,-656.55 639.642,-341.07 684.976,-263.214"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="688.001,-264.975 690.008,-254.572 681.951,-261.452 688.001,-264.975"/>
 </g>
 <!-- Node35&#45;&gt;Node29 -->
-<g id="edge132" class="edge"><title>Node35&#45;&gt;Node29</title>
-<path fill="none" stroke="midnightblue" d="M2803.95,-825.94C2689.2,-825.283 2349.59,-819.927 2071,-781 1844.6,-749.366 1711.96,-847.375 1569,-669 1481.21,-559.471 1522.5,-454.957 1624,-358 1705.6,-280.052 1830.24,-225.264 1889.07,-202.197"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1890.53,-205.383 1898.6,-198.513 1888.01,-198.854 1890.53,-205.383"/>
+<g id="edge133" class="edge"><title>Node35&#45;&gt;Node29</title>
+<path fill="none" stroke="midnightblue" d="M731.037,-881.267C914.732,-877.772 1669.78,-861.936 1713,-837 1922.63,-716.046 1938.17,-610.321 2006,-378 2030.6,-293.737 1914.22,-229.101 1856.65,-202.826"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1857.85,-199.528 1847.29,-198.655 1855,-205.922 1857.85,-199.528"/>
 </g>
 <!-- Node35&#45;&gt;Node32 -->
-<g id="edge147" class="edge"><title>Node35&#45;&gt;Node32</title>
-<path fill="none" stroke="midnightblue" d="M2882.73,-817.426C2930.08,-804.392 3004,-774.654 3004,-716 3004,-716 3004,-716 3004,-602 3004,-498.7 2886.67,-418.772 2834.03,-388.112"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2835.71,-385.04 2825.29,-383.128 2832.24,-391.121 2835.71,-385.04"/>
+<g id="edge148" class="edge"><title>Node35&#45;&gt;Node32</title>
+<path fill="none" stroke="midnightblue" d="M675.07,-873.436C631.05,-848.622 513.715,-774.046 556,-705 614.615,-609.289 951.388,-481.706 1060.14,-442.478"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1061.42,-445.74 1069.65,-439.066 1059.05,-439.151 1061.42,-445.74"/>
 </g>
 <!-- Node36 -->
 <g id="node36" class="node"><title>Node36</title>
 <g id="a_node36"><a xlink:href="buffer_8h.html" target="_top" xlink:title="Symbolic n&#45;dimensional array, to represent a memory buffer. ">
-<polygon fill="white" stroke="black" points="2836.5,-761.5 2836.5,-780.5 2923.5,-780.5 2923.5,-761.5 2836.5,-761.5"/>
-<text text-anchor="middle" x="2880" y="-768.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/tir/buffer.h</text>
+<polygon fill="white" stroke="black" points="645.5,-817.5 645.5,-836.5 732.5,-836.5 732.5,-817.5 645.5,-817.5"/>
+<text text-anchor="middle" x="689" y="-824.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/tir/buffer.h</text>
 </a>
 </g>
 </g>
 <!-- Node35&#45;&gt;Node36 -->
-<g id="edge136" class="edge"><title>Node35&#45;&gt;Node36</title>
-<path fill="none" stroke="midnightblue" d="M2849.94,-817.083C2854.98,-809.534 2862.34,-798.495 2868.51,-789.23"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2871.53,-791.013 2874.17,-780.751 2865.71,-787.13 2871.53,-791.013"/>
+<g id="edge137" class="edge"><title>Node35&#45;&gt;Node36</title>
+<path fill="none" stroke="midnightblue" d="M690.67,-873.083C690.408,-866.006 690.032,-855.861 689.703,-846.986"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="693.192,-846.615 689.324,-836.751 686.197,-846.874 693.192,-846.615"/>
 </g>
 <!-- Node37 -->
 <g id="node37" class="node"><title>Node37</title>
 <g id="a_node37"><a xlink:href="var_8h.html" target="_top" xlink:title="Variables in the TIR. ">
-<polygon fill="white" stroke="black" points="2902,-705.5 2902,-724.5 2976,-724.5 2976,-705.5 2902,-705.5"/>
-<text text-anchor="middle" x="2939" y="-712.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/tir/var.h</text>
+<polygon fill="white" stroke="black" points="600,-761.5 600,-780.5 674,-780.5 674,-761.5 600,-761.5"/>
+<text text-anchor="middle" x="637" y="-768.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/tir/var.h</text>
 </a>
 </g>
 </g>
 <!-- Node35&#45;&gt;Node37 -->
-<g id="edge145" class="edge"><title>Node35&#45;&gt;Node37</title>
-<path fill="none" stroke="midnightblue" d="M2877.78,-817.383C2896.86,-810.694 2919.35,-799.377 2932,-781 2941.15,-767.705 2942.29,-749.127 2941.5,-735.235"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2944.95,-734.511 2940.52,-724.888 2937.98,-735.175 2944.95,-734.511"/>
+<g id="edge146" class="edge"><title>Node35&#45;&gt;Node37</title>
+<path fill="none" stroke="midnightblue" d="M673.802,-873.386C660.749,-865.794 643.766,-853.389 636,-837 629.128,-822.498 630.198,-804.054 632.508,-790.52"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="635.982,-791.004 634.593,-780.501 629.129,-789.578 635.982,-791.004"/>
 </g>
 <!-- Node36&#45;&gt;Node4 -->
-<g id="edge137" class="edge"><title>Node36&#45;&gt;Node4</title>
-<path fill="none" stroke="midnightblue" d="M2842.5,-761.46C2755.57,-741.662 2540.4,-692.658 2444.47,-670.812"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2445,-667.343 2434.47,-668.535 2443.45,-674.168 2445,-667.343"/>
+<g id="edge138" class="edge"><title>Node36&#45;&gt;Node4</title>
+<path fill="none" stroke="midnightblue" d="M732.867,-822.576C788.032,-817.429 885.187,-805.557 965,-781 985.145,-774.802 988.038,-767.765 1008,-761 1036.07,-751.488 1111.63,-735.667 1162.23,-725.542"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1163.18,-728.921 1172.3,-723.535 1161.81,-722.056 1163.18,-728.921"/>
 </g>
 <!-- Node36&#45;&gt;Node7 -->
-<g id="edge138" class="edge"><title>Node36&#45;&gt;Node7</title>
-<path fill="none" stroke="midnightblue" d="M2836.45,-769.284C2649.21,-766.047 1920.38,-751.765 1880,-725 1853.32,-707.316 1849,-692.006 1849,-660 1849,-660 1849,-660 1849,-434 1849,-395.812 1866.44,-353.83 1877.69,-330.722"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1880.96,-332.02 1882.34,-321.515 1874.71,-328.865 1880.96,-332.02"/>
+<g id="edge139" class="edge"><title>Node36&#45;&gt;Node7</title>
+<path fill="none" stroke="midnightblue" d="M732.712,-820.031C793.102,-811.707 905.432,-795.95 1001,-781 1535.67,-697.362 1863.99,-935.941 2186,-501 2213.11,-464.381 2173.89,-411.762 2150.05,-385.386"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2152.33,-382.709 2142.95,-377.797 2147.22,-387.491 2152.33,-382.709"/>
 </g>
 <!-- Node36&#45;&gt;Node13 -->
-<g id="edge144" class="edge"><title>Node36&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M2923.59,-766.964C3058.48,-756.39 3460,-715.545 3460,-604 3460,-604 3460,-604 3460,-249.5 3460,-203.089 3281.69,-87.8273 3219,-67 3153.07,-45.0974 2074.25,-21.4216 1872.4,-17.1718"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1872.23,-13.6676 1862.16,-16.9571 1872.08,-20.666 1872.23,-13.6676"/>
+<g id="edge145" class="edge"><title>Node36&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M656.16,-817.471C556.774,-789.477 266,-690.604 266,-492 266,-492 266,-492 266,-132 266,-97.8442 278.246,-85.4341 307,-67 364.373,-30.2184 573.158,-19.9672 654.37,-17.3381"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="654.624,-20.8321 664.512,-17.0284 654.41,-13.8354 654.624,-20.8321"/>
 </g>
 <!-- Node36&#45;&gt;Node37 -->
-<g id="edge139" class="edge"><title>Node36&#45;&gt;Node37</title>
-<path fill="none" stroke="midnightblue" d="M2889.48,-761.324C2898.2,-753.341 2911.34,-741.313 2921.9,-731.653"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2924.33,-734.167 2929.35,-724.834 2919.61,-729.004 2924.33,-734.167"/>
+<g id="edge140" class="edge"><title>Node36&#45;&gt;Node37</title>
+<path fill="none" stroke="midnightblue" d="M680.413,-817.083C672.841,-809.22 661.622,-797.569 652.486,-788.081"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="654.885,-785.527 645.427,-780.751 649.842,-790.382 654.885,-785.527"/>
 </g>
 <!-- Node37&#45;&gt;Node4 -->
-<g id="edge140" class="edge"><title>Node37&#45;&gt;Node4</title>
-<path fill="none" stroke="midnightblue" d="M2901.91,-708.581C2892.8,-707.327 2883.05,-706.053 2874,-705 2717.79,-686.826 2532.14,-670.952 2445.74,-663.893"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2445.95,-660.399 2435.7,-663.076 2445.39,-667.376 2445.95,-660.399"/>
+<g id="edge141" class="edge"><title>Node37&#45;&gt;Node4</title>
+<path fill="none" stroke="midnightblue" d="M674.219,-766.499C774.437,-757.07 1049.96,-731.15 1162.33,-720.579"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1162.73,-724.056 1172.36,-719.635 1162.08,-717.087 1162.73,-724.056"/>
 </g>
 <!-- Node37&#45;&gt;Node6 -->
-<g id="edge141" class="edge"><title>Node37&#45;&gt;Node6</title>
-<path fill="none" stroke="midnightblue" d="M2932.63,-705.174C2909.47,-674.009 2825.92,-569.264 2727,-537 2654.41,-513.323 2434.9,-499.916 2328.33,-494.643"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2328.23,-491.134 2318.07,-494.143 2327.89,-498.125 2328.23,-491.134"/>
+<g id="edge142" class="edge"><title>Node37&#45;&gt;Node6</title>
+<path fill="none" stroke="midnightblue" d="M669.067,-761.433C798.253,-726.922 1280.57,-598.073 1426.14,-559.183"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1427.27,-562.504 1436.03,-556.541 1425.46,-555.741 1427.27,-562.504"/>
 </g>
 <!-- Node37&#45;&gt;Node13 -->
-<g id="edge143" class="edge"><title>Node37&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M2976.21,-710.41C3035.84,-704.125 3149.57,-689.79 3184,-669 3319.44,-587.215 3384,-532.719 3384,-374.5 3384,-374.5 3384,-374.5 3384,-311 3384,-251.372 3397.26,-225.552 3360,-179 3334.36,-146.971 3309.71,-164.869 3275,-143 3231.84,-115.811 3236.36,-85.9477 3189,-67 3125.97,-41.7819 2071.65,-20.8388 1872.36,-17.0953"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1872.3,-13.5938 1862.24,-16.9063 1872.17,-20.5926 1872.3,-13.5938"/>
+<g id="edge144" class="edge"><title>Node37&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M603.261,-761.378C541.83,-743.423 418,-695.84 418,-604 418,-604 418,-604 418,-546 418,-403.364 304,-388.636 304,-246 304,-246 304,-246 304,-132 304,-101.205 304.024,-86.326 328,-67 377.961,-26.729 576.098,-18.5719 654.663,-16.9196"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="654.995,-20.4141 664.927,-16.727 654.863,-13.4154 654.995,-20.4141"/>
 </g>
 <!-- Node37&#45;&gt;Node21 -->
-<g id="edge142" class="edge"><title>Node37&#45;&gt;Node21</title>
-<path fill="none" stroke="midnightblue" d="M2976.08,-714.241C3046.8,-712.554 3194,-697.777 3194,-604 3194,-604 3194,-604 3194,-372.5 3194,-285.841 3228.27,-243.141 3170,-179 3160.5,-168.548 3086.49,-153.962 3030.78,-144.262"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="3031.06,-140.758 3020.61,-142.509 3029.87,-147.657 3031.06,-140.758"/>
+<g id="edge143" class="edge"><title>Node37&#45;&gt;Node21</title>
+<path fill="none" stroke="midnightblue" d="M627.775,-761.406C599.77,-734.567 516.063,-648.618 490,-557 468.527,-481.519 567.074,-274.3 635,-235 690.792,-202.72 1149.69,-203.41 1214,-199 1227.5,-198.074 1241.85,-197.04 1255.76,-196.013"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1256.08,-199.499 1265.79,-195.267 1255.56,-192.518 1256.08,-199.499"/>
 </g>
 <!-- Node38&#45;&gt;Node2 -->
-<g id="edge155" class="edge"><title>Node38&#45;&gt;Node2</title>
-<path fill="none" stroke="midnightblue" d="M1049.93,-994.642C973.778,-992.95 832,-977.275 832,-884 832,-884 832,-884 832,-826 832,-761.233 915.711,-733.982 972.004,-722.978"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="972.968,-726.36 982.17,-721.108 971.701,-719.475 972.968,-726.36"/>
+<g id="edge156" class="edge"><title>Node38&#45;&gt;Node2</title>
+<path fill="none" stroke="midnightblue" d="M2228.27,-1049.72C2424.03,-1047.96 3161,-1035.02 3161,-940 3161,-940 3161,-940 3161,-882 3161,-831.725 3103.87,-800.101 3064.5,-784.275"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3065.5,-780.912 3054.91,-780.59 3062.99,-787.446 3065.5,-780.912"/>
 </g>
 <!-- Node38&#45;&gt;Node4 -->
-<g id="edge156" class="edge"><title>Node38&#45;&gt;Node4</title>
-<path fill="none" stroke="midnightblue" d="M1144.27,-992.447C1375.02,-984.565 2372,-946.869 2372,-884 2372,-884 2372,-884 2372,-770 2372,-736.995 2382.83,-699.774 2390.24,-678.192"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2393.57,-679.282 2393.63,-668.687 2386.97,-676.93 2393.57,-679.282"/>
+<g id="edge157" class="edge"><title>Node38&#45;&gt;Node4</title>
+<path fill="none" stroke="midnightblue" d="M2173.28,-1041.43C2165.59,-1032.66 2153.7,-1018.44 2145,-1005 2124.37,-973.128 2135.87,-952.432 2106,-929 1970.21,-822.486 1425.92,-744.019 1259.55,-722.147"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1259.97,-718.673 1249.6,-720.848 1259.07,-725.614 1259.97,-718.673"/>
 </g>
 <!-- Node38&#45;&gt;Node13 -->
-<g id="edge211" class="edge"><title>Node38&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M1144.18,-993.272C1495.35,-987.701 3688,-950.268 3688,-884 3688,-884 3688,-884 3688,-249.5 3688,-87.316 3531.51,-96.3449 3372,-67 3219.81,-39.0021 2079.56,-20.1934 1872.28,-16.9873"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1872.11,-13.4845 1862.06,-16.8303 1872,-20.4836 1872.11,-13.4845"/>
+<g id="edge212" class="edge"><title>Node38&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M2134,-1050.12C1819.9,-1050.63 38,-1048.71 38,-940 38,-940 38,-940 38,-132 38,-97.8442 49.8007,-84.7205 79,-67 128.447,-36.9918 536.154,-21.4269 654.755,-17.5048"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="655.117,-20.9949 664.998,-17.1714 654.889,-13.9986 655.117,-20.9949"/>
 </g>
 <!-- Node38&#45;&gt;Node24 -->
-<g id="edge210" class="edge"><title>Node38&#45;&gt;Node24</title>
-<path fill="none" stroke="midnightblue" d="M1049.72,-993.289C930.484,-991.018 620.123,-982.019 523,-949 409.326,-910.354 381.556,-844.682 372,-725 356.336,-528.815 355.448,-432.239 503,-302 584.102,-230.415 924.189,-200.921 1044.01,-192.622"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1044.5,-196.097 1054.24,-191.928 1044.03,-189.113 1044.5,-196.097"/>
+<g id="edge211" class="edge"><title>Node38&#45;&gt;Node24</title>
+<path fill="none" stroke="midnightblue" d="M2228.32,-1050.36C2456.34,-1051.61 3434.06,-1050.94 3517,-949 3614,-829.771 3541,-757.704 3541,-604 3541,-604 3541,-604 3541,-546 3541,-470.485 3000.2,-302.932 2848.91,-257.535"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2849.69,-254.116 2839.11,-254.602 2847.68,-260.822 2849.69,-254.116"/>
 </g>
 <!-- Node38&#45;&gt;Node34 -->
-<g id="edge212" class="edge"><title>Node38&#45;&gt;Node34</title>
-<path fill="none" stroke="midnightblue" d="M1144.09,-993.675C1345.94,-992.077 2139.03,-983.674 2386,-949 2467.77,-937.52 2561.09,-910.889 2611.01,-895.514"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2612.17,-898.816 2620.68,-892.505 2610.1,-892.131 2612.17,-898.816"/>
+<g id="edge213" class="edge"><title>Node38&#45;&gt;Node34</title>
+<path fill="none" stroke="midnightblue" d="M2165.8,-1041.42C2151.35,-1032.97 2129.32,-1019.28 2112,-1005 2093.72,-989.933 2075.13,-970.077 2062.93,-956.293"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2065.43,-953.832 2056.22,-948.593 2060.15,-958.431 2065.43,-953.832"/>
 </g>
 <!-- Node39 -->
 <g id="node39" class="node"><title>Node39</title>
 <g id="a_node39"><a xlink:href="ir_2module_8h.html" target="_top" xlink:title="IRModule that holds the functions and type definitions. ">
-<polygon fill="white" stroke="black" points="1535,-817.5 1535,-836.5 1627,-836.5 1627,-817.5 1535,-817.5"/>
-<text text-anchor="middle" x="1581" y="-824.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/module.h</text>
+<polygon fill="white" stroke="black" points="2437,-873.5 2437,-892.5 2529,-892.5 2529,-873.5 2437,-873.5"/>
+<text text-anchor="middle" x="2483" y="-880.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/module.h</text>
 </a>
 </g>
 </g>
 <!-- Node38&#45;&gt;Node39 -->
-<g id="edge157" class="edge"><title>Node38&#45;&gt;Node39</title>
-<path fill="none" stroke="midnightblue" d="M1122,-985.425C1202.33,-957.876 1453.93,-871.582 1546.43,-839.856"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1547.82,-843.079 1556.15,-836.524 1545.55,-836.458 1547.82,-843.079"/>
+<g id="edge158" class="edge"><title>Node38&#45;&gt;Node39</title>
+<path fill="none" stroke="midnightblue" d="M2228.08,-1047.23C2292.39,-1040.81 2406.79,-1019.53 2469,-949 2480.09,-936.431 2483.1,-917.212 2483.63,-902.928"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2487.13,-902.753 2483.63,-892.753 2480.13,-902.752 2487.13,-902.753"/>
 </g>
 <!-- Node45 -->
 <g id="node45" class="node"><title>Node45</title>
 <g id="a_node45"><a xlink:href="ir_2op_8h.html" target="_top" xlink:title="Primitive operators(builtin intrinsics) and registry for them. ">
-<polygon fill="white" stroke="black" points="532,-929.5 532,-948.5 600,-948.5 600,-929.5 532,-929.5"/>
-<text text-anchor="middle" x="566" y="-936.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/op.h</text>
+<polygon fill="white" stroke="black" points="3241,-985.5 3241,-1004.5 3309,-1004.5 3309,-985.5 3241,-985.5"/>
+<text text-anchor="middle" x="3275" y="-992.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/op.h</text>
 </a>
 </g>
 </g>
 <!-- Node38&#45;&gt;Node45 -->
-<g id="edge189" class="edge"><title>Node38&#45;&gt;Node45</title>
-<path fill="none" stroke="midnightblue" d="M1049.88,-989.208C948.142,-978.862 709.152,-954.558 610.152,-944.49"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="610.483,-941.006 600.18,-943.476 609.774,-947.97 610.483,-941.006"/>
+<g id="edge190" class="edge"><title>Node38&#45;&gt;Node45</title>
+<path fill="none" stroke="midnightblue" d="M2228.12,-1048.61C2370.85,-1044.25 2810.9,-1029.61 3175,-1005 3193.15,-1003.77 3213.11,-1002.05 3230.4,-1000.44"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3231.16,-1003.89 3240.78,-999.459 3230.5,-996.917 3231.16,-1003.89"/>
 </g>
 <!-- Node50 -->
 <g id="node50" class="node"><title>Node50</title>
 <g id="a_node50"><a xlink:href="relay_2type_8h.html" target="_top" xlink:title="Relay typed AST nodes. ">
-<polygon fill="white" stroke="red" points="1070.5,-929.5 1070.5,-948.5 1123.5,-948.5 1123.5,-929.5 1070.5,-929.5"/>
-<text text-anchor="middle" x="1097" y="-936.5" font-family="Helvetica,sans-Serif" font-size="10.00">./type.h</text>
+<polygon fill="white" stroke="red" points="2154.5,-985.5 2154.5,-1004.5 2207.5,-1004.5 2207.5,-985.5 2154.5,-985.5"/>
+<text text-anchor="middle" x="2181" y="-992.5" font-family="Helvetica,sans-Serif" font-size="10.00">./type.h</text>
 </a>
 </g>
 </g>
 <!-- Node38&#45;&gt;Node50 -->
-<g id="edge213" class="edge"><title>Node38&#45;&gt;Node50</title>
-<path fill="none" stroke="midnightblue" d="M1097,-985.083C1097,-978.006 1097,-967.861 1097,-958.986"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1100.5,-958.751 1097,-948.751 1093.5,-958.751 1100.5,-958.751"/>
+<g id="edge214" class="edge"><title>Node38&#45;&gt;Node50</title>
+<path fill="none" stroke="midnightblue" d="M2181,-1041.08C2181,-1034.01 2181,-1023.86 2181,-1014.99"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2184.5,-1014.75 2181,-1004.75 2177.5,-1014.75 2184.5,-1014.75"/>
 </g>
 <!-- Node39&#45;&gt;Node4 -->
-<g id="edge165" class="edge"><title>Node39&#45;&gt;Node4</title>
-<path fill="none" stroke="midnightblue" d="M1627.09,-822.187C1695.55,-816.076 1828,-802.539 1939,-781 2097.78,-750.188 2282.91,-695.305 2360.19,-671.52"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2361.41,-674.805 2369.93,-668.51 2359.35,-668.117 2361.41,-674.805"/>
+<g id="edge166" class="edge"><title>Node39&#45;&gt;Node4</title>
+<path fill="none" stroke="midnightblue" d="M2455.48,-873.47C2369.92,-847.006 2111.95,-767.669 2073,-761 1914.89,-733.931 1418.41,-720.666 1260.05,-717.047"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1259.76,-713.54 1249.68,-716.813 1259.6,-720.538 1259.76,-713.54"/>
 </g>
 <!-- Node39&#45;&gt;Node7 -->
-<g id="edge173" class="edge"><title>Node39&#45;&gt;Node7</title>
-<path fill="none" stroke="midnightblue" d="M1582.34,-817.372C1589.59,-772.679 1627.38,-564.201 1721,-425 1733.57,-406.318 1778.33,-370.584 1797,-358 1815.82,-345.313 1838.72,-333.912 1856.71,-325.76"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1858.42,-328.83 1866.15,-321.577 1855.59,-322.431 1858.42,-328.83"/>
+<g id="edge174" class="edge"><title>Node39&#45;&gt;Node7</title>
+<path fill="none" stroke="midnightblue" d="M2483.27,-873.392C2484.19,-832.925 2484.18,-658.479 2421,-537 2385.47,-468.689 2365.52,-452.783 2299,-414 2270.08,-397.137 2234.62,-386.399 2204,-379.661"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2204.33,-376.153 2193.82,-377.527 2202.89,-383.004 2204.33,-376.153"/>
 </g>
 <!-- Node39&#45;&gt;Node13 -->
-<g id="edge185" class="edge"><title>Node39&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M1627.3,-824.41C1905.78,-813.632 3346,-740.211 3346,-374.5 3346,-374.5 3346,-374.5 3346,-249.5 3346,-134.779 3228.58,-162.84 3121,-123 3009.74,-81.7986 2977.63,-82.4283 2860,-67 2473.53,-16.3117 1998.61,-15.4394 1872.17,-16.2042"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1871.99,-12.7052 1862.01,-16.274 1872.03,-19.7051 1871.99,-12.7052"/>
+<g id="edge186" class="edge"><title>Node39&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M2436.67,-881.602C2168.28,-879.2 818.482,-865.763 636,-837 449.167,-807.552 228,-905.14 228,-716 228,-716 228,-716 228,-132 228,-95.5528 246.352,-85.0775 278,-67 342.269,-30.2893 569.811,-19.9318 654.729,-17.3091"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="654.925,-20.805 664.818,-17.0136 654.719,-13.808 654.925,-20.805"/>
 </g>
 <!-- Node39&#45;&gt;Node18 -->
-<g id="edge186" class="edge"><title>Node39&#45;&gt;Node18</title>
-<path fill="none" stroke="midnightblue" d="M1534.78,-824.106C1424.73,-819.26 1149.77,-805.063 1061,-781 1004.97,-765.812 985.035,-765.036 943,-725 881.814,-666.724 873.996,-639.097 854,-557 815.838,-400.32 908.694,-209.781 940.079,-151.661"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="943.277,-153.11 945.026,-142.661 937.142,-149.739 943.277,-153.11"/>
+<g id="edge187" class="edge"><title>Node39&#45;&gt;Node18</title>
+<path fill="none" stroke="midnightblue" d="M2529.19,-878.524C2589.16,-873.119 2689.42,-860.759 2717,-837 2820.89,-747.509 2898.89,-390.161 2922,-255 2928.71,-215.753 2903.13,-173.389 2886.43,-150.669"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2889.19,-148.527 2880.34,-142.713 2883.63,-152.781 2889.19,-148.527"/>
 </g>
 <!-- Node39&#45;&gt;Node19 -->
-<g id="edge188" class="edge"><title>Node39&#45;&gt;Node19</title>
-<path fill="none" stroke="midnightblue" d="M1534.78,-825.974C1413.56,-825.356 1088.97,-819.998 990,-781 952.932,-766.393 947.563,-753.768 920,-725 756.334,-554.181 756.311,-233.702 759.099,-152.88"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="762.602,-152.874 759.511,-142.74 755.608,-152.59 762.602,-152.874"/>
+<g id="edge189" class="edge"><title>Node39&#45;&gt;Node19</title>
+<path fill="none" stroke="midnightblue" d="M2529.13,-875.545C2587.6,-867.24 2691.44,-852.08 2780,-837 3019.7,-796.18 3275,-847.154 3275,-604 3275,-604 3275,-604 3275,-490 3275,-343.681 3247.93,-300.899 3167,-179 3159.31,-167.411 3147.95,-156.865 3138,-148.9"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3139.82,-145.888 3129.75,-142.603 3135.58,-151.452 3139.82,-145.888"/>
 </g>
 <!-- Node39&#45;&gt;Node33 -->
-<g id="edge172" class="edge"><title>Node39&#45;&gt;Node33</title>
-<path fill="none" stroke="midnightblue" d="M1627.1,-820.764C1702.43,-810.77 1855.03,-784.442 1970,-725 2017.42,-700.485 2019.33,-679.701 2063,-649 2079.2,-637.609 2098.36,-626.144 2113.26,-617.637"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2115.37,-620.466 2122.36,-612.505 2111.93,-614.368 2115.37,-620.466"/>
+<g id="edge173" class="edge"><title>Node39&#45;&gt;Node33</title>
+<path fill="none" stroke="midnightblue" d="M2436.81,-880.029C2181.1,-869 947.983,-814.249 875,-781 824.316,-757.909 782.078,-704.917 762.585,-677.361"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="765.218,-675.013 756.662,-668.765 759.454,-678.985 765.218,-675.013"/>
 </g>
 <!-- Node40 -->
 <g id="node40" class="node"><title>Node40</title>
 <g id="a_node40"><a xlink:href="ir_2adt_8h.html" target="_top" xlink:title="Algebraic data type definitions. ">
-<polygon fill="white" stroke="black" points="1889,-705.5 1889,-724.5 1961,-724.5 1961,-705.5 1889,-705.5"/>
-<text text-anchor="middle" x="1925" y="-712.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/adt.h</text>
+<polygon fill="white" stroke="black" points="884,-761.5 884,-780.5 956,-780.5 956,-761.5 884,-761.5"/>
+<text text-anchor="middle" x="920" y="-768.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/adt.h</text>
 </a>
 </g>
 </g>
 <!-- Node39&#45;&gt;Node40 -->
-<g id="edge158" class="edge"><title>Node39&#45;&gt;Node40</title>
-<path fill="none" stroke="midnightblue" d="M1607.71,-817.46C1668.79,-797.926 1818.79,-749.963 1888.41,-727.702"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1889.61,-730.99 1898.07,-724.61 1887.48,-724.322 1889.61,-730.99"/>
+<g id="edge159" class="edge"><title>Node39&#45;&gt;Node40</title>
+<path fill="none" stroke="midnightblue" d="M2436.65,-878.738C2207.24,-862.593 1195.3,-791.375 966.213,-775.252"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="966.432,-771.759 956.211,-774.548 965.941,-778.742 966.432,-771.759"/>
 </g>
 <!-- Node41 -->
 <g id="node41" class="node"><title>Node41</title>
 <g id="a_node41"><a xlink:href="ir_2function_8h.html" target="_top" xlink:title="Function nodes. ">
-<polygon fill="white" stroke="black" points="1455.5,-761.5 1455.5,-780.5 1550.5,-780.5 1550.5,-761.5 1455.5,-761.5"/>
-<text text-anchor="middle" x="1503" y="-768.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/function.h</text>
+<polygon fill="white" stroke="black" points="2613.5,-817.5 2613.5,-836.5 2708.5,-836.5 2708.5,-817.5 2613.5,-817.5"/>
+<text text-anchor="middle" x="2661" y="-824.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/function.h</text>
 </a>
 </g>
 </g>
 <!-- Node39&#45;&gt;Node41 -->
-<g id="edge166" class="edge"><title>Node39&#45;&gt;Node41</title>
-<path fill="none" stroke="midnightblue" d="M1568.47,-817.324C1556.41,-808.979 1537.97,-796.212 1523.73,-786.352"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1525.72,-783.469 1515.5,-780.655 1521.73,-789.224 1525.72,-783.469"/>
+<g id="edge167" class="edge"><title>Node39&#45;&gt;Node41</title>
+<path fill="none" stroke="midnightblue" d="M2511.2,-873.444C2541.41,-864.28 2589.51,-849.688 2623.03,-839.519"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2624.37,-842.769 2632.93,-836.516 2622.34,-836.07 2624.37,-842.769"/>
 </g>
 <!-- Node42 -->
 <g id="node42" class="node"><title>Node42</title>
 <g id="a_node42"><a xlink:href="source__map_8h.html" target="_top" xlink:title="A map from source names to source code. ">
-<polygon fill="white" stroke="red" points="936,-593.5 936,-612.5 1076,-612.5 1076,-593.5 936,-593.5"/>
-<text text-anchor="middle" x="1006" y="-600.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/parser/source_map.h</text>
+<polygon fill="white" stroke="red" points="2841,-649.5 2841,-668.5 2981,-668.5 2981,-649.5 2841,-649.5"/>
+<text text-anchor="middle" x="2911" y="-656.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/parser/source_map.h</text>
 </a>
 </g>
 </g>
 <!-- Node39&#45;&gt;Node42 -->
-<g id="edge174" class="edge"><title>Node39&#45;&gt;Node42</title>
-<path fill="none" stroke="midnightblue" d="M1534.91,-823.513C1409.2,-815.941 1063.77,-789.615 973,-725 941.734,-702.742 913.19,-682.454 932,-649 940.221,-634.378 955.331,-623.954 969.763,-616.846"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="971.397,-619.949 979.071,-612.644 968.516,-613.569 971.397,-619.949"/>
+<g id="edge175" class="edge"><title>Node39&#45;&gt;Node42</title>
+<path fill="none" stroke="midnightblue" d="M2529.16,-877.931C2595.41,-871.474 2713.6,-857.482 2751,-837 2822.4,-797.895 2878.47,-714.52 2900.67,-677.89"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2903.92,-679.26 2906.01,-668.874 2897.9,-675.689 2903.92,-679.26"/>
 </g>
 <!-- Node44 -->
 <g id="node44" class="node"><title>Node44</title>
-<polygon fill="white" stroke="#bfbfbf" points="1164,-761.5 1164,-780.5 1250,-780.5 1250,-761.5 1164,-761.5"/>
-<text text-anchor="middle" x="1207" y="-768.5" font-family="Helvetica,sans-Serif" font-size="10.00">unordered_set</text>
+<polygon fill="white" stroke="#bfbfbf" points="2509,-817.5 2509,-836.5 2595,-836.5 2595,-817.5 2509,-817.5"/>
+<text text-anchor="middle" x="2552" y="-824.5" font-family="Helvetica,sans-Serif" font-size="10.00">unordered_set</text>
 </g>
 <!-- Node39&#45;&gt;Node44 -->
-<g id="edge187" class="edge"><title>Node39&#45;&gt;Node44</title>
-<path fill="none" stroke="midnightblue" d="M1534.77,-819.667C1473.01,-811.155 1360.2,-795.422 1264,-781 1262.75,-780.812 1261.48,-780.621 1260.2,-780.427"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1260.66,-776.957 1250.25,-778.901 1259.6,-783.877 1260.66,-776.957"/>
+<g id="edge188" class="edge"><title>Node39&#45;&gt;Node44</title>
+<path fill="none" stroke="midnightblue" d="M2494.09,-873.324C2504.49,-865.185 2520.26,-852.839 2532.72,-843.087"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2534.99,-845.753 2540.71,-836.834 2530.68,-840.241 2534.99,-845.753"/>
 </g>
 <!-- Node40&#45;&gt;Node4 -->
-<g id="edge159" class="edge"><title>Node40&#45;&gt;Node4</title>
-<path fill="none" stroke="midnightblue" d="M1961.16,-707.556C1966.79,-706.64 1972.55,-705.755 1978,-705 2111.68,-686.488 2270.27,-671.29 2348.45,-664.241"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2348.84,-667.72 2358.49,-663.341 2348.22,-660.748 2348.84,-667.72"/>
+<g id="edge160" class="edge"><title>Node40&#45;&gt;Node4</title>
+<path fill="none" stroke="midnightblue" d="M956.267,-763.27C1008.3,-753.615 1104.24,-735.811 1162.38,-725.023"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1163.23,-728.425 1172.42,-723.159 1161.95,-721.542 1163.23,-728.425"/>
 </g>
 <!-- Node40&#45;&gt;Node6 -->
-<g id="edge162" class="edge"><title>Node40&#45;&gt;Node6</title>
-<path fill="none" stroke="midnightblue" d="M1926.75,-705.367C1929.69,-692.279 1936.59,-667.034 1949,-649 1970.43,-617.866 1980.72,-612.669 2013,-593 2083.53,-550.021 2175.53,-519.034 2227.85,-503.44"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2228.85,-506.795 2237.46,-500.617 2226.87,-500.079 2228.85,-506.795"/>
+<g id="edge163" class="edge"><title>Node40&#45;&gt;Node6</title>
+<path fill="none" stroke="midnightblue" d="M941.146,-761.433C1025.49,-727.267 1338.06,-600.64 1437.45,-560.377"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1438.96,-563.54 1446.92,-556.541 1436.33,-557.052 1438.96,-563.54"/>
 </g>
 <!-- Node40&#45;&gt;Node7 -->
-<g id="edge161" class="edge"><title>Node40&#45;&gt;Node7</title>
-<path fill="none" stroke="midnightblue" d="M1919.72,-705.231C1909.27,-687.043 1887,-643.601 1887,-604 1887,-604 1887,-604 1887,-434 1887,-397.714 1887,-355.307 1887,-331.559"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1890.5,-331.509 1887,-321.509 1883.5,-331.509 1890.5,-331.509"/>
+<g id="edge162" class="edge"><title>Node40&#45;&gt;Node7</title>
+<path fill="none" stroke="midnightblue" d="M956.129,-765.956C1018.08,-758.849 1148.5,-743.143 1258,-725 1561.72,-674.678 1642.24,-666.418 1930,-557 2024.14,-521.204 2080.2,-534.185 2127,-445 2136.31,-427.25 2137.06,-403.938 2136.12,-387.837"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2139.58,-387.23 2135.23,-377.569 2132.61,-387.834 2139.58,-387.23"/>
 </g>
 <!-- Node40&#45;&gt;Node11 -->
-<g id="edge163" class="edge"><title>Node40&#45;&gt;Node11</title>
-<path fill="none" stroke="midnightblue" d="M1931.9,-705.172C1942.64,-691.829 1964.46,-666.256 1987,-649 2028.37,-617.321 2042.44,-614.278 2090,-593 2179.18,-553.102 2447.73,-524.872 2504,-445 2587.7,-326.195 2439.12,-148.089 2389.38,-94.0847"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2391.68,-91.4235 2382.29,-86.5109 2386.57,-96.2052 2391.68,-91.4235"/>
+<g id="edge164" class="edge"><title>Node40&#45;&gt;Node11</title>
+<path fill="none" stroke="midnightblue" d="M918.404,-761.173C915.829,-748.267 910.006,-723.749 900,-705 872.777,-653.989 818,-661.82 818,-604 818,-604 818,-604 818,-546 818,-403.825 1163.88,-170.184 1298,-123 1358.79,-101.614 1537.31,-87.9863 1636.88,-81.8281"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1637.38,-85.3044 1647.15,-81.2024 1636.95,-78.3173 1637.38,-85.3044"/>
 </g>
 <!-- Node40&#45;&gt;Node13 -->
-<g id="edge164" class="edge"><title>Node40&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M1897.43,-705.472C1842.67,-687.168 1719.61,-638.992 1651,-557 1600.12,-496.193 1604.53,-467.122 1591,-389 1574.15,-291.749 1667.65,-287.862 1751,-235 1780.54,-216.263 1801.25,-227.879 1821,-199 1855.72,-148.223 1848.87,-70.2108 1843.41,-34.9415"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1846.86,-34.3561 1841.74,-25.081 1839.96,-35.525 1846.86,-34.3561"/>
+<g id="edge165" class="edge"><title>Node40&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M894.843,-761.39C852.892,-746.375 766.949,-712.779 703,-669 601.028,-599.191 342,-369.578 342,-246 342,-246 342,-246 342,-132 342,-97.6051 355.251,-85.882 384,-67 428.215,-37.9594 585.867,-23.6012 654.586,-18.6094"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="655.092,-22.0825 664.821,-17.8878 654.6,-15.0999 655.092,-22.0825"/>
 </g>
 <!-- Node40&#45;&gt;Node33 -->
-<g id="edge160" class="edge"><title>Node40&#45;&gt;Node33</title>
-<path fill="none" stroke="midnightblue" d="M1936.55,-705.408C1954.71,-692.15 1991.35,-666.42 2025,-649 2050.16,-635.977 2079.98,-624.246 2102.67,-616.029"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2103.99,-619.274 2112.23,-612.619 2101.64,-612.68 2103.99,-619.274"/>
+<g id="edge161" class="edge"><title>Node40&#45;&gt;Node33</title>
+<path fill="none" stroke="midnightblue" d="M916.359,-761.111C910.237,-747.217 896.596,-720.356 877,-705 854.484,-687.355 824.472,-676.273 799.387,-669.523"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="800.241,-666.129 789.687,-667.058 798.517,-672.913 800.241,-666.129"/>
 </g>
 <!-- Node41&#45;&gt;Node2 -->
-<g id="edge167" class="edge"><title>Node41&#45;&gt;Node2</title>
-<path fill="none" stroke="midnightblue" d="M1455.12,-764.625C1363.62,-754.352 1164.77,-732.028 1071.79,-721.59"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1072.12,-718.105 1061.79,-720.468 1071.34,-725.061 1072.12,-718.105"/>
+<g id="edge168" class="edge"><title>Node41&#45;&gt;Node2</title>
+<path fill="none" stroke="midnightblue" d="M2708.6,-818.996C2778.29,-808.742 2907.48,-789.733 2978.33,-779.309"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2978.96,-782.753 2988.34,-777.835 2977.94,-775.828 2978.96,-782.753"/>
 </g>
 <!-- Node41&#45;&gt;Node4 -->
-<g id="edge168" class="edge"><title>Node41&#45;&gt;Node4</title>
-<path fill="none" stroke="midnightblue" d="M1550.83,-766.265C1635.49,-759.459 1817.23,-743.974 1970,-725 2107.45,-707.929 2268.84,-681.654 2348.08,-668.344"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2348.96,-671.744 2358.24,-666.631 2347.8,-664.841 2348.96,-671.744"/>
+<g id="edge169" class="edge"><title>Node41&#45;&gt;Node4</title>
+<path fill="none" stroke="midnightblue" d="M2613.43,-818.365C2610.24,-817.891 2607.08,-817.432 2604,-817 2402.23,-788.663 2352.01,-778.338 2149,-761 1809.8,-732.03 1399.89,-720.381 1260.29,-717.07"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1260.02,-713.563 1249.94,-716.828 1259.85,-720.561 1260.02,-713.563"/>
 </g>
 <!-- Node41&#45;&gt;Node8 -->
-<g id="edge169" class="edge"><title>Node41&#45;&gt;Node8</title>
-<path fill="none" stroke="midnightblue" d="M1497.45,-761.28C1476.72,-727.844 1405,-604.073 1405,-492 1405,-492 1405,-492 1405,-434 1405,-293.372 1527.1,-301.52 1651,-235 1672.77,-223.312 1698.09,-211.399 1717.11,-202.769"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1718.75,-205.87 1726.43,-198.574 1715.87,-199.487 1718.75,-205.87"/>
+<g id="edge170" class="edge"><title>Node41&#45;&gt;Node8</title>
+<path fill="none" stroke="midnightblue" d="M2655.72,-817.231C2645.27,-799.043 2623,-755.601 2623,-716 2623,-716 2623,-716 2623,-658 2623,-551.32 2920.79,-314.813 2850,-235 2830.6,-213.13 2655.87,-199.858 2552.24,-193.842"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2552.31,-190.341 2542.13,-193.265 2551.91,-197.33 2552.31,-190.341"/>
 </g>
 <!-- Node41&#45;&gt;Node13 -->
-<g id="edge170" class="edge"><title>Node41&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M1490.68,-761.449C1453.67,-734.647 1346,-648.077 1346,-548 1346,-548 1346,-548 1346,-490 1346,-297.641 934.24,-336.271 1045,-179 1135.23,-50.8791 1669.05,-22.7175 1807.31,-17.5389"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1807.79,-21.024 1817.66,-17.1672 1807.54,-14.0285 1807.79,-21.024"/>
+<g id="edge171" class="edge"><title>Node41&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M2613.47,-818.002C2610.27,-817.616 2607.09,-817.275 2604,-817 2412.6,-799.976 1058.63,-837.602 875,-781 787.825,-754.129 768.878,-732.101 703,-669 587,-557.89 418,-228.343 418,-190 418,-190 418,-190 418,-132 418,-29.9971 583.331,-16.9414 654.763,-16.0608"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="655.004,-19.5598 664.987,-16.0111 654.97,-12.5599 655.004,-19.5598"/>
 </g>
 <!-- Node41&#45;&gt;Node14 -->
-<g id="edge171" class="edge"><title>Node41&#45;&gt;Node14</title>
-<path fill="none" stroke="midnightblue" d="M1550.68,-769.895C1771.73,-769.202 2698.21,-763.981 2985,-725 3088.82,-710.888 3118.19,-711.407 3214,-669 3498.95,-542.879 3726,-501.614 3726,-190 3726,-190 3726,-190 3726,-132 3726,-43.8515 3413.24,-22.7081 3295.29,-17.8687"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="3295.29,-14.366 3285.16,-17.4737 3295.02,-21.3607 3295.29,-14.366"/>
+<g id="edge172" class="edge"><title>Node41&#45;&gt;Node14</title>
+<path fill="none" stroke="midnightblue" d="M2661,-817.442C2661,-798.936 2661,-753.812 2661,-716 2661,-716 2661,-716 2661,-658 2661,-647.095 2858.78,-331.577 2864,-322 2893.09,-268.583 2908.21,-257.739 2924,-199 2932.78,-166.335 2948.87,-149.615 2928,-123 2879.79,-61.5335 2638.88,-30.8696 2538.27,-20.582"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2538.42,-17.0805 2528.12,-19.5659 2537.73,-24.0456 2538.42,-17.0805"/>
 </g>
 <!-- Node42&#45;&gt;Node5 -->
-<g id="edge175" class="edge"><title>Node42&#45;&gt;Node5</title>
-<path fill="none" stroke="midnightblue" d="M1076.07,-599.322C1323.73,-589.857 2154.42,-558.111 2368.39,-549.934"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2368.83,-553.42 2378.69,-549.54 2368.56,-546.425 2368.83,-553.42"/>
+<g id="edge176" class="edge"><title>Node42&#45;&gt;Node5</title>
+<path fill="none" stroke="midnightblue" d="M2840.77,-656.054C2529.62,-647.433 1278.98,-612.783 1012.22,-605.391"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1012.15,-601.888 1002.05,-605.11 1011.95,-608.885 1012.15,-601.888"/>
 </g>
 <!-- Node42&#45;&gt;Node13 -->
-<g id="edge182" class="edge"><title>Node42&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M1006.26,-593.127C1006.5,-580.575 1006.51,-556.939 1004,-537 993.812,-456.121 985.531,-436.623 964,-358 935.215,-252.891 823.599,-206.188 894,-123 895.519,-121.205 1225.67,-67.2914 1228,-67 1448.85,-39.423 1716.49,-23.24 1807.74,-18.2068"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1808.18,-21.6879 1817.98,-17.6483 1807.8,-14.6983 1808.18,-21.6879"/>
+<g id="edge183" class="edge"><title>Node42&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M2840.69,-656.788C2625.54,-652.327 1957.49,-632.881 1410,-557 1290.76,-540.474 1259.73,-537.436 1145,-501 1085.04,-481.958 1064.62,-483.676 1015,-445 932.972,-381.059 746.501,-105.703 698.46,-33.754"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="701.305,-31.7123 692.848,-25.3306 695.48,-35.5934 701.305,-31.7123"/>
 </g>
 <!-- Node42&#45;&gt;Node15 -->
-<g id="edge183" class="edge"><title>Node42&#45;&gt;Node15</title>
-<path fill="none" stroke="midnightblue" d="M966.6,-593.462C891.82,-577.056 724.324,-539.34 585,-501 582.476,-500.305 226.968,-390.727 225,-389 197.053,-364.467 190,-350.188 190,-313 190,-313 190,-313 190,-132 190,-93.2823 213.234,-84.0411 248,-67 311.209,-36.0168 813.375,-20.7681 944.676,-17.2861"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="944.947,-20.7803 954.852,-17.0203 944.764,-13.7827 944.947,-20.7803"/>
+<g id="edge184" class="edge"><title>Node42&#45;&gt;Node15</title>
+<path fill="none" stroke="midnightblue" d="M2942.88,-649.451C3066.34,-616.069 3508.27,-493.936 3550,-445 3624.53,-357.606 3592,-304.86 3592,-190 3592,-190 3592,-190 3592,-132 3592,-44.7524 3457.5,-23.3922 3395.11,-18.1773"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3395.27,-14.6803 3385.04,-17.4247 3394.75,-21.6608 3395.27,-14.6803"/>
 </g>
 <!-- Node42&#45;&gt;Node19 -->
-<g id="edge184" class="edge"><title>Node42&#45;&gt;Node19</title>
-<path fill="none" stroke="midnightblue" d="M1003.54,-593.259C992.933,-556.361 948.97,-410.259 888,-302 854.1,-241.807 800.77,-179.22 774.97,-150.364"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="777.478,-147.919 768.179,-142.84 772.281,-152.609 777.478,-147.919"/>
+<g id="edge185" class="edge"><title>Node42&#45;&gt;Node19</title>
+<path fill="none" stroke="midnightblue" d="M2920.15,-649.44C2953.59,-617.498 3069.76,-500.789 3117,-378 3145.71,-303.375 3130.99,-278.856 3135,-199 3135.45,-190.122 3136.68,-187.729 3135,-179 3133.21,-169.705 3129.52,-159.889 3125.94,-151.805"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3129.06,-150.209 3121.62,-142.661 3122.73,-153.198 3129.06,-150.209"/>
 </g>
 <!-- Node42&#45;&gt;Node20 -->
-<g id="edge176" class="edge"><title>Node42&#45;&gt;Node20</title>
-<path fill="none" stroke="midnightblue" d="M1012.39,-593.443C1042.19,-554.126 1172.12,-389.009 1316,-302 1342.44,-286.009 1374.56,-274.415 1402.52,-266.401"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1403.52,-269.756 1412.22,-263.714 1401.65,-263.01 1403.52,-269.756"/>
+<g id="edge177" class="edge"><title>Node42&#45;&gt;Node20</title>
+<path fill="none" stroke="midnightblue" d="M2907.08,-649.401C2887.92,-608.461 2799.66,-432.228 2668,-358 2617.98,-329.8 2459.2,-316.371 2367.71,-310.826"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2367.9,-307.332 2357.71,-310.236 2367.48,-314.32 2367.9,-307.332"/>
 </g>
 <!-- Node43 -->
 <g id="node43" class="node"><title>Node43</title>
 <g id="a_node43"><a xlink:href="registry_8h.html" target="_top" xlink:title="This file defines the TVM global function registry. ">
-<polygon fill="white" stroke="black" points="512,-302.5 512,-321.5 634,-321.5 634,-302.5 512,-302.5"/>
-<text text-anchor="middle" x="573" y="-309.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/runtime/registry.h</text>
+<polygon fill="white" stroke="black" points="2986,-358.5 2986,-377.5 3108,-377.5 3108,-358.5 2986,-358.5"/>
+<text text-anchor="middle" x="3047" y="-365.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/runtime/registry.h</text>
 </a>
 </g>
 </g>
 <!-- Node42&#45;&gt;Node43 -->
-<g id="edge177" class="edge"><title>Node42&#45;&gt;Node43</title>
-<path fill="none" stroke="midnightblue" d="M993.041,-593.351C931.787,-552.468 671.064,-378.452 594.328,-327.235"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="596.169,-324.256 585.908,-321.615 592.283,-330.078 596.169,-324.256"/>
+<g id="edge178" class="edge"><title>Node42&#45;&gt;Node43</title>
+<path fill="none" stroke="midnightblue" d="M2910.99,-649.35C2911.21,-636.667 2912.55,-612.422 2919,-593 2946.75,-509.5 3007.28,-422.248 3033.95,-386.171"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3037.05,-387.868 3040.24,-377.764 3031.44,-383.675 3037.05,-387.868"/>
 </g>
 <!-- Node43&#45;&gt;Node13 -->
-<g id="edge179" class="edge"><title>Node43&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M555.164,-302.306C525.154,-286.545 465.538,-250.092 442,-199 421.663,-154.855 413.256,-116.736 480,-67 507.497,-46.5097 1604.18,-21.6395 1807.69,-17.1959"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1807.82,-20.6939 1817.74,-16.9771 1807.67,-13.6956 1807.82,-20.6939"/>
+<g id="edge180" class="edge"><title>Node43&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M2985.92,-366.136C2850.36,-363.666 2514.26,-354.498 2235,-322 2066.39,-302.379 2021.3,-304.722 1859,-255 1800.41,-237.049 1784.37,-232.421 1733,-199 1690.8,-171.543 1694.91,-143.66 1649,-123 1562.31,-83.9889 877.149,-30.726 719.339,-18.8925"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="719.543,-15.398 709.31,-18.1431 719.021,-22.3786 719.543,-15.398"/>
 </g>
 <!-- Node43&#45;&gt;Node15 -->
-<g id="edge180" class="edge"><title>Node43&#45;&gt;Node15</title>
-<path fill="none" stroke="midnightblue" d="M546.397,-302.483C506.938,-288.238 434.305,-255.44 404,-199 381.493,-157.083 362.427,-129.912 442,-67 481.549,-35.7317 836.33,-21.2269 944.701,-17.5"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="945.096,-20.9888 954.972,-17.1539 944.86,-13.9927 945.096,-20.9888"/>
+<g id="edge181" class="edge"><title>Node43&#45;&gt;Node15</title>
+<path fill="none" stroke="midnightblue" d="M3059.12,-358.298C3070.84,-349.648 3088.86,-335.723 3103,-322 3207.66,-220.439 3315.45,-81.0144 3351.45,-33.3015"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3354.46,-35.1175 3357.67,-25.0202 3348.87,-30.9128 3354.46,-35.1175"/>
 </g>
 <!-- Node43&#45;&gt;Node19 -->
-<g id="edge181" class="edge"><title>Node43&#45;&gt;Node19</title>
-<path fill="none" stroke="midnightblue" d="M574.456,-302.212C578.793,-278.78 593.779,-214.2 632,-179 658.119,-154.946 697.86,-143.617 726.124,-138.367"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="727.066,-141.758 736.342,-136.639 725.899,-134.856 727.066,-141.758"/>
+<g id="edge182" class="edge"><title>Node43&#45;&gt;Node19</title>
+<path fill="none" stroke="midnightblue" d="M3054.04,-358.408C3068.22,-340.511 3099.88,-297.365 3112,-255 3121.99,-220.083 3120.77,-177.197 3118.93,-152.994"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3122.39,-152.381 3118.02,-142.729 3115.41,-153 3122.39,-152.381"/>
 </g>
 <!-- Node43&#45;&gt;Node20 -->
-<g id="edge178" class="edge"><title>Node43&#45;&gt;Node20</title>
-<path fill="none" stroke="midnightblue" d="M634.233,-306.934C796.095,-296.185 1234.34,-267.083 1402.27,-255.931"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1402.71,-259.41 1412.46,-255.255 1402.25,-252.425 1402.71,-259.41"/>
+<g id="edge179" class="edge"><title>Node43&#45;&gt;Node20</title>
+<path fill="none" stroke="midnightblue" d="M2985.95,-362.131C2847.64,-351.1 2511.3,-324.273 2367.73,-312.823"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2367.98,-309.331 2357.74,-312.025 2367.43,-316.309 2367.98,-309.331"/>
 </g>
 <!-- Node45&#45;&gt;Node2 -->
-<g id="edge191" class="edge"><title>Node45&#45;&gt;Node2</title>
-<path fill="none" stroke="midnightblue" d="M569.652,-929.326C583.542,-896.759 633.911,-782.538 666,-761 715.022,-728.096 886.613,-719.248 972.396,-716.871"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="972.512,-720.369 982.418,-716.612 972.331,-713.372 972.512,-720.369"/>
+<g id="edge192" class="edge"><title>Node45&#45;&gt;Node2</title>
+<path fill="none" stroke="midnightblue" d="M3272.68,-985.491C3264.48,-957.068 3234.16,-864.525 3175,-817 3147.24,-794.698 3108.49,-783.42 3077.74,-777.732"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3078.09,-774.242 3067.65,-776.009 3076.92,-781.142 3078.09,-774.242"/>
 </g>
 <!-- Node45&#45;&gt;Node4 -->
-<g id="edge192" class="edge"><title>Node45&#45;&gt;Node4</title>
-<path fill="none" stroke="midnightblue" d="M600.253,-936.753C814.1,-928.833 1961.57,-884.377 2110,-837 2222.7,-801.028 2336.3,-711.557 2379.16,-675.462"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2381.65,-677.945 2386.99,-668.797 2377.11,-672.616 2381.65,-677.945"/>
+<g id="edge193" class="edge"><title>Node45&#45;&gt;Node4</title>
+<path fill="none" stroke="midnightblue" d="M3240.84,-994.268C3093.12,-995.134 2508.74,-995.5 2332,-949 2152.47,-901.764 2138.24,-809.348 1959,-761 1826.5,-725.26 1404.39,-717.865 1259.99,-716.37"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1259.71,-712.867 1249.68,-716.268 1259.64,-719.867 1259.71,-712.867"/>
 </g>
 <!-- Node45&#45;&gt;Node13 -->
-<g id="edge207" class="edge"><title>Node45&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M564.453,-929.23C561.967,-916.396 556.387,-891.967 547,-873 510.763,-799.785 482.673,-793.395 438,-725 400.98,-668.323 400.458,-648.711 362,-593 295.434,-496.572 261.203,-485.677 195,-389 173.072,-356.979 152,-351.809 152,-313 152,-313 152,-313 152,-188 152,-154.821 392.668,-74.449 425,-67 563.702,-35.0443 1609.68,-19.5748 1807.7,-16.9175"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1807.81,-20.4165 1817.76,-16.7838 1807.71,-13.4172 1807.81,-20.4165"/>
+<g id="edge208" class="edge"><title>Node45&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M3240.82,-993.691C2985.34,-991.334 1370.56,-975.553 882,-949 709.483,-939.624 114,-1000.77 114,-828 114,-828 114,-828 114,-132 114,-95.5528 132.114,-84.6551 164,-67 248.987,-19.9425 554.581,-16.2313 654.869,-16.3125"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="654.985,-19.8127 664.993,-16.3346 655,-12.8127 654.985,-19.8127"/>
 </g>
 <!-- Node45&#45;&gt;Node15 -->
-<g id="edge208" class="edge"><title>Node45&#45;&gt;Node15</title>
-<path fill="none" stroke="midnightblue" d="M531.933,-937.644C420.812,-935.576 76,-921.617 76,-828 76,-828 76,-828 76,-490 76,-404.388 0,-398.612 0,-313 0,-313 0,-313 0,-132 0,-93.5768 22.4503,-83.8126 57,-67 138.206,-27.4834 792.686,-18.3778 944.591,-16.7889"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="944.68,-20.2883 954.644,-16.6872 944.609,-13.2887 944.68,-20.2883"/>
+<g id="edge209" class="edge"><title>Node45&#45;&gt;Node15</title>
+<path fill="none" stroke="midnightblue" d="M3309.24,-992.827C3354.1,-990.024 3434.09,-980.593 3494,-949 3525.7,-932.287 3528.13,-919.728 3552,-893 3714.98,-710.473 3721,-613.702 3721,-369 3721,-369 3721,-369 3721,-132 3721,-93.8683 3698.22,-85.7126 3665,-67 3619.09,-41.1453 3462.88,-24.9858 3395.43,-19.0603"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3395.66,-15.5673 3385.4,-18.1972 3395.06,-22.5415 3395.66,-15.5673"/>
 </g>
 <!-- Node45&#45;&gt;Node19 -->
-<g id="edge209" class="edge"><title>Node45&#45;&gt;Node19</title>
-<path fill="none" stroke="midnightblue" d="M531.849,-932.363C504.016,-926.388 464.892,-914.704 437,-893 413.457,-874.68 413.122,-863.789 400,-837 330.152,-694.405 335.709,-636.036 370,-481 405.251,-321.622 434.311,-254.558 579,-179 626.749,-154.065 688.875,-142.535 726.528,-137.517"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="726.981,-140.987 736.467,-136.268 726.108,-134.042 726.981,-140.987"/>
+<g id="edge210" class="edge"><title>Node45&#45;&gt;Node19</title>
+<path fill="none" stroke="midnightblue" d="M3309.32,-987.714C3361.43,-977.914 3455.79,-959.074 3468,-949 3655.09,-794.594 3675.93,-635.036 3576,-414 3514.37,-277.679 3469.23,-245.062 3335,-179 3302.97,-163.235 3203.61,-146.785 3150.8,-138.851"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3150.99,-135.341 3140.58,-137.336 3149.96,-142.265 3150.99,-135.341"/>
 </g>
 <!-- Node45&#45;&gt;Node33 -->
-<g id="edge193" class="edge"><title>Node45&#45;&gt;Node33</title>
-<path fill="none" stroke="midnightblue" d="M600.118,-937.317C739.519,-934.039 1279.06,-916.61 1714,-837 1860.16,-810.248 1903.57,-804.651 2029,-725 2062.34,-703.828 2071.95,-698.684 2098,-669 2110.8,-654.415 2122.05,-635.245 2129.33,-621.509"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2132.44,-623.105 2133.89,-612.61 2126.21,-619.909 2132.44,-623.105"/>
+<g id="edge194" class="edge"><title>Node45&#45;&gt;Node33</title>
+<path fill="none" stroke="midnightblue" d="M3240.89,-993.712C2940.5,-991.053 755.074,-969.118 642,-893 589.827,-857.879 560.635,-816.077 591,-761 616.624,-714.523 673.632,-686.482 712.205,-672.119"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="713.83,-675.255 722.068,-668.592 711.474,-668.663 713.83,-675.255"/>
 </g>
 <!-- Node45&#45;&gt;Node43 -->
-<g id="edge206" class="edge"><title>Node45&#45;&gt;Node43</title>
-<path fill="none" stroke="midnightblue" d="M531.968,-935.856C502.973,-932.019 462.931,-921.515 443,-893 437.908,-885.714 442.605,-881.88 443,-873 445.002,-827.978 452,-817.067 452,-772 452,-772 452,-772 452,-546 452,-455.697 524.469,-365.617 557.24,-329.499"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="560.069,-331.595 564.298,-321.88 554.934,-326.837 560.069,-331.595"/>
+<g id="edge207" class="edge"><title>Node45&#45;&gt;Node43</title>
+<path fill="none" stroke="midnightblue" d="M3284.2,-985.373C3305.4,-965.237 3356.56,-915.168 3365,-893 3377.02,-861.435 3374.71,-849.351 3365,-817 3303.28,-611.457 3240.58,-575.285 3099,-414 3089.28,-402.925 3076.84,-392.068 3066.57,-383.793"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3068.58,-380.925 3058.56,-377.502 3064.26,-386.43 3068.58,-380.925"/>
 </g>
 <!-- Node46 -->
 <g id="node46" class="node"><title>Node46</title>
-<polygon fill="white" stroke="#bfbfbf" points="452,-873.5 452,-892.5 538,-892.5 538,-873.5 452,-873.5"/>
-<text text-anchor="middle" x="495" y="-880.5" font-family="Helvetica,sans-Serif" font-size="10.00">dmlc/registry.h</text>
+<polygon fill="white" stroke="#bfbfbf" points="3351,-929.5 3351,-948.5 3437,-948.5 3437,-929.5 3351,-929.5"/>
+<text text-anchor="middle" x="3394" y="-936.5" font-family="Helvetica,sans-Serif" font-size="10.00">dmlc/registry.h</text>
 </g>
 <!-- Node45&#45;&gt;Node46 -->
-<g id="edge190" class="edge"><title>Node45&#45;&gt;Node46</title>
-<path fill="none" stroke="midnightblue" d="M554.593,-929.324C543.891,-921.185 527.659,-908.839 514.837,-899.087"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="516.693,-896.102 506.615,-892.834 512.456,-901.673 516.693,-896.102"/>
+<g id="edge191" class="edge"><title>Node45&#45;&gt;Node46</title>
+<path fill="none" stroke="midnightblue" d="M3293.86,-985.444C3313.27,-976.636 3343.73,-962.812 3365.96,-952.722"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3367.57,-955.836 3375.23,-948.516 3364.68,-949.462 3367.57,-955.836"/>
 </g>
 <!-- Node47 -->
 <g id="node47" class="node"><title>Node47</title>
 <g id="a_node47"><a xlink:href="type__relation_8h.html" target="_top" xlink:title="Type relation and function for type inference(checking). ">
-<polygon fill="white" stroke="red" points="867.5,-873.5 867.5,-892.5 986.5,-892.5 986.5,-873.5 867.5,-873.5"/>
-<text text-anchor="middle" x="927" y="-880.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/type_relation.h</text>
+<polygon fill="white" stroke="red" points="2341.5,-929.5 2341.5,-948.5 2460.5,-948.5 2460.5,-929.5 2341.5,-929.5"/>
+<text text-anchor="middle" x="2401" y="-936.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/type_relation.h</text>
 </a>
 </g>
 </g>
 <!-- Node45&#45;&gt;Node47 -->
-<g id="edge194" class="edge"><title>Node45&#45;&gt;Node47</title>
-<path fill="none" stroke="midnightblue" d="M600.287,-932.871C659.746,-923.977 782.981,-905.543 859.944,-894.031"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="860.648,-897.464 870.02,-892.523 859.612,-890.541 860.648,-897.464"/>
+<g id="edge195" class="edge"><title>Node45&#45;&gt;Node47</title>
+<path fill="none" stroke="midnightblue" d="M3240.74,-991.883C3110.94,-983.864 2647.85,-955.252 2470.65,-944.303"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2470.86,-940.81 2460.66,-943.686 2470.43,-947.796 2470.86,-940.81"/>
 </g>
 <!-- Node49 -->
 <g id="node49" class="node"><title>Node49</title>
 <g id="a_node49"><a xlink:href="attr__registry__map_8h.html" target="_top" xlink:title="Attribute map used in registry. ">
-<polygon fill="white" stroke="black" points="256,-358.5 256,-388.5 380,-388.5 380,-358.5 256,-358.5"/>
-<text text-anchor="start" x="264" y="-376.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/node/attr_registry</text>
-<text text-anchor="middle" x="318" y="-365.5" font-family="Helvetica,sans-Serif" font-size="10.00">_map.h</text>
+<polygon fill="white" stroke="black" points="3417,-414.5 3417,-444.5 3541,-444.5 3541,-414.5 3417,-414.5"/>
+<text text-anchor="start" x="3425" y="-432.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/node/attr_registry</text>
+<text text-anchor="middle" x="3479" y="-421.5" font-family="Helvetica,sans-Serif" font-size="10.00">_map.h</text>
 </a>
 </g>
 </g>
 <!-- Node45&#45;&gt;Node49 -->
-<g id="edge202" class="edge"><title>Node45&#45;&gt;Node49</title>
-<path fill="none" stroke="midnightblue" d="M531.957,-934.403C445.453,-924.322 224,-892.242 224,-828 224,-828 224,-828 224,-490 224,-449.232 259.665,-414.865 287.058,-394.481"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="289.157,-397.284 295.245,-388.613 285.078,-391.594 289.157,-397.284"/>
+<g id="edge203" class="edge"><title>Node45&#45;&gt;Node49</title>
+<path fill="none" stroke="midnightblue" d="M3309.31,-994.43C3347.69,-993.012 3409.26,-984.757 3446,-949 3467.57,-928.008 3465,-914.098 3465,-884 3465,-884 3465,-884 3465,-546 3465,-514.264 3470.27,-477.977 3474.35,-454.68"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3477.82,-455.122 3476.17,-444.658 3470.94,-453.872 3477.82,-455.122"/>
 </g>
 <!-- Node47&#45;&gt;Node2 -->
-<g id="edge195" class="edge"><title>Node47&#45;&gt;Node2</title>
-<path fill="none" stroke="midnightblue" d="M932.108,-873.075C947.191,-846.719 991.738,-768.878 1011.89,-733.661"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1015.17,-734.986 1017.1,-724.568 1009.09,-731.509 1015.17,-734.986"/>
+<g id="edge196" class="edge"><title>Node47&#45;&gt;Node2</title>
+<path fill="none" stroke="midnightblue" d="M2441.81,-929.464C2517.99,-913.277 2686.83,-876.214 2827,-837 2886.42,-820.377 2954.55,-797.529 2994.28,-783.818"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2995.44,-787.119 3003.75,-780.538 2993.15,-780.505 2995.44,-787.119"/>
 </g>
 <!-- Node47&#45;&gt;Node33 -->
-<g id="edge201" class="edge"><title>Node47&#45;&gt;Node33</title>
-<path fill="none" stroke="midnightblue" d="M986.585,-874.765C992.798,-874.118 999.024,-873.515 1005,-873 1389.31,-839.894 1501.74,-901.985 1868,-781 1928.51,-761.012 1946.02,-758.881 2000,-725 2049.46,-693.952 2098.84,-645.34 2122.89,-620.228"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2125.71,-622.34 2130.05,-612.672 2120.63,-617.528 2125.71,-622.34"/>
+<g id="edge202" class="edge"><title>Node47&#45;&gt;Node33</title>
+<path fill="none" stroke="midnightblue" d="M2341.27,-934.02C2168.92,-922.283 1656,-885.563 1232,-837 1057.15,-816.973 998.744,-856.981 840,-781 807.098,-765.252 801.187,-754.693 780,-725 769.693,-710.555 761.824,-691.861 756.901,-678.232"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="760.142,-676.894 753.594,-668.565 753.519,-679.159 760.142,-676.894"/>
 </g>
 <!-- Node47&#45;&gt;Node39 -->
-<g id="edge200" class="edge"><title>Node47&#45;&gt;Node39</title>
-<path fill="none" stroke="midnightblue" d="M986.597,-874.894C992.808,-874.222 999.03,-873.577 1005,-873 1195.15,-854.608 1421.19,-838.675 1524.46,-831.722"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1524.92,-835.199 1534.66,-831.038 1524.45,-828.215 1524.92,-835.199"/>
+<g id="edge201" class="edge"><title>Node47&#45;&gt;Node39</title>
+<path fill="none" stroke="midnightblue" d="M2414.17,-929.324C2426.85,-920.979 2446.23,-908.212 2461.21,-898.352"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2463.43,-901.078 2469.86,-892.655 2459.58,-895.231 2463.43,-901.078"/>
 </g>
 <!-- Node48 -->
 <g id="node48" class="node"><title>Node48</title>
 <g id="a_node48"><a xlink:href="env__func_8h.html" target="_top" xlink:title="Serializable global function used in IR. ">
-<polygon fill="white" stroke="black" points="594.5,-481.5 594.5,-500.5 693.5,-500.5 693.5,-481.5 594.5,-481.5"/>
-<text text-anchor="middle" x="644" y="-488.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/env_func.h</text>
+<polygon fill="white" stroke="black" points="2313.5,-537.5 2313.5,-556.5 2412.5,-556.5 2412.5,-537.5 2313.5,-537.5"/>
+<text text-anchor="middle" x="2363" y="-544.5" font-family="Helvetica,sans-Serif" font-size="10.00">tvm/ir/env_func.h</text>
 </a>
 </g>
 </g>
 <!-- Node47&#45;&gt;Node48 -->
-<g id="edge196" class="edge"><title>Node47&#45;&gt;Node48</title>
-<path fill="none" stroke="midnightblue" d="M867.389,-875.966C805.614,-866.032 718,-839.963 718,-772 718,-772 718,-772 718,-714 718,-634.159 674.391,-546.073 654.127,-509.452"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="657.055,-507.52 649.086,-500.538 650.962,-510.965 657.055,-507.52"/>
+<g id="edge197" class="edge"><title>Node47&#45;&gt;Node48</title>
+<path fill="none" stroke="midnightblue" d="M2401,-929.442C2401,-910.936 2401,-865.812 2401,-828 2401,-828 2401,-828 2401,-658 2401,-623.813 2384.4,-586.764 2373.14,-565.571"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2376.18,-563.832 2368.28,-556.769 2370.05,-567.215 2376.18,-563.832"/>
 </g>
 <!-- Node48&#45;&gt;Node13 -->
-<g id="edge198" class="edge"><title>Node48&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M630.538,-481.369C590.73,-454.957 473.581,-370.636 429,-266 394.082,-184.043 377.744,-114.674 453,-67 482.574,-48.2651 1602.14,-21.9136 1807.74,-17.2269"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1807.97,-20.7227 1817.89,-16.9963 1807.81,-13.7245 1807.97,-20.7227"/>
+<g id="edge199" class="edge"><title>Node48&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M2313.33,-545.292C2133.08,-542.083 1514.67,-525.328 1337,-445 1077.96,-327.881 1070.98,-202.376 821,-67 787.706,-48.9699 746.601,-34.5195 718.762,-25.7708"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="719.732,-22.4074 709.145,-22.8095 717.672,-29.0974 719.732,-22.4074"/>
 </g>
 <!-- Node48&#45;&gt;Node15 -->
-<g id="edge199" class="edge"><title>Node48&#45;&gt;Node15</title>
-<path fill="none" stroke="midnightblue" d="M616.816,-481.432C574.698,-468.319 491.01,-442.911 419,-425 343.208,-406.149 303.283,-443.148 247,-389 221.909,-364.861 228,-347.817 228,-313 228,-313 228,-313 228,-132 228,-92.985 251.928,-84.0915 287,-67 346.65,-37.931 818.046,-21.382 944.743,-17.4297"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="945.036,-20.9224 954.924,-17.1163 944.821,-13.9258 945.036,-20.9224"/>
+<g id="edge200" class="edge"><title>Node48&#45;&gt;Node15</title>
+<path fill="none" stroke="midnightblue" d="M2412.54,-541.702C2633.23,-521.222 3516,-424.638 3516,-190 3516,-190 3516,-190 3516,-132 3516,-71.7341 3439.31,-38.8093 3394.88,-24.7905"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3395.72,-21.3882 3385.13,-21.8547 3393.7,-28.0906 3395.72,-21.3882"/>
 </g>
 <!-- Node48&#45;&gt;Node27 -->
-<g id="edge197" class="edge"><title>Node48&#45;&gt;Node27</title>
-<path fill="none" stroke="midnightblue" d="M693.923,-488.28C923.043,-480.384 1874.1,-447.61 2141.04,-438.411"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2141.5,-441.897 2151.37,-438.055 2141.26,-434.901 2141.5,-441.897"/>
+<g id="edge198" class="edge"><title>Node48&#45;&gt;Node27</title>
+<path fill="none" stroke="midnightblue" d="M2313.27,-541.83C2199.52,-532.292 1920.09,-508.862 1789.02,-497.871"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1788.96,-494.354 1778.7,-497.006 1788.37,-501.329 1788.96,-494.354"/>
 </g>
 <!-- Node49&#45;&gt;Node7 -->
-<g id="edge203" class="edge"><title>Node49&#45;&gt;Node7</title>
-<path fill="none" stroke="midnightblue" d="M380.475,-370.131C630.056,-360.666 1554.73,-325.6 1817.15,-315.649"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1817.45,-319.14 1827.31,-315.264 1817.18,-312.145 1817.45,-319.14"/>
+<g id="edge204" class="edge"><title>Node49&#45;&gt;Node7</title>
+<path fill="none" stroke="midnightblue" d="M3416.83,-425.75C3194.23,-415.902 2437.88,-382.443 2203.7,-372.083"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2203.75,-368.582 2193.6,-371.637 2203.44,-375.575 2203.75,-368.582"/>
 </g>
 <!-- Node49&#45;&gt;Node15 -->
-<g id="edge204" class="edge"><title>Node49&#45;&gt;Node15</title>
-<path fill="none" stroke="midnightblue" d="M307.477,-358.445C292.282,-336.715 266,-293.114 266,-251.5 266,-251.5 266,-251.5 266,-132 266,-59.3432 345.592,-84.9349 416,-67 612.4,-16.9717 858.571,-14.8747 944.602,-15.8688"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="944.697,-19.3704 954.745,-16.0099 944.795,-12.3711 944.697,-19.3704"/>
+<g id="edge205" class="edge"><title>Node49&#45;&gt;Node15</title>
+<path fill="none" stroke="midnightblue" d="M3486.74,-414.469C3505.91,-378.392 3554,-278.981 3554,-190 3554,-190 3554,-190 3554,-132 3554,-101.205 3552.4,-88.1307 3530,-67 3492.8,-31.9143 3431.95,-21.1542 3395.17,-17.8864"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3395.26,-14.3834 3385.02,-17.1215 3394.73,-21.3636 3395.26,-14.3834"/>
 </g>
 <!-- Node49&#45;&gt;Node19 -->
-<g id="edge205" class="edge"><title>Node49&#45;&gt;Node19</title>
-<path fill="none" stroke="midnightblue" d="M322.512,-358.101C331.758,-330.718 355.305,-270.69 394,-235 489.631,-146.794 655.353,-134.563 726.248,-133.587"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="726.418,-137.086 736.394,-133.516 726.37,-130.086 726.418,-137.086"/>
+<g id="edge206" class="edge"><title>Node49&#45;&gt;Node19</title>
+<path fill="none" stroke="midnightblue" d="M3474.74,-414.436C3463.75,-380.068 3431.46,-290.577 3378,-235 3348.91,-204.762 3340.02,-196.75 3302,-179 3252.17,-155.736 3188.98,-143.705 3150.84,-138.131"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3151.16,-134.643 3140.77,-136.727 3150.19,-141.576 3151.16,-134.643"/>
 </g>
 <!-- Node50&#45;&gt;Node2 -->
-<g id="edge214" class="edge"><title>Node50&#45;&gt;Node2</title>
-<path fill="none" stroke="midnightblue" d="M1094.06,-929.297C1082.98,-896.494 1043.58,-779.884 1028.21,-734.377"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1031.52,-733.237 1025,-724.884 1024.89,-735.478 1031.52,-733.237"/>
+<g id="edge215" class="edge"><title>Node50&#45;&gt;Node2</title>
+<path fill="none" stroke="midnightblue" d="M2207.79,-990.864C2258.92,-984.647 2373.78,-969.566 2469,-949 2651.13,-909.663 2696.95,-898.04 2873,-837 2918.53,-821.215 2969.81,-798.749 3000.48,-784.789"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3002.05,-787.921 3009.68,-780.575 2999.14,-781.557 3002.05,-787.921"/>
 </g>
 <!-- Node50&#45;&gt;Node13 -->
-<g id="edge220" class="edge"><title>Node50&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M1102.42,-929.25C1113.14,-911.096 1136,-867.713 1136,-828 1136,-828 1136,-828 1136,-714 1136,-655.641 1115.42,-642.806 1085,-593 1016.57,-480.951 641.253,-222.424 727,-123 740.829,-106.965 1082.95,-69.2821 1104,-67 1375.48,-37.57 1705.26,-22.1332 1807.93,-17.7946"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1808.12,-21.2898 1817.97,-17.3755 1807.83,-14.2959 1808.12,-21.2898"/>
+<g id="edge221" class="edge"><title>Node50&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M2154.2,-993.843C1936.76,-992.504 464.692,-982.252 271,-949 180.962,-933.543 76,-975.355 76,-884 76,-884 76,-884 76,-132 76,-101.205 75.5116,-85.6725 100,-67 144.318,-33.2071 538.633,-20.3511 654.882,-17.2758"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="655.031,-20.7732 664.938,-17.016 654.85,-13.7755 655.031,-20.7732"/>
 </g>
 <!-- Node50&#45;&gt;Node33 -->
-<g id="edge216" class="edge"><title>Node50&#45;&gt;Node33</title>
-<path fill="none" stroke="midnightblue" d="M1123.63,-938.101C1287.59,-938.38 2153,-934.793 2153,-828 2153,-828 2153,-828 2153,-714 2153,-681.502 2146.49,-644.095 2142.05,-622.343"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2145.46,-621.559 2139.96,-612.504 2138.61,-623.013 2145.46,-621.559"/>
+<g id="edge217" class="edge"><title>Node50&#45;&gt;Node33</title>
+<path fill="none" stroke="midnightblue" d="M2154.42,-993.881C1927.54,-992.8 324.11,-983.931 290,-949 249.013,-907.026 255.852,-864.704 290,-817 386.273,-682.51 604.601,-661.885 702.231,-659.637"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="702.473,-663.133 712.411,-659.46 702.352,-656.134 702.473,-663.133"/>
 </g>
 <!-- Node50&#45;&gt;Node34 -->
-<g id="edge221" class="edge"><title>Node50&#45;&gt;Node34</title>
-<path fill="none" stroke="midnightblue" d="M1123.58,-937.074C1302.26,-930.849 2329.25,-895.07 2587.88,-886.06"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2588.32,-889.547 2598.19,-885.7 2588.07,-882.551 2588.32,-889.547"/>
+<g id="edge222" class="edge"><title>Node50&#45;&gt;Node34</title>
+<path fill="none" stroke="midnightblue" d="M2160.08,-985.444C2138.26,-976.517 2103.85,-962.439 2079.11,-952.317"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2080.4,-949.063 2069.82,-948.516 2077.75,-955.542 2080.4,-949.063"/>
 </g>
 <!-- Node50&#45;&gt;Node35 -->
-<g id="edge219" class="edge"><title>Node50&#45;&gt;Node35</title>
-<path fill="none" stroke="midnightblue" d="M1123.5,-937.41C1243.91,-934.619 1751.51,-921.662 2167,-893 2401.74,-876.807 2681.26,-846.437 2793.59,-833.774"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2794.08,-837.242 2803.62,-832.641 2793.29,-830.286 2794.08,-837.242"/>
+<g id="edge220" class="edge"><title>Node50&#45;&gt;Node35</title>
+<path fill="none" stroke="midnightblue" d="M2154.05,-992.998C2009.65,-987.441 1312.9,-958.272 747,-893 745.056,-892.776 743.076,-892.532 741.078,-892.273"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="741.4,-888.785 731.013,-890.874 740.436,-895.718 741.4,-888.785"/>
 </g>
 <!-- Node50&#45;&gt;Node43 -->
-<g id="edge218" class="edge"><title>Node50&#45;&gt;Node43</title>
-<path fill="none" stroke="midnightblue" d="M1070.38,-936.732C994.587,-932.831 778.968,-919.63 714,-893 591.395,-842.746 528,-792.505 528,-660 528,-660 528,-660 528,-546 528,-464.832 554.652,-370.597 566.934,-331.481"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="570.327,-332.361 570.047,-321.77 563.662,-330.224 570.327,-332.361"/>
+<g id="edge219" class="edge"><title>Node50&#45;&gt;Node43</title>
+<path fill="none" stroke="midnightblue" d="M2207.56,-994.396C2355.72,-995.99 3078.35,-995.486 3213,-837 3218.76,-830.226 3215.38,-825.566 3213,-817 3183.48,-710.573 3085,-714.445 3085,-604 3085,-604 3085,-604 3085,-490 3085,-451.812 3067.56,-409.83 3056.31,-386.722"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3059.29,-384.865 3051.66,-377.515 3053.04,-388.02 3059.29,-384.865"/>
 </g>
 <!-- Node50&#45;&gt;Node47 -->
-<g id="edge217" class="edge"><title>Node50&#45;&gt;Node47</title>
-<path fill="none" stroke="midnightblue" d="M1070.44,-929.562C1041.79,-920.462 995.97,-905.908 963.847,-895.704"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="964.527,-892.248 953.937,-892.556 962.408,-898.92 964.527,-892.248"/>
+<g id="edge218" class="edge"><title>Node50&#45;&gt;Node47</title>
+<path fill="none" stroke="midnightblue" d="M2207.54,-987.486C2244.41,-978.437 2311.62,-961.939 2356.3,-950.972"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2357.15,-954.367 2366.03,-948.584 2355.48,-947.568 2357.15,-954.367"/>
 </g>
 <!-- Node50&#45;&gt;Node48 -->
-<g id="edge215" class="edge"><title>Node50&#45;&gt;Node48</title>
-<path fill="none" stroke="midnightblue" d="M1070.49,-938.316C968.093,-938.033 604,-922.509 604,-716 604,-716 604,-716 604,-602 604,-567.617 621.474,-530.631 633.323,-509.503"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="636.424,-511.133 638.438,-500.731 630.377,-507.607 636.424,-511.133"/>
+<g id="edge216" class="edge"><title>Node50&#45;&gt;Node48</title>
+<path fill="none" stroke="midnightblue" d="M2189.88,-985.394C2212.32,-963.124 2271.56,-900.869 2301,-837 2344.86,-741.855 2358.2,-614.654 2361.8,-567.044"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2365.3,-567.173 2362.5,-556.954 2358.31,-566.686 2365.3,-567.173"/>
 </g>
 </g>
 </svg>
diff --git a/docs/api/doxygen/analyzer_8h.html b/docs/api/doxygen/analyzer_8h.html
index 7966be2..0e59aa1 100644
--- a/docs/api/doxygen/analyzer_8h.html
+++ b/docs/api/doxygen/analyzer_8h.html
@@ -106,13 +106,13 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 </div><div class="textblock"><div class="dynheader">
 Include dependency graph for analyzer.h:</div>
 <div class="dyncontent">
-<div class="center"><iframe scrolling="no" frameborder="0" src="analyzer_8h__incl.svg" width="3898" height="1351"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
+<div class="center"><iframe scrolling="no" frameborder="0" src="analyzer_8h__incl.svg" width="3984" height="1426"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
 </div>
 </div>
 </div><div class="textblock"><div class="dynheader">
 This graph shows which files directly or indirectly include this file:</div>
 <div class="dyncontent">
-<div class="center"><iframe scrolling="no" frameborder="0" src="analyzer_8h__dep__incl.svg" width="4394" height="663"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
+<div class="center"><iframe scrolling="no" frameborder="0" src="analyzer_8h__dep__incl.svg" width="4407" height="663"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
 </div>
 </div>
 </div>
diff --git a/docs/api/doxygen/analyzer_8h__dep__incl.svg b/docs/api/doxygen/analyzer_8h__dep__incl.svg
index 316c5e3..fd383a6 100644
--- a/docs/api/doxygen/analyzer_8h__dep__incl.svg
+++ b/docs/api/doxygen/analyzer_8h__dep__incl.svg
@@ -4,958 +4,986 @@
 <!-- Generated by graphviz version 2.38.0 (20140413.2041)
  -->
 <!-- Title: include/tvm/arith/analyzer.h Pages: 1 -->
-<svg width="3295pt" height="497pt"
- viewBox="0.00 0.00 3295.48 497.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<svg width="3305pt" height="497pt"
+ viewBox="0.00 0.00 3305.05 497.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
 <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 493)">
 <title>include/tvm/arith/analyzer.h</title>
-<polygon fill="white" stroke="none" points="-4,4 -4,-493 3291.48,-493 3291.48,4 -4,4"/>
+<polygon fill="white" stroke="none" points="-4,4 -4,-493 3301.05,-493 3301.05,4 -4,4"/>
 <!-- Node1 -->
 <g id="node1" class="node"><title>Node1</title>
-<polygon fill="#bfbfbf" stroke="black" points="2881.98,-469.5 2881.98,-488.5 3030.98,-488.5 3030.98,-469.5 2881.98,-469.5"/>
-<text text-anchor="middle" x="2956.48" y="-476.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/arith/analyzer.h</text>
+<polygon fill="#bfbfbf" stroke="black" points="2914.55,-469.5 2914.55,-488.5 3063.55,-488.5 3063.55,-469.5 2914.55,-469.5"/>
+<text text-anchor="middle" x="2989.05" y="-476.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/arith/analyzer.h</text>
 </g>
 <!-- Node2 -->
 <g id="node2" class="node"><title>Node2</title>
 <g id="a_node2"><a xlink:href="int__solver_8h.html" target="_top" xlink:title="integer constraints data structures and solvers ">
-<polygon fill="white" stroke="black" points="1661.48,-402.5 1661.48,-432.5 1773.48,-432.5 1773.48,-402.5 1661.48,-402.5"/>
-<text text-anchor="start" x="1669.48" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/arith/int</text>
-<text text-anchor="middle" x="1717.48" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00">_solver.h</text>
+<polygon fill="white" stroke="black" points="1607.05,-402.5 1607.05,-432.5 1719.05,-432.5 1719.05,-402.5 1607.05,-402.5"/>
+<text text-anchor="start" x="1615.05" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/arith/int</text>
+<text text-anchor="middle" x="1663.05" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00">_solver.h</text>
 </a>
 </g>
 </g>
 <!-- Node1&#45;&gt;Node2 -->
 <g id="edge1" class="edge"><title>Node1&#45;&gt;Node2</title>
-<path fill="none" stroke="midnightblue" d="M2871.67,-476.745C2638.88,-472.981 1993.25,-460.32 1782.48,-433 1779.61,-432.628 1776.68,-432.191 1773.74,-431.705"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2871.72,-480.246 2881.78,-476.907 2871.84,-473.247 2871.72,-480.246"/>
+<path fill="none" stroke="midnightblue" d="M2904.01,-477.003C2659.3,-473.81 1956.32,-462.28 1728.05,-433 1725.19,-432.632 1722.26,-432.198 1719.31,-431.714"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2904.23,-480.506 2914.27,-477.135 2904.32,-473.506 2904.23,-480.506"/>
 </g>
 <!-- Node3 -->
 <g id="node3" class="node"><title>Node3</title>
 <g id="a_node3"><a xlink:href="iter__affine__map_8h.html" target="_top" xlink:title="Iterator quasi&#45;affine mapping patterns. ">
-<polygon fill="white" stroke="black" points="1791.48,-402.5 1791.48,-432.5 1907.48,-432.5 1907.48,-402.5 1791.48,-402.5"/>
-<text text-anchor="start" x="1799.48" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/arith/iter</text>
-<text text-anchor="middle" x="1849.48" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00">_affine_map.h</text>
+<polygon fill="white" stroke="black" points="1737.05,-402.5 1737.05,-432.5 1853.05,-432.5 1853.05,-402.5 1737.05,-402.5"/>
+<text text-anchor="start" x="1745.05" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/arith/iter</text>
+<text text-anchor="middle" x="1795.05" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00">_affine_map.h</text>
 </a>
 </g>
 </g>
 <!-- Node1&#45;&gt;Node3 -->
 <g id="edge2" class="edge"><title>Node1&#45;&gt;Node3</title>
-<path fill="none" stroke="midnightblue" d="M2871.41,-476.183C2657.34,-471.312 2100.33,-456.697 1916.48,-433 1913.65,-432.635 1910.75,-432.211 1907.84,-431.742"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2871.57,-479.688 2881.65,-476.415 2871.73,-472.689 2871.57,-479.688"/>
+<path fill="none" stroke="midnightblue" d="M2904.12,-476.515C2677.73,-472.253 2063.41,-458.612 1862.05,-433 1859.22,-432.64 1856.33,-432.219 1853.42,-431.754"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2904.22,-480.018 2914.28,-476.705 2904.35,-473.019 2904.22,-480.018"/>
 </g>
 <!-- Node4 -->
 <g id="node4" class="node"><title>Node4</title>
 <g id="a_node4"><a xlink:href="operation_8h.html" target="_top" xlink:title="Operation node can generate one or multiple Tensors. ">
-<polygon fill="white" stroke="black" points="1925.98,-408 1925.98,-427 2068.98,-427 2068.98,-408 1925.98,-408"/>
-<text text-anchor="middle" x="1997.48" y="-415" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/te/operation.h</text>
+<polygon fill="white" stroke="black" points="1871.55,-408 1871.55,-427 2014.55,-427 2014.55,-408 1871.55,-408"/>
+<text text-anchor="middle" x="1943.05" y="-415" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/te/operation.h</text>
 </a>
 </g>
 </g>
 <!-- Node1&#45;&gt;Node4 -->
 <g id="edge3" class="edge"><title>Node1&#45;&gt;Node4</title>
-<path fill="none" stroke="midnightblue" d="M2871.6,-472.734C2685.18,-461.168 2242.81,-433.721 2069.18,-422.949"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2871.43,-476.23 2881.63,-473.356 2871.86,-469.243 2871.43,-476.23"/>
+<path fill="none" stroke="midnightblue" d="M2904.04,-473.164C2703.01,-461.729 2201.28,-433.189 2014.55,-422.567"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2904.06,-476.671 2914.25,-473.745 2904.46,-469.682 2904.06,-476.671"/>
 </g>
 <!-- Node23 -->
 <g id="node23" class="node"><title>Node23</title>
 <g id="a_node23"><a xlink:href="nn_2pooling_8h.html" target="_top" xlink:title="Pooling op constructions. ">
-<polygon fill="white" stroke="black" points="2970.48,-0.5 2970.48,-30.5 3078.48,-30.5 3078.48,-0.5 2970.48,-0.5"/>
-<text text-anchor="start" x="2978.48" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/nn</text>
-<text text-anchor="middle" x="3024.48" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00">/pooling.h</text>
+<polygon fill="white" stroke="black" points="3071.05,-0.5 3071.05,-30.5 3179.05,-30.5 3179.05,-0.5 3071.05,-0.5"/>
+<text text-anchor="start" x="3079.05" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/nn</text>
+<text text-anchor="middle" x="3125.05" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00">/pooling.h</text>
 </a>
 </g>
 </g>
 <!-- Node1&#45;&gt;Node23 -->
-<g id="edge106" class="edge"><title>Node1&#45;&gt;Node23</title>
-<path fill="none" stroke="midnightblue" d="M3041.13,-471.148C3085.68,-465.158 3140.28,-453.952 3185.48,-433 3238.12,-408.595 3287.48,-409.527 3287.48,-351.5 3287.48,-351.5 3287.48,-351.5 3287.48,-148.5 3287.48,-55.903 3155.64,-28.2565 3078.65,-20.0061"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="3040.67,-467.677 3031.2,-472.419 3041.56,-474.62 3040.67,-467.677"/>
+<g id="edge110" class="edge"><title>Node1&#45;&gt;Node23</title>
+<path fill="none" stroke="midnightblue" d="M3073.82,-468.44C3112.18,-461.891 3157.28,-450.994 3195.05,-433 3247.44,-408.042 3297.05,-409.527 3297.05,-351.5 3297.05,-351.5 3297.05,-351.5 3297.05,-148.5 3297.05,-85.8274 3224.3,-48.8411 3173.42,-30.6398"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3073.01,-465.026 3063.71,-470.094 3074.14,-471.934 3073.01,-465.026"/>
 </g>
 <!-- Node28 -->
 <g id="node28" class="node"><title>Node28</title>
 <g id="a_node28"><a xlink:href="topi_2nn_8h.html" target="_top" xlink:title="NN op constructions. ">
-<polygon fill="white" stroke="black" points="2912.98,-73 2912.98,-92 3029.98,-92 3029.98,-73 2912.98,-73"/>
-<text text-anchor="middle" x="2971.48" y="-80" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/nn.h</text>
+<polygon fill="white" stroke="black" points="3009.55,-73 3009.55,-92 3126.55,-92 3126.55,-73 3009.55,-73"/>
+<text text-anchor="middle" x="3068.05" y="-80" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/nn.h</text>
 </a>
 </g>
 </g>
 <!-- Node1&#45;&gt;Node28 -->
-<g id="edge107" class="edge"><title>Node1&#45;&gt;Node28</title>
-<path fill="none" stroke="midnightblue" d="M3041.14,-470.704C3077.48,-464.606 3119.15,-453.438 3152.48,-433 3185.56,-412.71 3195.24,-402.482 3208.48,-366 3249.38,-253.274 3110.04,-252.914 3028.48,-165 3006.17,-140.949 2985.58,-107.705 2976.44,-92.1623"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="3040.38,-467.28 3031.05,-472.292 3041.47,-474.195 3040.38,-467.28"/>
+<g id="edge111" class="edge"><title>Node1&#45;&gt;Node28</title>
+<path fill="none" stroke="midnightblue" d="M3071.35,-467.38C3101.56,-460.793 3134.96,-450.155 3162.05,-433 3220.27,-396.131 3229.17,-367.045 3240.05,-299 3251.88,-225.093 3188.8,-220.015 3138.05,-165 3114.28,-139.226 3087.67,-107.354 3075.21,-92.2406"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3070.33,-464.016 3061.24,-469.462 3071.75,-470.872 3070.33,-464.016"/>
 </g>
 <!-- Node29 -->
 <g id="node29" class="node"><title>Node29</title>
 <g id="a_node29"><a xlink:href="constant__utils_8h.html" target="_top" xlink:title="Utility functions for handling constants in TVM expressions. ">
-<polygon fill="white" stroke="black" points="2783.48,-335.5 2783.48,-365.5 2905.48,-365.5 2905.48,-335.5 2783.48,-335.5"/>
-<text text-anchor="start" x="2791.48" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/detail</text>
-<text text-anchor="middle" x="2844.48" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00">/constant_utils.h</text>
+<polygon fill="white" stroke="black" points="2831.05,-335.5 2831.05,-365.5 2953.05,-365.5 2953.05,-335.5 2831.05,-335.5"/>
+<text text-anchor="start" x="2839.05" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/detail</text>
+<text text-anchor="middle" x="2892.05" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00">/constant_utils.h</text>
 </a>
 </g>
 </g>
 <!-- Node1&#45;&gt;Node29 -->
-<g id="edge103" class="edge"><title>Node1&#45;&gt;Node29</title>
-<path fill="none" stroke="midnightblue" d="M2942.12,-461.785C2920.08,-436.887 2878.24,-389.638 2857.22,-365.888"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2939.53,-464.138 2948.78,-469.305 2944.77,-459.498 2939.53,-464.138"/>
+<g id="edge107" class="edge"><title>Node1&#45;&gt;Node29</title>
+<path fill="none" stroke="midnightblue" d="M2976.12,-461.13C2956.9,-436.067 2921.14,-389.431 2903.09,-365.888"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2973.52,-463.499 2982.39,-469.305 2979.08,-459.24 2973.52,-463.499"/>
 </g>
-<!-- Node30 -->
-<g id="node30" class="node"><title>Node30</title>
-<g id="a_node30"><a xlink:href="nn_2bnn_8h.html" target="_top" xlink:title="Binary op constructions. ">
-<polygon fill="white" stroke="black" points="3037.48,-268.5 3037.48,-298.5 3145.48,-298.5 3145.48,-268.5 3037.48,-268.5"/>
-<text text-anchor="start" x="3045.48" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/nn</text>
-<text text-anchor="middle" x="3091.48" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/bnn.h</text>
+<!-- Node31 -->
+<g id="node31" class="node"><title>Node31</title>
+<g id="a_node31"><a xlink:href="nn_2bnn_8h.html" target="_top" xlink:title="Binary op constructions. ">
+<polygon fill="white" stroke="black" points="3123.05,-268.5 3123.05,-298.5 3231.05,-298.5 3231.05,-268.5 3123.05,-268.5"/>
+<text text-anchor="start" x="3131.05" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/nn</text>
+<text text-anchor="middle" x="3177.05" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/bnn.h</text>
 </a>
 </g>
 </g>
-<!-- Node1&#45;&gt;Node30 -->
-<g id="edge104" class="edge"><title>Node1&#45;&gt;Node30</title>
-<path fill="none" stroke="midnightblue" d="M3020.29,-467.24C3076.51,-454.01 3155.1,-425.507 3192.48,-366 3199.81,-354.333 3199.72,-346.72 3192.48,-335 3181.81,-317.74 3163.26,-306.182 3144.94,-298.536"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="3019.41,-463.851 3010.42,-469.461 3020.95,-470.68 3019.41,-463.851"/>
+<!-- Node1&#45;&gt;Node31 -->
+<g id="edge108" class="edge"><title>Node1&#45;&gt;Node31</title>
+<path fill="none" stroke="midnightblue" d="M3041.58,-466.794C3092.67,-452.798 3167.46,-423.355 3202.05,-366 3215.05,-344.455 3199.76,-315.584 3188.02,-298.619"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3040.49,-463.464 3031.7,-469.391 3042.26,-470.234 3040.49,-463.464"/>
 </g>
-<!-- Node44 -->
-<g id="node44" class="node"><title>Node44</title>
-<g id="a_node44"><a xlink:href="dilate_8h.html" target="_top" xlink:title="Dilate op constructions. ">
-<polygon fill="white" stroke="black" points="3075.48,-335.5 3075.48,-365.5 3183.48,-365.5 3183.48,-335.5 3075.48,-335.5"/>
-<text text-anchor="start" x="3083.48" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/nn</text>
-<text text-anchor="middle" x="3129.48" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00">/dilate.h</text>
+<!-- Node45 -->
+<g id="node45" class="node"><title>Node45</title>
+<g id="a_node45"><a xlink:href="dilate_8h.html" target="_top" xlink:title="Dilate op constructions. ">
+<polygon fill="white" stroke="black" points="3085.05,-335.5 3085.05,-365.5 3193.05,-365.5 3193.05,-335.5 3085.05,-335.5"/>
+<text text-anchor="start" x="3093.05" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/nn</text>
+<text text-anchor="middle" x="3139.05" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00">/dilate.h</text>
 </a>
 </g>
 </g>
-<!-- Node1&#45;&gt;Node44 -->
-<g id="edge105" class="edge"><title>Node1&#45;&gt;Node44</title>
-<path fill="none" stroke="midnightblue" d="M2976.5,-463.359C3010,-438.861 3077,-389.871 3110.08,-365.682"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2974.38,-460.577 2968.37,-469.305 2978.51,-466.228 2974.38,-460.577"/>
+<!-- Node1&#45;&gt;Node45 -->
+<g id="edge109" class="edge"><title>Node1&#45;&gt;Node45</title>
+<path fill="none" stroke="midnightblue" d="M3007.14,-462.748C3036.34,-438.124 3093.46,-389.947 3121.99,-365.888"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="3004.75,-460.183 2999.36,-469.305 3009.26,-465.534 3004.75,-460.183"/>
 </g>
 <!-- Node5 -->
 <g id="node5" class="node"><title>Node5</title>
 <g id="a_node5"><a xlink:href="cublas_8h.html" target="_top" xlink:title="External function interface to cuBLAS libraries. ">
-<polygon fill="white" stroke="black" points="422.978,-268.5 422.978,-298.5 551.978,-298.5 551.978,-268.5 422.978,-268.5"/>
-<text text-anchor="start" x="430.978" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/contrib</text>
-<text text-anchor="middle" x="487.478" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/cublas.h</text>
+<polygon fill="white" stroke="black" points="1719.55,-268.5 1719.55,-298.5 1848.55,-298.5 1848.55,-268.5 1719.55,-268.5"/>
+<text text-anchor="start" x="1727.55" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/contrib</text>
+<text text-anchor="middle" x="1784.05" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/cublas.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node5 -->
 <g id="edge4" class="edge"><title>Node4&#45;&gt;Node5</title>
-<path fill="none" stroke="midnightblue" d="M1946.98,-406.252C1936.91,-404.551 1926.39,-403.014 1916.48,-402 1767.2,-386.736 703.866,-426.337 566.478,-366 534.363,-351.896 508.714,-318.003 496.066,-298.659"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1946.4,-409.704 1956.86,-407.994 1947.62,-402.81 1946.4,-409.704"/>
+<path fill="none" stroke="midnightblue" d="M1899.1,-404.816C1875.84,-396.885 1848.04,-384.429 1828.05,-366 1807.6,-347.145 1794.7,-316.554 1788.48,-298.7"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1898.16,-408.191 1908.75,-407.942 1900.32,-401.532 1898.16,-408.191"/>
 </g>
 <!-- Node6 -->
 <g id="node6" class="node"><title>Node6</title>
 <g id="a_node6"><a xlink:href="cuda_2dense_8h.html" target="_top" xlink:title="CUDA schedule for dense operation. ">
-<polygon fill="white" stroke="black" points="458.978,-134.5 458.978,-164.5 577.978,-164.5 577.978,-134.5 458.978,-134.5"/>
-<text text-anchor="start" x="466.978" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/cuda</text>
-<text text-anchor="middle" x="518.478" y="-141.5" font-family="Helvetica,sans-Serif" font-size="10.00">/dense.h</text>
+<polygon fill="white" stroke="black" points="1746.55,-134.5 1746.55,-164.5 1865.55,-164.5 1865.55,-134.5 1746.55,-134.5"/>
+<text text-anchor="start" x="1754.55" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/cuda</text>
+<text text-anchor="middle" x="1806.05" y="-141.5" font-family="Helvetica,sans-Serif" font-size="10.00">/dense.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node6 -->
 <g id="edge9" class="edge"><title>Node4&#45;&gt;Node6</title>
-<path fill="none" stroke="midnightblue" d="M1946.98,-406.245C1936.92,-404.545 1926.39,-403.009 1916.48,-402 1762.22,-386.294 670.032,-410.416 521.478,-366 459.379,-347.433 390.364,-328.554 413.478,-268 430.649,-223.016 473.838,-184.348 499.155,-164.573"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1946.4,-409.697 1956.86,-407.987 1947.62,-402.803 1946.4,-409.697"/>
+<path fill="none" stroke="midnightblue" d="M1956.15,-399.312C1967.57,-382.048 1980.53,-355.183 1968.05,-335 1949.24,-304.566 1922.36,-323.25 1896.05,-299 1852.61,-258.95 1822.93,-193.178 1811.41,-164.548"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1953.05,-397.628 1950.13,-407.812 1958.76,-401.674 1953.05,-397.628"/>
 </g>
 <!-- Node7 -->
 <g id="node7" class="node"><title>Node7</title>
 <g id="a_node7"><a xlink:href="rocm_2dense_8h.html" target="_top" xlink:title="rocm schedule for dense operation ">
-<polygon fill="white" stroke="black" points="581.978,-67.5 581.978,-97.5 702.978,-97.5 702.978,-67.5 581.978,-67.5"/>
-<text text-anchor="start" x="589.978" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/rocm</text>
-<text text-anchor="middle" x="642.478" y="-74.5" font-family="Helvetica,sans-Serif" font-size="10.00">/dense.h</text>
+<polygon fill="white" stroke="black" points="1799.55,-67.5 1799.55,-97.5 1920.55,-97.5 1920.55,-67.5 1799.55,-67.5"/>
+<text text-anchor="start" x="1807.55" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/rocm</text>
+<text text-anchor="middle" x="1860.05" y="-74.5" font-family="Helvetica,sans-Serif" font-size="10.00">/dense.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node7 -->
-<g id="edge92" class="edge"><title>Node4&#45;&gt;Node7</title>
-<path fill="none" stroke="midnightblue" d="M2017.32,-401.591C2029.82,-391.902 2046.07,-378.742 2059.48,-366 2072.94,-353.211 2072.33,-345.732 2087.48,-335 2121.29,-311.048 2150.52,-333.496 2173.48,-299 2181.11,-287.53 2182.62,-278.309 2173.48,-268 2073.95,-155.762 975.214,-98.6347 703.145,-86.1486"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2015.16,-398.839 2009.35,-407.698 2019.42,-404.396 2015.16,-398.839"/>
+<g id="edge96" class="edge"><title>Node4&#45;&gt;Node7</title>
+<path fill="none" stroke="midnightblue" d="M1979.34,-404.453C2025.99,-386.407 2101.05,-347.569 2101.05,-284.5 2101.05,-284.5 2101.05,-284.5 2101.05,-215.5 2101.05,-133.328 1991.95,-101.975 1920.56,-90.2551"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1978,-401.217 1969.86,-407.99 1980.45,-407.776 1978,-401.217"/>
 </g>
 <!-- Node8 -->
 <g id="node8" class="node"><title>Node8</title>
 <g id="a_node8"><a xlink:href="rocblas_8h.html" target="_top" xlink:title="include/tvm/topi/contrib\l/rocblas.h">
-<polygon fill="white" stroke="black" points="608.978,-268.5 608.978,-298.5 737.978,-298.5 737.978,-268.5 608.978,-268.5"/>
-<text text-anchor="start" x="616.978" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/contrib</text>
-<text text-anchor="middle" x="673.478" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/rocblas.h</text>
+<polygon fill="white" stroke="black" points="1905.55,-268.5 1905.55,-298.5 2034.55,-298.5 2034.55,-268.5 1905.55,-268.5"/>
+<text text-anchor="start" x="1913.55" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/contrib</text>
+<text text-anchor="middle" x="1970.05" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/rocblas.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node8 -->
 <g id="edge7" class="edge"><title>Node4&#45;&gt;Node8</title>
-<path fill="none" stroke="midnightblue" d="M1946.34,-406.185C1936.47,-404.523 1926.18,-403.017 1916.48,-402 1791.1,-388.858 901.455,-407.67 782.478,-366 779.858,-365.082 722.302,-321.559 691.91,-298.502"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1945.98,-409.675 1956.44,-407.959 1947.19,-402.781 1945.98,-409.675"/>
+<path fill="none" stroke="midnightblue" d="M1967.72,-402.238C1980.21,-393.531 1994.15,-381.156 2001.05,-366 2011.59,-342.853 1994.95,-314.988 1982.26,-298.565"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1965.48,-399.524 1959.01,-407.916 1969.3,-405.388 1965.48,-399.524"/>
 </g>
 <!-- Node9 -->
 <g id="node9" class="node"><title>Node9</title>
 <g id="a_node9"><a xlink:href="cuda_2injective_8h.html" target="_top" xlink:title="CUDA schedule for injective operations. ">
-<polygon fill="white" stroke="black" points="1485.98,-268.5 1485.98,-298.5 1604.98,-298.5 1604.98,-268.5 1485.98,-268.5"/>
-<text text-anchor="start" x="1493.98" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/cuda</text>
-<text text-anchor="middle" x="1545.48" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/injective.h</text>
+<polygon fill="white" stroke="black" points="206.553,-268.5 206.553,-298.5 325.553,-298.5 325.553,-268.5 206.553,-268.5"/>
+<text text-anchor="start" x="214.553" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/cuda</text>
+<text text-anchor="middle" x="266.053" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/injective.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node9 -->
 <g id="edge10" class="edge"><title>Node4&#45;&gt;Node9</title>
-<path fill="none" stroke="midnightblue" d="M1954.66,-405.294C1916.89,-395.31 1860.4,-380.133 1811.48,-366 1733.29,-343.414 1642.79,-315.252 1590.06,-298.639"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1954.24,-408.801 1964.8,-407.968 1956.02,-402.033 1954.24,-408.801"/>
+<path fill="none" stroke="midnightblue" d="M1892.56,-406.231C1882.49,-404.532 1871.97,-403 1862.05,-402 1695.96,-385.243 515.745,-426.231 360.053,-366 324.196,-352.128 292.757,-318.145 276.928,-298.724"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1891.98,-409.682 1902.43,-407.973 1893.19,-402.789 1891.98,-409.682"/>
 </g>
 <!-- Node10 -->
 <g id="node10" class="node"><title>Node10</title>
 <g id="a_node10"><a xlink:href="rocm_2injective_8h.html" target="_top" xlink:title="rocm schedule for injective operations ">
-<polygon fill="white" stroke="black" points="1484.98,-201.5 1484.98,-231.5 1605.98,-231.5 1605.98,-201.5 1484.98,-201.5"/>
-<text text-anchor="start" x="1492.98" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/rocm</text>
-<text text-anchor="middle" x="1545.48" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00">/injective.h</text>
+<polygon fill="white" stroke="black" points="204.553,-201.5 204.553,-231.5 325.553,-231.5 325.553,-201.5 204.553,-201.5"/>
+<text text-anchor="start" x="212.553" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/rocm</text>
+<text text-anchor="middle" x="265.053" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00">/injective.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node10 -->
-<g id="edge93" class="edge"><title>Node4&#45;&gt;Node10</title>
-<path fill="none" stroke="midnightblue" d="M2006.92,-398.811C2017.51,-380.521 2036.26,-352.27 2059.48,-335 2096.04,-307.801 2129.52,-336.454 2155.48,-299 2163.33,-287.676 2164.81,-278.136 2155.48,-268 2118.77,-228.129 1753.21,-219.734 1606.22,-217.969"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2003.84,-397.147 2002.03,-407.586 2009.96,-400.557 2003.84,-397.147"/>
+<g id="edge97" class="edge"><title>Node4&#45;&gt;Node10</title>
+<path fill="none" stroke="midnightblue" d="M1892.56,-406.225C1882.49,-404.526 1871.97,-402.996 1862.05,-402 1690.21,-384.732 473.89,-414.247 308.053,-366 252.723,-349.903 225.378,-349.182 197.053,-299 190.28,-287.002 190.862,-280.309 197.053,-268 205.048,-252.104 220.676,-239.953 234.937,-231.558"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1891.98,-409.676 1902.43,-407.966 1893.19,-402.782 1891.98,-409.676"/>
 </g>
 <!-- Node11 -->
 <g id="node11" class="node"><title>Node11</title>
 <g id="a_node11"><a xlink:href="cuda_2normalization_8h.html" target="_top" xlink:title="CUDA schedule for LRN and l2 normalization operations. ">
-<polygon fill="white" stroke="black" points="30.978,-335.5 30.978,-365.5 149.978,-365.5 149.978,-335.5 30.978,-335.5"/>
-<text text-anchor="start" x="38.978" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/cuda</text>
-<text text-anchor="middle" x="90.478" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00">/normalization.h</text>
+<polygon fill="white" stroke="black" points="1661.55,-335.5 1661.55,-365.5 1780.55,-365.5 1780.55,-335.5 1661.55,-335.5"/>
+<text text-anchor="start" x="1669.55" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/cuda</text>
+<text text-anchor="middle" x="1721.05" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00">/normalization.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node11 -->
 <g id="edge12" class="edge"><title>Node4&#45;&gt;Node11</title>
-<path fill="none" stroke="midnightblue" d="M1946.99,-406.204C1936.92,-404.507 1926.39,-402.983 1916.48,-402 1139.24,-324.949 935.478,-454.699 159.478,-366 156.438,-365.652 153.333,-365.235 150.206,-364.763"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1946.4,-409.655 1956.86,-407.945 1947.62,-402.761 1946.4,-409.655"/>
+<path fill="none" stroke="midnightblue" d="M1903.97,-405.058C1866.33,-394.037 1809.32,-377.344 1768.96,-365.528"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1902.99,-408.417 1913.57,-407.869 1904.96,-401.699 1902.99,-408.417"/>
 </g>
 <!-- Node12 -->
 <g id="node12" class="node"><title>Node12</title>
 <g id="a_node12"><a xlink:href="cuda_2pooling_8h.html" target="_top" xlink:title="CUDA schedule for pooling operations. ">
-<polygon fill="white" stroke="black" points="208.978,-268.5 208.978,-298.5 327.978,-298.5 327.978,-268.5 208.978,-268.5"/>
-<text text-anchor="start" x="216.978" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/cuda</text>
-<text text-anchor="middle" x="268.478" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/pooling.h</text>
+<polygon fill="white" stroke="black" points="458.553,-268.5 458.553,-298.5 577.553,-298.5 577.553,-268.5 458.553,-268.5"/>
+<text text-anchor="start" x="466.553" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/cuda</text>
+<text text-anchor="middle" x="518.053" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/pooling.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node12 -->
 <g id="edge13" class="edge"><title>Node4&#45;&gt;Node12</title>
-<path fill="none" stroke="midnightblue" d="M1946.98,-406.222C1936.92,-404.524 1926.39,-402.995 1916.48,-402 1829.45,-393.268 422.047,-402.322 342.478,-366 311.531,-351.873 287.844,-317.99 276.282,-298.653"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1946.4,-409.674 1956.86,-407.964 1947.62,-402.78 1946.4,-409.674"/>
+<path fill="none" stroke="midnightblue" d="M1891.92,-406.184C1882.04,-404.522 1871.75,-403.016 1862.05,-402 1610.41,-375.638 971.813,-417.302 724.053,-366 656.826,-352.08 583.196,-317.948 544.777,-298.558"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1891.56,-409.674 1902.01,-407.958 1892.77,-402.78 1891.56,-409.674"/>
 </g>
 <!-- Node13 -->
 <g id="node13" class="node"><title>Node13</title>
 <g id="a_node13"><a xlink:href="rocm_2pooling_8h.html" target="_top" xlink:title="rocm schedule for pooling operations ">
-<polygon fill="white" stroke="black" points="226.978,-201.5 226.978,-231.5 347.978,-231.5 347.978,-201.5 226.978,-201.5"/>
-<text text-anchor="start" x="234.978" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/rocm</text>
-<text text-anchor="middle" x="287.478" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00">/pooling.h</text>
+<polygon fill="white" stroke="black" points="670.553,-201.5 670.553,-231.5 791.553,-231.5 791.553,-201.5 670.553,-201.5"/>
+<text text-anchor="start" x="678.553" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/rocm</text>
+<text text-anchor="middle" x="731.053" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00">/pooling.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node13 -->
-<g id="edge95" class="edge"><title>Node4&#45;&gt;Node13</title>
-<path fill="none" stroke="midnightblue" d="M1946.98,-406.227C1936.92,-404.528 1926.39,-402.998 1916.48,-402 1831.55,-393.454 451.588,-413.213 380.478,-366 340.702,-339.591 363.285,-307.508 336.478,-268 327.287,-254.454 314.292,-241.253 303.994,-231.747"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1946.4,-409.678 1956.86,-407.968 1947.62,-402.785 1946.4,-409.678"/>
+<g id="edge99" class="edge"><title>Node4&#45;&gt;Node13</title>
+<path fill="none" stroke="midnightblue" d="M1891.58,-406.227C1881.81,-404.577 1871.64,-403.065 1862.05,-402 1816.95,-396.991 1077.42,-398.786 1046.05,-366 1036.53,-356.045 1037.78,-346.017 1046.05,-335 1080.51,-289.112 1135.59,-344.888 1170.05,-299 1178.33,-287.983 1179.22,-278.284 1170.05,-268 1145.32,-240.263 906.567,-225.584 791.915,-220.086"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1891.13,-409.701 1901.58,-407.982 1892.34,-402.806 1891.13,-409.701"/>
 </g>
 <!-- Node14 -->
 <g id="node14" class="node"><title>Node14</title>
 <g id="a_node14"><a xlink:href="cuda_2reduction_8h.html" target="_top" xlink:title="CUDA schedule for reduction operations. ">
-<polygon fill="white" stroke="black" points="981.978,-268.5 981.978,-298.5 1100.98,-298.5 1100.98,-268.5 981.978,-268.5"/>
-<text text-anchor="start" x="989.978" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/cuda</text>
-<text text-anchor="middle" x="1041.48" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/reduction.h</text>
+<polygon fill="white" stroke="black" points="1218.55,-268.5 1218.55,-298.5 1337.55,-298.5 1337.55,-268.5 1218.55,-268.5"/>
+<text text-anchor="start" x="1226.55" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/cuda</text>
+<text text-anchor="middle" x="1278.05" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/reduction.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node14 -->
 <g id="edge15" class="edge"><title>Node4&#45;&gt;Node14</title>
-<path fill="none" stroke="midnightblue" d="M1946.01,-406.233C1936.23,-404.583 1926.07,-403.069 1916.48,-402 1827.94,-392.129 1196.11,-403.887 1115.48,-366 1084.69,-351.533 1060.94,-317.782 1049.33,-298.558"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1945.55,-409.707 1956.01,-407.989 1946.76,-402.813 1945.55,-409.707"/>
+<path fill="none" stroke="midnightblue" d="M1889.01,-406.212C1880.03,-404.678 1870.79,-403.209 1862.05,-402 1699.7,-379.542 1644.8,-432.609 1495.05,-366 1476.37,-357.689 1478.96,-344.88 1461.05,-335 1439.84,-323.296 1382.65,-308.382 1337.79,-297.804"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1888.56,-409.686 1899.01,-407.961 1889.77,-402.791 1888.56,-409.686"/>
 </g>
 <!-- Node15 -->
 <g id="node15" class="node"><title>Node15</title>
 <g id="a_node15"><a xlink:href="rocm_2reduction_8h.html" target="_top" xlink:title="rocm schedule for reduction operations ">
-<polygon fill="white" stroke="black" points="980.978,-201.5 980.978,-231.5 1101.98,-231.5 1101.98,-201.5 980.978,-201.5"/>
-<text text-anchor="start" x="988.978" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/rocm</text>
-<text text-anchor="middle" x="1041.48" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00">/reduction.h</text>
+<polygon fill="white" stroke="black" points="1217.55,-201.5 1217.55,-231.5 1338.55,-231.5 1338.55,-201.5 1217.55,-201.5"/>
+<text text-anchor="start" x="1225.55" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/rocm</text>
+<text text-anchor="middle" x="1278.05" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00">/reduction.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node15 -->
-<g id="edge96" class="edge"><title>Node4&#45;&gt;Node15</title>
-<path fill="none" stroke="midnightblue" d="M1946.01,-406.185C1936.24,-404.539 1926.07,-403.039 1916.48,-402 1710.86,-379.725 1188.57,-409.991 986.478,-366 909.945,-349.34 867.082,-364.066 823.478,-299 783.608,-239.505 903.456,-223.326 980.964,-219.002"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1945.56,-409.659 1956.01,-407.939 1946.77,-402.764 1945.56,-409.659"/>
+<g id="edge100" class="edge"><title>Node4&#45;&gt;Node15</title>
+<path fill="none" stroke="midnightblue" d="M1887.15,-406.187C1878.75,-404.726 1870.18,-403.285 1862.05,-402 1745.96,-383.64 1706.21,-416.462 1600.05,-366 1542.4,-338.595 1551.38,-298.03 1495.05,-268 1446.07,-241.888 1383.82,-229.182 1338.56,-223.058"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1886.82,-409.682 1897.27,-407.97 1888.03,-402.788 1886.82,-409.682"/>
 </g>
 <!-- Node16 -->
 <g id="node16" class="node"><title>Node16</title>
 <g id="a_node16"><a xlink:href="cuda_2softmax_8h.html" target="_top" xlink:title="include/tvm/topi/cuda\l/softmax.h">
-<polygon fill="white" stroke="black" points="1271.98,-268.5 1271.98,-298.5 1390.98,-298.5 1390.98,-268.5 1271.98,-268.5"/>
-<text text-anchor="start" x="1279.98" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/cuda</text>
-<text text-anchor="middle" x="1331.48" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/softmax.h</text>
+<polygon fill="white" stroke="black" points="1543.55,-268.5 1543.55,-298.5 1662.55,-298.5 1662.55,-268.5 1543.55,-268.5"/>
+<text text-anchor="start" x="1551.55" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/cuda</text>
+<text text-anchor="middle" x="1603.05" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/softmax.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node16 -->
 <g id="edge17" class="edge"><title>Node4&#45;&gt;Node16</title>
-<path fill="none" stroke="midnightblue" d="M1944.72,-406.216C1935.34,-404.628 1925.64,-403.139 1916.48,-402 1803.51,-387.95 1507.88,-415.716 1405.48,-366 1375.05,-351.226 1351.32,-317.921 1339.58,-298.773"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1944.28,-409.692 1954.73,-407.967 1945.48,-402.796 1944.28,-409.692"/>
+<path fill="none" stroke="midnightblue" d="M1886.82,-406.209C1878.53,-404.756 1870.07,-403.311 1862.05,-402 1808.18,-393.189 1658.55,-403.687 1619.05,-366 1600.8,-348.579 1600.02,-316.891 1601.36,-298.582"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1886.38,-409.684 1896.83,-407.979 1887.59,-402.791 1886.38,-409.684"/>
 </g>
 <!-- Node17 -->
 <g id="node17" class="node"><title>Node17</title>
 <g id="a_node17"><a xlink:href="rocm_2softmax_8h.html" target="_top" xlink:title="include/tvm/topi/rocm\l/softmax.h">
-<polygon fill="white" stroke="black" points="1150.98,-201.5 1150.98,-231.5 1271.98,-231.5 1271.98,-201.5 1150.98,-201.5"/>
-<text text-anchor="start" x="1158.98" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/rocm</text>
-<text text-anchor="middle" x="1211.48" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00">/softmax.h</text>
+<polygon fill="white" stroke="black" points="1542.55,-201.5 1542.55,-231.5 1663.55,-231.5 1663.55,-201.5 1542.55,-201.5"/>
+<text text-anchor="start" x="1550.55" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/rocm</text>
+<text text-anchor="middle" x="1603.05" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00">/softmax.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node17 -->
-<g id="edge97" class="edge"><title>Node4&#45;&gt;Node17</title>
-<path fill="none" stroke="midnightblue" d="M1945.68,-406.233C1936.01,-404.598 1925.96,-403.09 1916.48,-402 1839.77,-393.18 1277.68,-419.992 1222.48,-366 1186.07,-330.389 1199.18,-261.647 1207.16,-231.853"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1945.13,-409.69 1955.59,-407.969 1946.34,-402.795 1945.13,-409.69"/>
+<g id="edge101" class="edge"><title>Node4&#45;&gt;Node17</title>
+<path fill="none" stroke="midnightblue" d="M1885.6,-406.161C1877.69,-404.754 1869.67,-403.335 1862.05,-402 1815.42,-393.828 1682.57,-402.198 1652.05,-366 1643.17,-355.467 1648.74,-348.374 1652.05,-335 1656.4,-317.439 1666.7,-316.561 1671.05,-299 1674.37,-285.626 1677.24,-280.309 1671.05,-268 1663.06,-252.104 1647.43,-239.953 1633.17,-231.558"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1885.06,-409.62 1895.52,-407.931 1886.29,-402.729 1885.06,-409.62"/>
 </g>
 <!-- Node18 -->
 <g id="node18" class="node"><title>Node18</title>
 <g id="a_node18"><a xlink:href="array__utils_8h.html" target="_top" xlink:title="Utility functions for handling arrays. ">
-<polygon fill="white" stroke="black" points="168.478,-335.5 168.478,-365.5 290.478,-365.5 290.478,-335.5 168.478,-335.5"/>
-<text text-anchor="start" x="176.478" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/detail</text>
-<text text-anchor="middle" x="229.478" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00">/array_utils.h</text>
+<polygon fill="white" stroke="black" points="369.053,-335.5 369.053,-365.5 491.053,-365.5 491.053,-335.5 369.053,-335.5"/>
+<text text-anchor="start" x="377.053" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/detail</text>
+<text text-anchor="middle" x="430.053" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00">/array_utils.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node18 -->
 <g id="edge19" class="edge"><title>Node4&#45;&gt;Node18</title>
-<path fill="none" stroke="midnightblue" d="M1946.98,-406.218C1936.92,-404.52 1926.39,-402.992 1916.48,-402 1203.42,-330.648 1017.03,-442.285 304.478,-366 299.995,-365.52 295.379,-364.912 290.746,-364.218"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1946.4,-409.669 1956.86,-407.959 1947.62,-402.776 1946.4,-409.669"/>
+<path fill="none" stroke="midnightblue" d="M1891.92,-406.158C1882.05,-404.498 1871.76,-403 1862.05,-402 1299.47,-344.011 1154.62,-399.538 590.053,-366 557.338,-364.057 520.948,-360.842 491.148,-357.934"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1891.56,-409.648 1902.01,-407.931 1892.77,-402.754 1891.56,-409.648"/>
 </g>
 <!-- Node19 -->
 <g id="node19" class="node"><title>Node19</title>
 <g id="a_node19"><a xlink:href="detail_2broadcast_8h.html" target="_top" xlink:title="Detail broadcast. ">
-<polygon fill="white" stroke="black" points="2431.48,-268.5 2431.48,-298.5 2553.48,-298.5 2553.48,-268.5 2431.48,-268.5"/>
-<text text-anchor="start" x="2439.48" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/detail</text>
-<text text-anchor="middle" x="2492.48" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/broadcast.h</text>
+<polygon fill="white" stroke="black" points="2591.05,-268.5 2591.05,-298.5 2713.05,-298.5 2713.05,-268.5 2591.05,-268.5"/>
+<text text-anchor="start" x="2599.05" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/detail</text>
+<text text-anchor="middle" x="2652.05" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/broadcast.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node19 -->
 <g id="edge24" class="edge"><title>Node4&#45;&gt;Node19</title>
-<path fill="none" stroke="midnightblue" d="M2079.17,-407.083C2127.15,-399.7 2188.23,-387.039 2239.48,-366 2262.97,-356.355 2264.16,-345.064 2287.48,-335 2313.05,-323.96 2380.53,-308.227 2431.13,-297.266"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2078.51,-403.642 2069.14,-408.58 2079.54,-410.566 2078.51,-403.642"/>
+<path fill="none" stroke="midnightblue" d="M2024.89,-415.368C2195.33,-412.306 2576.47,-401.527 2623.05,-366 2644.07,-349.971 2649.9,-317.461 2651.48,-298.707"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2024.65,-411.872 2014.71,-415.546 2024.77,-418.87 2024.65,-411.872"/>
 </g>
 <!-- Node22 -->
 <g id="node22" class="node"><title>Node22</title>
 <g id="a_node22"><a xlink:href="reduction_8h.html" target="_top" xlink:title="Reduction op constructors. ">
-<polygon fill="white" stroke="black" points="2500.48,-73 2500.48,-92 2650.48,-92 2650.48,-73 2500.48,-73"/>
-<text text-anchor="middle" x="2575.48" y="-80" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/reduction.h</text>
+<polygon fill="white" stroke="black" points="2548.05,-73 2548.05,-92 2698.05,-92 2698.05,-73 2548.05,-73"/>
+<text text-anchor="middle" x="2623.05" y="-80" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/reduction.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node22 -->
-<g id="edge91" class="edge"><title>Node4&#45;&gt;Node22</title>
-<path fill="none" stroke="midnightblue" d="M2035.93,-404.804C2062.12,-396.017 2097.06,-382.653 2125.48,-366 2144.28,-354.981 2144.72,-346.1 2163.48,-335 2200.2,-313.27 2215.64,-320.528 2252.48,-299 2293.71,-274.907 2306.13,-268.871 2336.48,-232 2367.68,-194.09 2349.2,-164.748 2387.48,-134 2419.36,-108.392 2463.1,-95.7098 2500.37,-89.4553"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2034.76,-401.503 2026.35,-407.945 2036.94,-408.155 2034.76,-401.503"/>
+<g id="edge95" class="edge"><title>Node4&#45;&gt;Node22</title>
+<path fill="none" stroke="midnightblue" d="M2024.92,-416.175C2190.45,-414.661 2552.42,-406.684 2590.05,-366 2610.42,-343.985 2585.33,-328.81 2582.05,-299 2576.23,-245.957 2590.85,-150.982 2596.05,-134 2600.83,-118.412 2610.57,-102.071 2616.95,-92.3389"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2024.76,-412.676 2014.79,-416.261 2024.82,-419.676 2024.76,-412.676"/>
 </g>
 <!-- Node24 -->
 <g id="node24" class="node"><title>Node24</title>
 <g id="a_node24"><a xlink:href="nn_2softmax_8h.html" target="_top" xlink:title="Softmax op constructions. ">
-<polygon fill="white" stroke="black" points="1116.48,-0.5 1116.48,-30.5 1224.48,-30.5 1224.48,-0.5 1116.48,-0.5"/>
-<text text-anchor="start" x="1124.48" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/nn</text>
-<text text-anchor="middle" x="1170.48" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00">/softmax.h</text>
+<polygon fill="white" stroke="black" points="2260.05,-0.5 2260.05,-30.5 2368.05,-30.5 2368.05,-0.5 2260.05,-0.5"/>
+<text text-anchor="start" x="2268.05" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/nn</text>
+<text text-anchor="middle" x="2314.05" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00">/softmax.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node24 -->
-<g id="edge89" class="edge"><title>Node4&#45;&gt;Node24</title>
-<path fill="none" stroke="midnightblue" d="M1946.99,-406.192C1936.92,-404.496 1926.39,-402.976 1916.48,-402 1864.08,-396.845 62.4034,-399.12 21.478,-366 -7.43389,-342.602 2.47796,-321.694 2.47796,-284.5 2.47796,-284.5 2.47796,-284.5 2.47796,-148.5 2.47796,-34.9231 885.783,-19.0601 1116.1,-16.8544"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1946.41,-409.643 1956.86,-407.933 1947.62,-402.75 1946.41,-409.643"/>
+<g id="edge93" class="edge"><title>Node4&#45;&gt;Node24</title>
+<path fill="none" stroke="midnightblue" d="M1994.87,-405.342C2052.94,-389.436 2139.05,-354.113 2139.05,-284.5 2139.05,-284.5 2139.05,-284.5 2139.05,-148.5 2139.05,-110.74 2136.84,-94.1814 2163.05,-67 2188.38,-40.7343 2227.74,-28.1287 2259.92,-22.0794"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1993.88,-401.984 1985.1,-407.91 1995.66,-408.754 1993.88,-401.984"/>
 </g>
 <!-- Node25 -->
 <g id="node25" class="node"><title>Node25</title>
 <g id="a_node25"><a xlink:href="reorg_8h.html" target="_top" xlink:title="Reorg op constructions. ">
-<polygon fill="white" stroke="black" points="2461.98,-0.5 2461.98,-30.5 2584.98,-30.5 2584.98,-0.5 2461.98,-0.5"/>
-<text text-anchor="start" x="2469.98" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/vision</text>
-<text text-anchor="middle" x="2523.48" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00">/reorg.h</text>
+<polygon fill="white" stroke="black" points="2509.55,-0.5 2509.55,-30.5 2632.55,-30.5 2632.55,-0.5 2509.55,-0.5"/>
+<text text-anchor="start" x="2517.55" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/vision</text>
+<text text-anchor="middle" x="2571.05" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00">/reorg.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node25 -->
-<g id="edge99" class="edge"><title>Node4&#45;&gt;Node25</title>
-<path fill="none" stroke="midnightblue" d="M2079.4,-417.041C2262.33,-417.39 2691.47,-413.124 2735.48,-366 2744.88,-355.931 2744.65,-345.284 2735.48,-335 2688.88,-282.741 2491.08,-313.003 2422.48,-299 2319.83,-278.047 2259.15,-315.951 2196.48,-232 2117.47,-126.169 2346.64,-56.8682 2461.79,-29.5937"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2079.31,-413.54 2069.3,-417.016 2079.29,-420.54 2079.31,-413.54"/>
+<g id="edge103" class="edge"><title>Node4&#45;&gt;Node25</title>
+<path fill="none" stroke="midnightblue" d="M2018.16,-405.997C2086.59,-391.692 2177.05,-358.742 2177.05,-284.5 2177.05,-284.5 2177.05,-284.5 2177.05,-148.5 2177.05,-79.246 2398.68,-39.623 2509.49,-24.1399"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2017.36,-402.587 2008.23,-407.974 2018.73,-409.452 2017.36,-402.587"/>
 </g>
 <!-- Node26 -->
 <g id="node26" class="node"><title>Node26</title>
 <g id="a_node26"><a xlink:href="bias__add_8h.html" target="_top" xlink:title="bias_add op constructions ">
-<polygon fill="white" stroke="black" points="2721.48,-67.5 2721.48,-97.5 2829.48,-97.5 2829.48,-67.5 2721.48,-67.5"/>
-<text text-anchor="start" x="2729.48" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/nn</text>
-<text text-anchor="middle" x="2775.48" y="-74.5" font-family="Helvetica,sans-Serif" font-size="10.00">/bias_add.h</text>
+<polygon fill="white" stroke="black" points="2384.05,-67.5 2384.05,-97.5 2492.05,-97.5 2492.05,-67.5 2384.05,-67.5"/>
+<text text-anchor="start" x="2392.05" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/nn</text>
+<text text-anchor="middle" x="2438.05" y="-74.5" font-family="Helvetica,sans-Serif" font-size="10.00">/bias_add.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node26 -->
-<g id="edge80" class="edge"><title>Node4&#45;&gt;Node26</title>
-<path fill="none" stroke="midnightblue" d="M2079.26,-415.012C2298.72,-410.621 2885.99,-396.101 2914.48,-366 2994.51,-281.441 2845.14,-142.025 2792.7,-97.5763"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2079.02,-411.516 2069.09,-415.213 2079.16,-418.514 2079.02,-411.516"/>
+<g id="edge84" class="edge"><title>Node4&#45;&gt;Node26</title>
+<path fill="none" stroke="midnightblue" d="M2025.12,-408.897C2087.69,-401.476 2167.06,-388.025 2191.05,-366 2218.87,-340.465 2215.05,-322.26 2215.05,-284.5 2215.05,-284.5 2215.05,-284.5 2215.05,-215.5 2215.05,-137.331 2317.78,-104.465 2383.79,-91.4146"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2024.38,-405.46 2014.84,-410.081 2025.18,-412.414 2024.38,-405.46"/>
 </g>
 <!-- Node27 -->
 <g id="node27" class="node"><title>Node27</title>
 <g id="a_node27"><a xlink:href="topi_2transform_8h.html" target="_top" xlink:title="Transform op constructors. ">
-<polygon fill="white" stroke="black" points="2396.48,-140 2396.48,-159 2548.48,-159 2548.48,-140 2396.48,-140"/>
-<text text-anchor="middle" x="2472.48" y="-147" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/transform.h</text>
+<polygon fill="white" stroke="black" points="2364.05,-140 2364.05,-159 2516.05,-159 2516.05,-140 2364.05,-140"/>
+<text text-anchor="middle" x="2440.05" y="-147" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/transform.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node27 -->
-<g id="edge98" class="edge"><title>Node4&#45;&gt;Node27</title>
-<path fill="none" stroke="midnightblue" d="M2071.99,-406.198C2111.89,-398.705 2161.01,-386.225 2201.48,-366 2220.97,-356.256 2220.03,-344.832 2239.48,-335 2296.82,-306.019 2321.3,-326.258 2379.48,-299 2400.81,-289.004 2401.79,-279.271 2422.48,-268 2461.04,-246.995 2489.73,-268.266 2514.48,-232 2531.44,-207.152 2500.46,-174.571 2483.09,-159.219"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2071.27,-402.771 2062.05,-407.997 2072.52,-409.659 2071.27,-402.771"/>
+<g id="edge102" class="edge"><title>Node4&#45;&gt;Node27</title>
+<path fill="none" stroke="midnightblue" d="M2024.79,-409.563C2097.26,-402.082 2196.52,-388.168 2229.05,-366 2300.11,-317.583 2265.75,-258.078 2330.05,-201 2353.15,-180.494 2386,-166.886 2409.77,-159.017"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2024.34,-406.09 2014.75,-410.576 2025.05,-413.054 2024.34,-406.09"/>
 </g>
 <!-- Node4&#45;&gt;Node28 -->
-<g id="edge90" class="edge"><title>Node4&#45;&gt;Node28</title>
-<path fill="none" stroke="midnightblue" d="M2079.35,-417.016C2295.27,-417.694 2868.31,-414.82 2937.48,-366 2969.54,-343.368 2971.48,-323.748 2971.48,-284.5 2971.48,-284.5 2971.48,-284.5 2971.48,-215.5 2971.48,-168.979 2971.48,-113.236 2971.48,-92.1487"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2079.36,-413.516 2069.35,-416.981 2079.33,-420.516 2079.36,-413.516"/>
+<g id="edge94" class="edge"><title>Node4&#45;&gt;Node28</title>
+<path fill="none" stroke="midnightblue" d="M2024.98,-416.191C2252.23,-414.811 2879.19,-407.265 2962.05,-366 2979.42,-357.35 3008.52,-315.903 3018.05,-299 3058.8,-226.738 3066.38,-123.001 3067.76,-92.3469"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2024.77,-412.692 2014.79,-416.25 2024.81,-419.692 2024.77,-412.692"/>
 </g>
 <!-- Node4&#45;&gt;Node29 -->
 <g id="edge40" class="edge"><title>Node4&#45;&gt;Node29</title>
-<path fill="none" stroke="midnightblue" d="M2079.39,-414.521C2220.72,-410.353 2519.3,-398.236 2769.48,-366 2773.95,-365.424 2778.56,-364.745 2783.18,-364.001"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2078.97,-411.032 2069.08,-414.819 2079.17,-418.029 2078.97,-411.032"/>
-</g>
-<!-- Node4&#45;&gt;Node30 -->
-<g id="edge81" class="edge"><title>Node4&#45;&gt;Node30</title>
-<path fill="none" stroke="midnightblue" d="M2079.57,-416.224C2308.04,-414.931 2940.01,-407.594 3023.48,-366 3052.84,-351.37 3074.24,-317.683 3084.56,-298.513"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2079.31,-412.725 2069.32,-416.28 2079.34,-419.725 2079.31,-412.725"/>
+<path fill="none" stroke="midnightblue" d="M2024.95,-412.468C2170.97,-405.037 2485.85,-387.915 2751.05,-366 2777.29,-363.832 2806.23,-360.943 2831.03,-358.314"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2024.54,-408.984 2014.73,-412.986 2024.9,-415.975 2024.54,-408.984"/>
 </g>
-<!-- Node31 -->
-<g id="node31" class="node"><title>Node31</title>
-<g id="a_node31"><a xlink:href="flatten_8h.html" target="_top" xlink:title="Softmax op constructions. ">
-<polygon fill="white" stroke="black" points="2571.48,-268.5 2571.48,-298.5 2679.48,-298.5 2679.48,-268.5 2571.48,-268.5"/>
-<text text-anchor="start" x="2579.48" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/nn</text>
-<text text-anchor="middle" x="2625.48" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/flatten.h</text>
+<!-- Node30 -->
+<g id="node30" class="node"><title>Node30</title>
+<g id="a_node30"><a xlink:href="einsum_8h.html" target="_top" xlink:title="Einstein summation op. ">
+<polygon fill="white" stroke="black" points="2319.05,-274 2319.05,-293 2459.05,-293 2459.05,-274 2319.05,-274"/>
+<text text-anchor="middle" x="2389.05" y="-281" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/einsum.h</text>
 </a>
 </g>
 </g>
+<!-- Node4&#45;&gt;Node30 -->
+<g id="edge80" class="edge"><title>Node4&#45;&gt;Node30</title>
+<path fill="none" stroke="midnightblue" d="M2024.94,-413.525C2152.5,-408.039 2386.71,-394.215 2412.05,-366 2430.91,-344.999 2408.55,-309.656 2396.18,-293.297"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2024.67,-410.033 2014.82,-413.952 2024.96,-417.027 2024.67,-410.033"/>
+</g>
 <!-- Node4&#45;&gt;Node31 -->
-<g id="edge86" class="edge"><title>Node4&#45;&gt;Node31</title>
-<path fill="none" stroke="midnightblue" d="M2079.11,-411.39C2136.48,-405.528 2214,-392.954 2277.48,-366 2299.16,-356.792 2298.65,-343.873 2320.48,-335 2421.21,-294.046 2455.63,-319.21 2562.48,-299 2565.35,-298.458 2568.28,-297.876 2571.24,-297.268"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2078.62,-407.922 2069,-412.375 2079.3,-414.889 2078.62,-407.922"/>
+<g id="edge85" class="edge"><title>Node4&#45;&gt;Node31</title>
+<path fill="none" stroke="midnightblue" d="M2025.2,-415.391C2262.65,-411.81 2938.92,-398.819 3033.05,-366 3055.3,-358.244 3055.9,-347.196 3076.05,-335 3098.53,-321.4 3125.15,-308.202 3145.41,-298.71"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2024.86,-411.896 2014.91,-415.545 2024.96,-418.895 2024.86,-411.896"/>
 </g>
 <!-- Node32 -->
 <g id="node32" class="node"><title>Node32</title>
-<g id="a_node32"><a xlink:href="detail_2extern_8h.html" target="_top" xlink:title="Helpers for using external functions. ">
-<polygon fill="white" stroke="black" points="608.478,-335.5 608.478,-365.5 730.478,-365.5 730.478,-335.5 608.478,-335.5"/>
-<text text-anchor="start" x="616.478" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/detail</text>
-<text text-anchor="middle" x="669.478" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00">/extern.h</text>
+<g id="a_node32"><a xlink:href="flatten_8h.html" target="_top" xlink:title="Softmax op constructions. ">
+<polygon fill="white" stroke="black" points="2731.05,-268.5 2731.05,-298.5 2839.05,-298.5 2839.05,-268.5 2731.05,-268.5"/>
+<text text-anchor="start" x="2739.05" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/nn</text>
+<text text-anchor="middle" x="2785.05" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/flatten.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node32 -->
-<g id="edge49" class="edge"><title>Node4&#45;&gt;Node32</title>
-<path fill="none" stroke="midnightblue" d="M1946.34,-406.177C1936.47,-404.516 1926.18,-403.012 1916.48,-402 1398.15,-347.94 1262.41,-423.645 744.478,-366 739.997,-365.501 735.383,-364.879 730.75,-364.175"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1945.98,-409.667 1956.44,-407.951 1947.19,-402.773 1945.98,-409.667"/>
+<g id="edge90" class="edge"><title>Node4&#45;&gt;Node32</title>
+<path fill="none" stroke="midnightblue" d="M2025,-415.359C2185.15,-412.356 2533.99,-401.795 2648.05,-366 2694.79,-351.333 2742.39,-317.987 2767.32,-298.803"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2024.66,-411.865 2014.72,-415.546 2024.79,-418.864 2024.66,-411.865"/>
 </g>
 <!-- Node33 -->
 <g id="node33" class="node"><title>Node33</title>
-<g id="a_node33"><a xlink:href="fuse_8h.html" target="_top" xlink:title="Fuse operation. ">
-<polygon fill="white" stroke="black" points="1231.48,-335.5 1231.48,-365.5 1353.48,-365.5 1353.48,-335.5 1231.48,-335.5"/>
-<text text-anchor="start" x="1239.48" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/detail</text>
-<text text-anchor="middle" x="1292.48" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00">/fuse.h</text>
+<g id="a_node33"><a xlink:href="detail_2extern_8h.html" target="_top" xlink:title="Helpers for using external functions. ">
+<polygon fill="white" stroke="black" points="1837.05,-335.5 1837.05,-365.5 1959.05,-365.5 1959.05,-335.5 1837.05,-335.5"/>
+<text text-anchor="start" x="1845.05" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/detail</text>
+<text text-anchor="middle" x="1898.05" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00">/extern.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node33 -->
-<g id="edge52" class="edge"><title>Node4&#45;&gt;Node33</title>
-<path fill="none" stroke="midnightblue" d="M1944.72,-406.179C1935.34,-404.594 1925.64,-403.117 1916.48,-402 1673.75,-372.41 1609.99,-397.269 1367.48,-366 1363.01,-365.423 1358.4,-364.744 1353.77,-364"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1944.28,-409.655 1954.73,-407.928 1945.49,-402.759 1944.28,-409.655"/>
+<g id="edge50" class="edge"><title>Node4&#45;&gt;Node33</title>
+<path fill="none" stroke="midnightblue" d="M1931.18,-399.355C1923.89,-388.824 1914.67,-375.502 1907.81,-365.589"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1928.41,-401.504 1936.98,-407.734 1934.17,-397.52 1928.41,-401.504"/>
 </g>
 <!-- Node34 -->
 <g id="node34" class="node"><title>Node34</title>
-<g id="a_node34"><a xlink:href="generic_2default_8h.html" target="_top" xlink:title="Generic default schedule. ">
-<polygon fill="white" stroke="black" points="1622.98,-268.5 1622.98,-298.5 1753.98,-298.5 1753.98,-268.5 1622.98,-268.5"/>
-<text text-anchor="start" x="1630.98" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/generic</text>
-<text text-anchor="middle" x="1688.48" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/default.h</text>
+<g id="a_node34"><a xlink:href="fuse_8h.html" target="_top" xlink:title="Fuse operation. ">
+<polygon fill="white" stroke="black" points="839.053,-335.5 839.053,-365.5 961.053,-365.5 961.053,-335.5 839.053,-335.5"/>
+<text text-anchor="start" x="847.053" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/detail</text>
+<text text-anchor="middle" x="900.053" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00">/fuse.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node34 -->
-<g id="edge77" class="edge"><title>Node4&#45;&gt;Node34</title>
-<path fill="none" stroke="midnightblue" d="M1967.88,-403.855C1909.94,-379.105 1781.94,-324.427 1721.42,-298.572"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1966.63,-407.129 1977.2,-407.839 1969.38,-400.692 1966.63,-407.129"/>
+<g id="edge53" class="edge"><title>Node4&#45;&gt;Node34</title>
+<path fill="none" stroke="midnightblue" d="M1891.59,-406.199C1881.81,-404.553 1871.64,-403.048 1862.05,-402 1469.84,-359.138 1366.96,-411.575 975.053,-366 970.574,-365.479 965.962,-364.841 961.331,-364.126"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1891.13,-409.673 1901.59,-407.954 1892.34,-402.779 1891.13,-409.673"/>
 </g>
 <!-- Node35 -->
 <g id="node35" class="node"><title>Node35</title>
-<g id="a_node35"><a xlink:href="generic_2extern_8h.html" target="_top" xlink:title="Schedule for extern followed by injective ops. ">
-<polygon fill="white" stroke="black" points="515.978,-201.5 515.978,-231.5 646.978,-231.5 646.978,-201.5 515.978,-201.5"/>
-<text text-anchor="start" x="523.978" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/generic</text>
-<text text-anchor="middle" x="581.478" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00">/extern.h</text>
+<g id="a_node35"><a xlink:href="generic_2default_8h.html" target="_top" xlink:title="Generic default schedule. ">
+<polygon fill="white" stroke="black" points="1355.55,-268.5 1355.55,-298.5 1486.55,-298.5 1486.55,-268.5 1355.55,-268.5"/>
+<text text-anchor="start" x="1363.55" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/generic</text>
+<text text-anchor="middle" x="1421.05" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/default.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node35 -->
-<g id="edge78" class="edge"><title>Node4&#45;&gt;Node35</title>
-<path fill="none" stroke="midnightblue" d="M1946.98,-406.257C1936.91,-404.556 1926.39,-403.017 1916.48,-402 1843.67,-394.53 653.591,-415.286 599.478,-366 561.462,-331.375 571.484,-261.477 577.977,-231.582"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1946.4,-409.709 1956.86,-408 1947.62,-402.815 1946.4,-409.709"/>
+<g id="edge81" class="edge"><title>Node4&#45;&gt;Node35</title>
+<path fill="none" stroke="midnightblue" d="M1888.08,-406.165C1879.39,-404.673 1870.49,-403.229 1862.05,-402 1720.87,-381.419 1677.05,-415.013 1543.05,-366 1537.46,-363.955 1474.94,-321.41 1441.65,-298.625"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1887.7,-409.651 1898.15,-407.928 1888.9,-402.755 1887.7,-409.651"/>
 </g>
 <!-- Node36 -->
 <g id="node36" class="node"><title>Node36</title>
-<g id="a_node36"><a xlink:href="generic_2injective_8h.html" target="_top" xlink:title="Generic schedule for injective operations. ">
-<polygon fill="white" stroke="black" points="832.978,-268.5 832.978,-298.5 963.978,-298.5 963.978,-268.5 832.978,-268.5"/>
-<text text-anchor="start" x="840.978" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/generic</text>
-<text text-anchor="middle" x="898.478" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/injective.h</text>
+<g id="a_node36"><a xlink:href="generic_2extern_8h.html" target="_top" xlink:title="Schedule for extern followed by injective ops. ">
+<polygon fill="white" stroke="black" points="55.5529,-201.5 55.5529,-231.5 186.553,-231.5 186.553,-201.5 55.5529,-201.5"/>
+<text text-anchor="start" x="63.5529" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/generic</text>
+<text text-anchor="middle" x="121.053" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00">/extern.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node36 -->
-<g id="edge79" class="edge"><title>Node4&#45;&gt;Node36</title>
-<path fill="none" stroke="midnightblue" d="M1946.01,-406.216C1936.24,-404.568 1926.07,-403.059 1916.48,-402 1730.33,-381.453 1256.47,-410.19 1074.48,-366 1016.23,-351.858 953.896,-317.98 921.317,-298.649"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1945.56,-409.69 1956.01,-407.972 1946.77,-402.796 1945.56,-409.69"/>
+<g id="edge82" class="edge"><title>Node4&#45;&gt;Node36</title>
+<path fill="none" stroke="midnightblue" d="M1892.56,-406.205C1882.49,-404.508 1871.97,-402.984 1862.05,-402 1669.18,-382.865 304.005,-420.659 118.053,-366 55.5515,-347.628 -26.299,-322.719 9.05291,-268 20.3383,-250.532 39.2734,-239.045 58.3943,-231.516"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1891.98,-409.656 1902.44,-407.946 1893.2,-402.763 1891.98,-409.656"/>
 </g>
 <!-- Node37 -->
 <g id="node37" class="node"><title>Node37</title>
-<g id="a_node37"><a xlink:href="x86_2bnn_8h.html" target="_top" xlink:title="x86 schedule for binary operations ">
-<polygon fill="white" stroke="black" points="1771.98,-268.5 1771.98,-298.5 1884.98,-298.5 1884.98,-268.5 1771.98,-268.5"/>
-<text text-anchor="start" x="1779.98" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/x86</text>
-<text text-anchor="middle" x="1828.48" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/bnn.h</text>
+<g id="a_node37"><a xlink:href="generic_2injective_8h.html" target="_top" xlink:title="Generic schedule for injective operations. ">
+<polygon fill="white" stroke="black" points="18.5529,-268.5 18.5529,-298.5 149.553,-298.5 149.553,-268.5 18.5529,-268.5"/>
+<text text-anchor="start" x="26.5529" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/generic</text>
+<text text-anchor="middle" x="84.0529" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/injective.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node37 -->
-<g id="edge100" class="edge"><title>Node4&#45;&gt;Node37</title>
-<path fill="none" stroke="midnightblue" d="M1978.44,-401.634C1945.58,-375.966 1878.64,-323.679 1846.49,-298.572"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1976.35,-404.442 1986.39,-407.839 1980.66,-398.925 1976.35,-404.442"/>
+<g id="edge83" class="edge"><title>Node4&#45;&gt;Node37</title>
+<path fill="none" stroke="midnightblue" d="M1892.56,-406.209C1882.49,-404.512 1871.97,-402.987 1862.05,-402 1768.16,-392.658 250.665,-403.432 164.053,-366 131.657,-351.999 105.635,-318.066 92.7852,-298.688"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1891.98,-409.661 1902.44,-407.95 1893.19,-402.767 1891.98,-409.661"/>
 </g>
 <!-- Node38 -->
 <g id="node38" class="node"><title>Node38</title>
-<g id="a_node38"><a xlink:href="x86_2default_8h.html" target="_top" xlink:title="default x86 schedule ">
-<polygon fill="white" stroke="black" points="1902.98,-268.5 1902.98,-298.5 2015.98,-298.5 2015.98,-268.5 1902.98,-268.5"/>
-<text text-anchor="start" x="1910.98" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/x86</text>
-<text text-anchor="middle" x="1959.48" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/default.h</text>
+<g id="a_node38"><a xlink:href="x86_2bnn_8h.html" target="_top" xlink:title="x86 schedule for binary operations ">
+<polygon fill="white" stroke="black" points="595.553,-268.5 595.553,-298.5 708.553,-298.5 708.553,-268.5 595.553,-268.5"/>
+<text text-anchor="start" x="603.553" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/x86</text>
+<text text-anchor="middle" x="652.053" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/bnn.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node38 -->
-<g id="edge101" class="edge"><title>Node4&#45;&gt;Node38</title>
-<path fill="none" stroke="midnightblue" d="M1992.12,-397.889C1984.45,-371.244 1970.5,-322.798 1963.59,-298.783"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1988.85,-399.198 1994.98,-407.839 1995.58,-397.261 1988.85,-399.198"/>
+<g id="edge104" class="edge"><title>Node4&#45;&gt;Node38</title>
+<path fill="none" stroke="midnightblue" d="M1891.91,-406.201C1882.04,-404.538 1871.75,-403.027 1862.05,-402 1627.01,-377.112 1026.84,-429.054 799.053,-366 748.994,-352.143 697.479,-318.154 670.735,-298.728"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1891.55,-409.692 1902.01,-407.976 1892.77,-402.797 1891.55,-409.692"/>
 </g>
 <!-- Node39 -->
 <g id="node39" class="node"><title>Node39</title>
-<g id="a_node39"><a xlink:href="x86_2injective_8h.html" target="_top" xlink:title="x86 schedule for injective ops ">
-<polygon fill="white" stroke="black" points="2033.98,-268.5 2033.98,-298.5 2146.98,-298.5 2146.98,-268.5 2033.98,-268.5"/>
-<text text-anchor="start" x="2041.98" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/x86</text>
-<text text-anchor="middle" x="2090.48" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/injective.h</text>
+<g id="a_node39"><a xlink:href="x86_2default_8h.html" target="_top" xlink:title="default x86 schedule ">
+<polygon fill="white" stroke="black" points="726.553,-268.5 726.553,-298.5 839.553,-298.5 839.553,-268.5 726.553,-268.5"/>
+<text text-anchor="start" x="734.553" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/x86</text>
+<text text-anchor="middle" x="783.053" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/default.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node39 -->
-<g id="edge102" class="edge"><title>Node4&#45;&gt;Node39</title>
-<path fill="none" stroke="midnightblue" d="M1999.51,-397.754C2002.18,-380.199 2008.19,-353.907 2021.48,-335 2032.23,-319.698 2049.12,-307.225 2063.49,-298.504"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1996.02,-397.496 1998.2,-407.863 2002.96,-398.396 1996.02,-397.496"/>
+<g id="edge105" class="edge"><title>Node4&#45;&gt;Node39</title>
+<path fill="none" stroke="midnightblue" d="M1891.91,-406.209C1882.04,-404.545 1871.75,-403.032 1862.05,-402 1747.96,-389.86 928.388,-425.116 830.053,-366 805.275,-351.104 792.293,-317.846 786.639,-298.738"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1891.55,-409.7 1902.01,-407.984 1892.77,-402.805 1891.55,-409.7"/>
 </g>
 <!-- Node40 -->
 <g id="node40" class="node"><title>Node40</title>
-<g id="a_node40"><a xlink:href="pad__utils_8h.html" target="_top" xlink:title="Padding helpers. ">
-<polygon fill="white" stroke="black" points="3037.48,-134.5 3037.48,-164.5 3159.48,-164.5 3159.48,-134.5 3037.48,-134.5"/>
-<text text-anchor="start" x="3045.48" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/detail</text>
-<text text-anchor="middle" x="3098.48" y="-141.5" font-family="Helvetica,sans-Serif" font-size="10.00">/pad_utils.h</text>
+<g id="a_node40"><a xlink:href="x86_2injective_8h.html" target="_top" xlink:title="x86 schedule for injective ops ">
+<polygon fill="white" stroke="black" points="1048.55,-268.5 1048.55,-298.5 1161.55,-298.5 1161.55,-268.5 1048.55,-268.5"/>
+<text text-anchor="start" x="1056.55" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/x86</text>
+<text text-anchor="middle" x="1105.05" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00">/injective.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node40 -->
-<g id="edge70" class="edge"><title>Node4&#45;&gt;Node40</title>
-<path fill="none" stroke="midnightblue" d="M2079.15,-417.182C2302.5,-418.373 2911.79,-416.87 2985.48,-366 3024.62,-338.979 3006.71,-310.29 3028.48,-268 3048.26,-229.576 3075.29,-186.398 3089.27,-164.646"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2079.14,-413.682 2069.12,-417.125 2079.1,-420.681 2079.14,-413.682"/>
+<g id="edge106" class="edge"><title>Node4&#45;&gt;Node40</title>
+<path fill="none" stroke="midnightblue" d="M1891.58,-406.214C1881.81,-404.565 1871.64,-403.057 1862.05,-402 1815.13,-396.829 1045.72,-400.082 1013.05,-366 988.764,-340.659 1033.68,-314.234 1068.89,-298.58"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1891.13,-409.687 1901.59,-407.969 1892.34,-402.793 1891.13,-409.687"/>
 </g>
 <!-- Node41 -->
 <g id="node41" class="node"><title>Node41</title>
-<g id="a_node41"><a xlink:href="ravel__unravel_8h.html" target="_top" xlink:title="Index ravel and unraval operations. ">
-<polygon fill="white" stroke="black" points="2205.48,-201.5 2205.48,-231.5 2327.48,-231.5 2327.48,-201.5 2205.48,-201.5"/>
-<text text-anchor="start" x="2213.48" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/detail</text>
-<text text-anchor="middle" x="2266.48" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00">/ravel_unravel.h</text>
+<g id="a_node41"><a xlink:href="pad__utils_8h.html" target="_top" xlink:title="Padding helpers. ">
+<polygon fill="white" stroke="black" points="3147.05,-134.5 3147.05,-164.5 3269.05,-164.5 3269.05,-134.5 3147.05,-134.5"/>
+<text text-anchor="start" x="3155.05" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/detail</text>
+<text text-anchor="middle" x="3208.05" y="-141.5" font-family="Helvetica,sans-Serif" font-size="10.00">/pad_utils.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node41 -->
-<g id="edge72" class="edge"><title>Node4&#45;&gt;Node41</title>
-<path fill="none" stroke="midnightblue" d="M2022.41,-403.054C2040.61,-393.107 2065.79,-379.073 2087.48,-366 2109.23,-352.891 2112.87,-346.574 2135.48,-335 2174.56,-314.989 2195.31,-328.889 2227.48,-299 2247.47,-280.421 2258.3,-249.447 2263.19,-231.51"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2020.69,-400.002 2013.58,-407.856 2024.04,-406.151 2020.69,-400.002"/>
+<g id="edge71" class="edge"><title>Node4&#45;&gt;Node41</title>
+<path fill="none" stroke="midnightblue" d="M2024.93,-415.679C2256.69,-412.921 2906.2,-402.033 2995.05,-366 3005.49,-361.77 3063,-306.87 3071.05,-299 3119.21,-251.919 3172.19,-191.928 3195.68,-164.875"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2024.84,-412.18 2014.88,-415.797 2024.92,-419.179 2024.84,-412.18"/>
 </g>
 <!-- Node42 -->
 <g id="node42" class="node"><title>Node42</title>
-<g id="a_node42"><a xlink:href="tensor__utils_8h.html" target="_top" xlink:title="Utility functions for handling tensor. ">
-<polygon fill="white" stroke="black" points="2383.48,-201.5 2383.48,-231.5 2505.48,-231.5 2505.48,-201.5 2383.48,-201.5"/>
-<text text-anchor="start" x="2391.48" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/detail</text>
-<text text-anchor="middle" x="2444.48" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00">/tensor_utils.h</text>
+<g id="a_node42"><a xlink:href="ravel__unravel_8h.html" target="_top" xlink:title="Index ravel and unraval operations. ">
+<polygon fill="white" stroke="black" points="2459.05,-335.5 2459.05,-365.5 2581.05,-365.5 2581.05,-335.5 2459.05,-335.5"/>
+<text text-anchor="start" x="2467.05" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/detail</text>
+<text text-anchor="middle" x="2520.05" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00">/ravel_unravel.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node42 -->
-<g id="edge75" class="edge"><title>Node4&#45;&gt;Node42</title>
-<path fill="none" stroke="midnightblue" d="M2046.25,-405.476C2080.05,-396.929 2125.59,-383.578 2163.48,-366 2186.52,-355.312 2188.31,-345.397 2211.48,-335 2266.18,-310.456 2288.02,-326.139 2341.48,-299 2376.86,-281.039 2411.73,-249.843 2430.43,-231.7"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2045.03,-402.173 2036.17,-407.973 2046.71,-408.967 2045.03,-402.173"/>
+<g id="edge73" class="edge"><title>Node4&#45;&gt;Node42</title>
+<path fill="none" stroke="midnightblue" d="M2024.79,-410.568C2124.37,-402.815 2297.59,-387.627 2445.05,-366 2449.51,-365.346 2454.11,-364.609 2458.74,-363.825"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2024.5,-407.081 2014.8,-411.341 2025.04,-414.06 2024.5,-407.081"/>
 </g>
 <!-- Node43 -->
 <g id="node43" class="node"><title>Node43</title>
-<g id="a_node43"><a xlink:href="nn_2dense_8h.html" target="_top" xlink:title="Dense op constructions. ">
-<polygon fill="white" stroke="black" points="1290.48,-201.5 1290.48,-231.5 1398.48,-231.5 1398.48,-201.5 1290.48,-201.5"/>
-<text text-anchor="start" x="1298.48" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/nn</text>
-<text text-anchor="middle" x="1344.48" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00">/dense.h</text>
+<g id="a_node43"><a xlink:href="tensor__utils_8h.html" target="_top" xlink:title="Utility functions for handling tensor. ">
+<polygon fill="white" stroke="black" points="2281.05,-335.5 2281.05,-365.5 2403.05,-365.5 2403.05,-335.5 2281.05,-335.5"/>
+<text text-anchor="start" x="2289.05" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/detail</text>
+<text text-anchor="middle" x="2342.05" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00">/tensor_utils.h</text>
 </a>
 </g>
 </g>
 <!-- Node4&#45;&gt;Node43 -->
-<g id="edge82" class="edge"><title>Node4&#45;&gt;Node43</title>
-<path fill="none" stroke="midnightblue" d="M1940.03,-406.156C1932.12,-404.749 1924.1,-403.332 1916.48,-402 1718.64,-367.415 1659.96,-380.691 1476.48,-299 1433.91,-280.048 1388.6,-249.492 1363.68,-231.67"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="1939.49,-409.615 1949.95,-407.925 1940.72,-402.724 1939.49,-409.615"/>
-</g>
-<!-- Node4&#45;&gt;Node44 -->
-<g id="edge85" class="edge"><title>Node4&#45;&gt;Node44</title>
-<path fill="none" stroke="midnightblue" d="M2079.24,-417.135C2258.14,-417.537 2697.39,-413.14 3061.48,-366 3066.03,-365.411 3070.72,-364.677 3075.42,-363.855"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2079.13,-413.635 2069.12,-417.106 2079.11,-420.635 2079.13,-413.635"/>
+<g id="edge77" class="edge"><title>Node4&#45;&gt;Node43</title>
+<path fill="none" stroke="midnightblue" d="M2012.21,-406.465C2077.89,-396.758 2179.35,-381.307 2267.05,-366 2271.49,-365.225 2276.08,-364.4 2280.69,-363.552"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="2011.37,-403.051 2001.99,-407.972 2012.39,-409.976 2011.37,-403.051"/>
 </g>
-<!-- Node45 -->
-<g id="node45" class="node"><title>Node45</title>
-<g id="a_node45"><a xlink:href="local__response__norm_8h.html" target="_top" xlink:title="local response normalization op constructions ">
-<polygon fill="white" stroke="black" points="2329.98,-335.5 2329.98,-365.5 2460.98,-365.5 2460.98,-335.5 2329.98,-335.5"/>
-<text text-anchor="start" x="2337.98" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/nn</text>
-<text text-anchor="middle" x="2395.48" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00">/local_response_norm.h</text>
+<!-- Node44 -->
+<g id="node44" class="node"><title>Node44</title>
+<g id="a_node44"><a xlink:href="nn_2dense_8h.html" target="_top" xlink:title="Dense op constructions. ">
+<polygon fill="white" stroke="black" points="1943.05,-201.5 1943.05,-231.5 2051.05,-231.5 2051.05,-201.5 1943.05,-201.5"/>
+<text text-anchor="start" x="1951.05" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00">include/tvm/topi/nn</text>
+<text text-anchor="middle" x="1997.05" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00">/dense.h</text>
 </a>
 </g>
 </g>
+<!-- Node4&#45;&gt;Node44 -->
+<g id="edge86" class="edge"><title>Node4&#45;&gt;Node44</title>
+<path fill="none" stroke="midnightblue" d="M1970.64,-403.017C1986.07,-394.288 2004.64,-381.636 2017.05,-366 2036.91,-340.985 2037.17,-330.396 2043.05,-299 2045.59,-285.458 2048.03,-280.848 2043.05,-268 2037.49,-253.623 2025.71,-240.825 2015.47,-231.71"/>
+<polygon fill="midnightblue" stroke="midnightblue" points="1968.66,-400.109 1961.52,-407.937 1971.98,-406.269 1968.66,-400.109"/>
+</g>
 <!-- Node4&#45;&gt;Node45 -->
-<g id="edge87" class="edge"><title>Node4&#45;&gt;Node45</title>
-<path fill="none" stroke="midnightblue" d="M2063.56,-406.436C2127.93,-396.532 2228.51,-380.785 2315.48,-366 2320.07,-365.219 2324.81,-364.399 2329.58,-363.562"/>
-<polygon fill="midnightblue" stroke="midnightblue" points="2062.9,-402.996 2053.55,-407.975 2063.96,-409.915 2062.9,-402.996"/>
... 295054 lines suppressed ...