Posted to commits@tvm.apache.org by tq...@apache.org on 2021/05/05 23:23:55 UTC

[tvm-site] branch asf-site updated: Docs build at Wed May 5 19:23:36 EDT 2021

This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/tvm-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
     new b81301f  Docs build at Wed May  5 19:23:36 EDT 2021
b81301f is described below

commit b81301fa8958e1b37cfa326016cb1bc92c4d4c43
Author: tqchen <ti...@gmail.com>
AuthorDate: Wed May 5 19:23:38 2021 -0400

    Docs build at Wed May  5 19:23:36 EDT 2021
---
 .../tune_network_mali.ipynb                        |    6 +-
 .../opt_matmul_auto_tensorcore.py                  |  544 ---------
 .../tune_relay_cuda.py                             |    6 +-
 .../tune_relay_mobile_gpu.ipynb                    |    4 +-
 .../deploy_sparse.py                               |    2 +-
 .../deploy_model_on_android.ipynb                  |    2 +-
 .../deploy_model_on_android.py                     |    2 +-
 .../tune_relay_vta.ipynb                           |    2 +-
 .../tune_relay_vta.py                              |    2 +-
 .../tune_relay_cuda.ipynb                          |    4 +-
 .../tune_network_mali.py                           |   10 +-
 .../opt_matmul_auto_tensorcore.ipynb               |  111 --
 .../deploy_sparse.ipynb                            |    2 +-
 .../tune_network_arm.ipynb                         |    4 +-
 .../tune_relay_arm.py                              |    4 +-
 .../tune_relay_mobile_gpu.py                       |    4 +-
 .../tune_relay_arm.ipynb                           |    4 +-
 .../tune_network_arm.py                            |    4 +-
 .../sphx_glr_opt_matmul_auto_tensorcore_thumb.png  |  Bin 26786 -> 0 bytes
 docs/_sources/deploy/arm_compute_lib.rst.txt       |    2 +-
 docs/_sources/dev/index.rst.txt                    |    6 +-
 docs/_sources/dev/pass_infra.rst.txt               |    6 +-
 docs/_sources/dev/runtime.rst.txt                  |    6 +-
 docs/_sources/install/from_source.rst.txt          |    5 +
 .../auto_scheduler/sg_execution_times.rst.txt      |   16 +-
 .../auto_scheduler/tune_conv2d_layer_cuda.rst.txt  | 1272 ++++++++++---------
 .../auto_scheduler/tune_network_arm.rst.txt        |    4 +-
 .../auto_scheduler/tune_network_cuda.rst.txt       |    6 +-
 .../auto_scheduler/tune_network_mali.rst.txt       |   12 +-
 .../auto_scheduler/tune_network_x86.rst.txt        |    6 +-
 .../auto_scheduler/tune_sparse_x86.rst.txt         |   37 +-
 .../tutorials/autotvm/sg_execution_times.rst.txt   |   12 +-
 .../tutorials/autotvm/tune_conv2d_cuda.rst.txt     |   42 +-
 .../tutorials/autotvm/tune_relay_arm.rst.txt       |    4 +-
 .../tutorials/autotvm/tune_relay_cuda.rst.txt      |    6 +-
 .../autotvm/tune_relay_mobile_gpu.rst.txt          |    4 +-
 .../tutorials/dev/low_level_custom_pass.rst.txt    |    4 +-
 .../tutorials/dev/sg_execution_times.rst.txt       |    8 +-
 .../frontend/deploy_model_on_android.rst.txt       |    4 +-
 .../deploy_object_detection_pytorch.rst.txt        |    2 +-
 .../tutorials/frontend/deploy_prequantized.rst.txt |    2 +-
 .../frontend/deploy_prequantized_tflite.rst.txt    |    4 +-
 .../tutorials/frontend/deploy_sparse.rst.txt       |    2 +-
 .../tutorials/frontend/deploy_ssd_gluoncv.rst.txt  |    2 +-
 docs/_sources/tutorials/frontend/from_onnx.rst.txt |    2 +-
 .../tutorials/frontend/from_tensorflow.rst.txt     |  146 +--
 .../tutorials/frontend/sg_execution_times.rst.txt  |   40 +-
 .../get_started/auto_tuning_with_python.rst.txt    |   74 +-
 .../get_started/cross_compilation_and_rpc.rst.txt  |    2 +-
 .../get_started/relay_quick_start.rst.txt          |    2 +-
 .../get_started/sg_execution_times.rst.txt         |   20 +-
 .../get_started/tensor_expr_get_started.rst.txt    |   65 +-
 .../tutorials/get_started/tune_matmul_x86.rst.txt  |  123 +-
 docs/_sources/tutorials/index.rst.txt              |   20 -
 .../tutorials/language/intrin_math.rst.txt         |    1 +
 .../tutorials/language/schedule_primitives.rst.txt |    8 +-
 .../tutorials/language/sg_execution_times.rst.txt  |   18 +-
 docs/_sources/tutorials/language/tensorize.rst.txt |    8 +-
 .../tutorials/language/tuple_inputs.rst.txt        |   14 +-
 .../tutorials/micro/sg_execution_times.rst.txt     |    6 +-
 .../tutorials/optimize/opt_conv_cuda.rst.txt       |    2 +-
 .../tutorials/optimize/opt_conv_tensorcore.rst.txt |    2 +-
 docs/_sources/tutorials/optimize/opt_gemm.rst.txt  |   16 +-
 .../optimize/opt_matmul_auto_tensorcore.rst.txt    |  585 ---------
 .../tutorials/optimize/sg_execution_times.rst.txt  |    9 +-
 docs/_sources/tutorials/topi/intro_topi.rst.txt    |    2 +-
 .../tutorials/topi/sg_execution_times.rst.txt      |    4 +-
 .../tutorials/autotvm/sg_execution_times.rst.txt   |    6 +-
 .../vta/tutorials/autotvm/tune_relay_vta.rst.txt   |    4 +-
 .../frontend/deploy_classification.rst.txt         |    8 +-
 .../tutorials/frontend/sg_execution_times.rst.txt  |    4 +-
 .../vta/tutorials/optimize/convolution_opt.rst.txt |    8 +-
 .../tutorials/optimize/matrix_multiply_opt.rst.txt |    8 +-
 .../tutorials/optimize/sg_execution_times.rst.txt  |    6 +-
 .../vta/tutorials/sg_execution_times.rst.txt       |    6 +-
 docs/_static/css/tlcpack_theme.css                 |    4 +
 ...e_2include_2tvm_2tir_2transform_8h-example.html |   94 ++
 docs/api/doxygen/algorithm_8h.html                 |    1 +
 docs/api/doxygen/algorithm_8h_source.html          |    1 +
 docs/api/doxygen/analyzer_8h.html                  |    1 +
 docs/api/doxygen/analyzer_8h_source.html           |    1 +
 docs/api/doxygen/annotated.html                    |    1 +
 docs/api/doxygen/annotation_8h.html                |    1 +
 docs/api/doxygen/annotation_8h_source.html         |    1 +
 docs/api/doxygen/array__utils_8h.html              |    1 +
 docs/api/doxygen/array__utils_8h_source.html       |    1 +
 docs/api/doxygen/attr__registry__map_8h.html       |    1 +
 .../api/doxygen/attr__registry__map_8h_source.html |    1 +
 docs/api/doxygen/auto__schedule_8h.html            |    1 +
 docs/api/doxygen/auto__schedule_8h_source.html     |    1 +
 docs/api/doxygen/auto__scheduler_2feature_8h.html  |    1 +
 .../auto__scheduler_2feature_8h_source.html        |    1 +
 docs/api/doxygen/autodiff_8h.html                  |    1 +
 docs/api/doxygen/autodiff_8h_source.html           |    1 +
 docs/api/doxygen/base_8h.html                      |    1 +
 docs/api/doxygen/base_8h_source.html               |    1 +
 docs/api/doxygen/bias__add_8h.html                 |    1 +
 docs/api/doxygen/bias__add_8h_source.html          |    1 +
 docs/api/doxygen/bitserial_8h.html                 |    1 +
 docs/api/doxygen/bitserial_8h_source.html          |    1 +
 docs/api/doxygen/block__scope_8h.html              |    1 +
 docs/api/doxygen/block__scope_8h_source.html       |    1 +
 docs/api/doxygen/bound_8h.html                     |    1 +
 docs/api/doxygen/bound_8h_source.html              |    1 +
 docs/api/doxygen/broadcast_8h.html                 |    1 +
 docs/api/doxygen/broadcast_8h_source.html          |    1 +
 docs/api/doxygen/buffer_8h.html                    |    1 +
 docs/api/doxygen/buffer_8h_source.html             |    1 +
 docs/api/doxygen/builtin_8h.html                   |    1 +
 docs/api/doxygen/builtin_8h_source.html            |   31 +-
 docs/api/doxygen/bytecode_8h.html                  |    1 +
 docs/api/doxygen/bytecode_8h_source.html           |    5 +-
 docs/api/doxygen/c__backend__api_8h.html           |    1 +
 docs/api/doxygen/c__backend__api_8h_source.html    |    5 +-
 docs/api/doxygen/c__runtime__api_8h.html           |    1 +
 docs/api/doxygen/c__runtime__api_8h_source.html    |    7 +-
 docs/api/doxygen/classes.html                      |    1 +
 .../doxygen/classtvm_1_1AttrFieldInfo-members.html |    1 +
 docs/api/doxygen/classtvm_1_1AttrFieldInfo.html    |    1 +
 .../classtvm_1_1AttrFieldInfoNode-members.html     |    1 +
 .../api/doxygen/classtvm_1_1AttrFieldInfoNode.html |    1 +
 .../classtvm_1_1AttrRegistryMap-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1AttrRegistryMap.html  |    1 +
 ...tvm_1_1AttrRegistryMapContainerMap-members.html |    1 +
 .../classtvm_1_1AttrRegistryMapContainerMap.html   |    1 +
 docs/api/doxygen/classtvm_1_1AttrVisitor.html      |    1 +
 docs/api/doxygen/classtvm_1_1Attrs-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1Attrs.html            |    1 +
 .../api/doxygen/classtvm_1_1AttrsNode-members.html |    1 +
 docs/api/doxygen/classtvm_1_1AttrsNode.html        |    1 +
 .../doxygen/classtvm_1_1BaseAttrsNode-members.html |    1 +
 docs/api/doxygen/classtvm_1_1BaseAttrsNode.html    |    1 +
 docs/api/doxygen/classtvm_1_1BaseExpr-members.html |    1 +
 docs/api/doxygen/classtvm_1_1BaseExpr.html         |    1 +
 .../doxygen/classtvm_1_1BaseExprNode-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1BaseExprNode.html     |    1 +
 .../classtvm_1_1BaseExpr__inherit__graph.svg       |   20 +-
 docs/api/doxygen/classtvm_1_1BaseFunc-members.html |    1 +
 docs/api/doxygen/classtvm_1_1BaseFunc.html         |    1 +
 .../doxygen/classtvm_1_1BaseFuncNode-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1BaseFuncNode.html     |    1 +
 .../classtvm_1_1BaseTensorType-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1BaseTensorType.html   |    1 +
 .../classtvm_1_1BaseTensorTypeNode-members.html    |    1 +
 .../doxygen/classtvm_1_1BaseTensorTypeNode.html    |    1 +
 .../classtvm_1_1BaseValueEqual-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1BaseValueEqual.html   |    1 +
 .../doxygen/classtvm_1_1BaseValueHash-members.html |    1 +
 docs/api/doxygen/classtvm_1_1BaseValueHash.html    |    1 +
 docs/api/doxygen/classtvm_1_1Bool-members.html     |    1 +
 docs/api/doxygen/classtvm_1_1Bool.html             |    1 +
 .../doxygen/classtvm_1_1CompileError-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1CompileError.html     |    1 +
 .../doxygen/classtvm_1_1Constructor-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1Constructor.html      |    1 +
 .../classtvm_1_1ConstructorNode-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1ConstructorNode.html  |    1 +
 .../doxygen/classtvm_1_1Diagnostic-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1Diagnostic.html       |    1 +
 .../classtvm_1_1DiagnosticBuilder-members.html     |    1 +
 .../api/doxygen/classtvm_1_1DiagnosticBuilder.html |    1 +
 .../classtvm_1_1DiagnosticContext-members.html     |    1 +
 .../api/doxygen/classtvm_1_1DiagnosticContext.html |    1 +
 .../classtvm_1_1DiagnosticContextNode-members.html |    1 +
 .../doxygen/classtvm_1_1DiagnosticContextNode.html |    1 +
 .../classtvm_1_1DiagnosticNode-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1DiagnosticNode.html   |    1 +
 .../classtvm_1_1DiagnosticRenderer-members.html    |    1 +
 .../doxygen/classtvm_1_1DiagnosticRenderer.html    |    1 +
 ...classtvm_1_1DiagnosticRendererNode-members.html |    1 +
 .../classtvm_1_1DiagnosticRendererNode.html        |    1 +
 .../api/doxygen/classtvm_1_1DictAttrs-members.html |    1 +
 docs/api/doxygen/classtvm_1_1DictAttrs.html        |    1 +
 .../doxygen/classtvm_1_1DictAttrsNode-members.html |    1 +
 docs/api/doxygen/classtvm_1_1DictAttrsNode.html    |    1 +
 docs/api/doxygen/classtvm_1_1EnvFunc-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1EnvFunc.html          |    1 +
 .../doxygen/classtvm_1_1EnvFuncNode-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1EnvFuncNode.html      |    1 +
 .../doxygen/classtvm_1_1ErrorReporter-members.html |    1 +
 docs/api/doxygen/classtvm_1_1ErrorReporter.html    |    1 +
 docs/api/doxygen/classtvm_1_1FloatImm-members.html |    1 +
 docs/api/doxygen/classtvm_1_1FloatImm.html         |    1 +
 .../doxygen/classtvm_1_1FloatImmNode-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1FloatImmNode.html     |    1 +
 docs/api/doxygen/classtvm_1_1FuncType-members.html |    1 +
 docs/api/doxygen/classtvm_1_1FuncType.html         |    1 +
 .../doxygen/classtvm_1_1FuncTypeNode-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1FuncTypeNode.html     |    1 +
 .../doxygen/classtvm_1_1GenericFunc-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1GenericFunc.html      |    1 +
 .../classtvm_1_1GenericFuncNode-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1GenericFuncNode.html  |    1 +
 .../doxygen/classtvm_1_1GlobalTypeVar-members.html |    1 +
 docs/api/doxygen/classtvm_1_1GlobalTypeVar.html    |    1 +
 .../classtvm_1_1GlobalTypeVarNode-members.html     |    1 +
 .../api/doxygen/classtvm_1_1GlobalTypeVarNode.html |    1 +
 .../api/doxygen/classtvm_1_1GlobalVar-members.html |    1 +
 docs/api/doxygen/classtvm_1_1GlobalVar.html        |    1 +
 .../doxygen/classtvm_1_1GlobalVarNode-members.html |    1 +
 docs/api/doxygen/classtvm_1_1GlobalVarNode.html    |    1 +
 docs/api/doxygen/classtvm_1_1IRModule-members.html |    1 +
 docs/api/doxygen/classtvm_1_1IRModule.html         |    1 +
 .../doxygen/classtvm_1_1IRModuleNode-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1IRModuleNode.html     |    1 +
 .../classtvm_1_1IncompleteType-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1IncompleteType.html   |    1 +
 .../classtvm_1_1IncompleteTypeNode-members.html    |    1 +
 .../doxygen/classtvm_1_1IncompleteTypeNode.html    |    1 +
 docs/api/doxygen/classtvm_1_1IntImm-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1IntImm.html           |    1 +
 .../doxygen/classtvm_1_1IntImmNode-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1IntImmNode.html       |    1 +
 docs/api/doxygen/classtvm_1_1Integer-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1Integer.html          |    1 +
 .../doxygen/classtvm_1_1MemoryInfo-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1MemoryInfo.html       |    1 +
 .../classtvm_1_1MemoryInfoNode-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1MemoryInfoNode.html   |    1 +
 docs/api/doxygen/classtvm_1_1NodeFunctor.html      |    1 +
 ...jectRef_01_6n_00_01Args_8_8_8_08_4-members.html |    1 +
 ...nst_01ObjectRef_01_6n_00_01Args_8_8_8_08_4.html |    1 +
 docs/api/doxygen/classtvm_1_1Op-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1Op.html               |    1 +
 .../api/doxygen/classtvm_1_1OpAttrMap-members.html |    1 +
 docs/api/doxygen/classtvm_1_1OpAttrMap.html        |    1 +
 docs/api/doxygen/classtvm_1_1OpNode-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1OpNode.html           |    1 +
 .../doxygen/classtvm_1_1OpRegEntry-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1OpRegEntry.html       |    1 +
 .../doxygen/classtvm_1_1PointerType-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1PointerType.html      |    1 +
 .../classtvm_1_1PointerTypeNode-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1PointerTypeNode.html  |    1 +
 docs/api/doxygen/classtvm_1_1PrimExpr-members.html |    1 +
 docs/api/doxygen/classtvm_1_1PrimExpr.html         |    1 +
 .../doxygen/classtvm_1_1PrimExprNode-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1PrimExprNode.html     |    1 +
 .../classtvm_1_1PrimExpr__inherit__graph.svg       |   20 +-
 docs/api/doxygen/classtvm_1_1PrimType-members.html |    1 +
 docs/api/doxygen/classtvm_1_1PrimType.html         |    1 +
 .../doxygen/classtvm_1_1PrimTypeNode-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1PrimTypeNode.html     |    1 +
 docs/api/doxygen/classtvm_1_1Range-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1Range.html            |    1 +
 .../api/doxygen/classtvm_1_1RangeNode-members.html |    1 +
 docs/api/doxygen/classtvm_1_1RangeNode.html        |    1 +
 .../classtvm_1_1ReflectionVTable-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1ReflectionVTable.html |    1 +
 ...vm_1_1ReflectionVTable_1_1Registry-members.html |    1 +
 .../classtvm_1_1ReflectionVTable_1_1Registry.html  |    1 +
 .../api/doxygen/classtvm_1_1RelayExpr-members.html |    1 +
 docs/api/doxygen/classtvm_1_1RelayExpr.html        |    1 +
 .../doxygen/classtvm_1_1RelayExprNode-members.html |    1 +
 docs/api/doxygen/classtvm_1_1RelayExprNode.html    |    1 +
 .../doxygen/classtvm_1_1RelayRefType-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1RelayRefType.html     |    1 +
 .../classtvm_1_1RelayRefTypeNode-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1RelayRefTypeNode.html |    1 +
 .../doxygen/classtvm_1_1ReprPrinter-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1ReprPrinter.html      |    1 +
 .../doxygen/classtvm_1_1SEqualReducer-members.html |    1 +
 docs/api/doxygen/classtvm_1_1SEqualReducer.html    |    1 +
 ...asstvm_1_1SEqualReducer_1_1Handler-members.html |    1 +
 .../classtvm_1_1SEqualReducer_1_1Handler.html      |    1 +
 .../doxygen/classtvm_1_1SHashReducer-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1SHashReducer.html     |    1 +
 ...lasstvm_1_1SHashReducer_1_1Handler-members.html |    1 +
 .../classtvm_1_1SHashReducer_1_1Handler.html       |    1 +
 .../doxygen/classtvm_1_1SourceName-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1SourceName.html       |    1 +
 .../classtvm_1_1SourceNameNode-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1SourceNameNode.html   |    1 +
 docs/api/doxygen/classtvm_1_1Span-members.html     |    1 +
 docs/api/doxygen/classtvm_1_1Span.html             |    1 +
 docs/api/doxygen/classtvm_1_1SpanNode-members.html |    1 +
 docs/api/doxygen/classtvm_1_1SpanNode.html         |    1 +
 .../classtvm_1_1StructuralEqual-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1StructuralEqual.html  |    1 +
 .../classtvm_1_1StructuralHash-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1StructuralHash.html   |    1 +
 docs/api/doxygen/classtvm_1_1Target-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1Target.html           |    1 +
 .../doxygen/classtvm_1_1TargetKind-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1TargetKind.html       |    1 +
 .../classtvm_1_1TargetKindAttrMap-members.html     |    1 +
 .../api/doxygen/classtvm_1_1TargetKindAttrMap.html |    1 +
 .../classtvm_1_1TargetKindNode-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1TargetKindNode.html   |    1 +
 .../classtvm_1_1TargetKindRegEntry-members.html    |    1 +
 .../doxygen/classtvm_1_1TargetKindRegEntry.html    |    1 +
 .../doxygen/classtvm_1_1TargetNode-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1TargetNode.html       |    1 +
 .../api/doxygen/classtvm_1_1TargetTag-members.html |    1 +
 docs/api/doxygen/classtvm_1_1TargetTag.html        |    1 +
 .../doxygen/classtvm_1_1TargetTagNode-members.html |    1 +
 docs/api/doxygen/classtvm_1_1TargetTagNode.html    |    1 +
 .../classtvm_1_1TargetTagRegEntry-members.html     |    1 +
 .../api/doxygen/classtvm_1_1TargetTagRegEntry.html |    1 +
 .../doxygen/classtvm_1_1TensorType-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1TensorType.html       |    1 +
 .../classtvm_1_1TensorTypeNode-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1TensorTypeNode.html   |    1 +
 .../api/doxygen/classtvm_1_1TupleType-members.html |    1 +
 docs/api/doxygen/classtvm_1_1TupleType.html        |    1 +
 .../doxygen/classtvm_1_1TupleTypeNode-members.html |    1 +
 docs/api/doxygen/classtvm_1_1TupleTypeNode.html    |    1 +
 docs/api/doxygen/classtvm_1_1Type-members.html     |    1 +
 docs/api/doxygen/classtvm_1_1Type.html             |    1 +
 docs/api/doxygen/classtvm_1_1TypeCall-members.html |    1 +
 docs/api/doxygen/classtvm_1_1TypeCall.html         |    1 +
 .../doxygen/classtvm_1_1TypeCallNode-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1TypeCallNode.html     |    1 +
 .../classtvm_1_1TypeConstraint-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1TypeConstraint.html   |    1 +
 .../classtvm_1_1TypeConstraintNode-members.html    |    1 +
 .../doxygen/classtvm_1_1TypeConstraintNode.html    |    1 +
 docs/api/doxygen/classtvm_1_1TypeData-members.html |    1 +
 docs/api/doxygen/classtvm_1_1TypeData.html         |    1 +
 .../doxygen/classtvm_1_1TypeDataNode-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1TypeDataNode.html     |    1 +
 docs/api/doxygen/classtvm_1_1TypeFunctor.html      |    1 +
 ..._01Type_01_6n_00_01Args_8_8_8_08_4-members.html |    1 +
 ..._07const_01Type_01_6n_00_01Args_8_8_8_08_4.html |    1 +
 .../doxygen/classtvm_1_1TypeMutator-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1TypeMutator.html      |    1 +
 docs/api/doxygen/classtvm_1_1TypeNode-members.html |    1 +
 docs/api/doxygen/classtvm_1_1TypeNode.html         |    1 +
 .../doxygen/classtvm_1_1TypeRelation-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1TypeRelation.html     |    1 +
 .../classtvm_1_1TypeRelationNode-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1TypeRelationNode.html |    1 +
 .../doxygen/classtvm_1_1TypeReporter-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1TypeReporter.html     |    1 +
 .../classtvm_1_1TypeReporterNode-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1TypeReporterNode.html |    1 +
 docs/api/doxygen/classtvm_1_1TypeVar-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1TypeVar.html          |    1 +
 .../doxygen/classtvm_1_1TypeVarNode-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1TypeVarNode.html      |    1 +
 .../doxygen/classtvm_1_1TypeVisitor-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1TypeVisitor.html      |    1 +
 docs/api/doxygen/classtvm_1_1TypedEnvFunc.html     |    1 +
 ...pedEnvFunc_3_01R_07Args_8_8_8_08_4-members.html |    1 +
 ...vm_1_1TypedEnvFunc_3_01R_07Args_8_8_8_08_4.html |    1 +
 docs/api/doxygen/classtvm_1_1With-members.html     |    1 +
 docs/api/doxygen/classtvm_1_1With.html             |    1 +
 .../classtvm_1_1arith_1_1Analyzer-members.html     |    1 +
 .../api/doxygen/classtvm_1_1arith_1_1Analyzer.html |    1 +
 ...vm_1_1arith_1_1CanonicalSimplifier-members.html |    1 +
 .../classtvm_1_1arith_1_1CanonicalSimplifier.html  |    1 +
 ...classtvm_1_1arith_1_1ConstIntBound-members.html |    1 +
 .../classtvm_1_1arith_1_1ConstIntBound.html        |    1 +
 ..._1_1arith_1_1ConstIntBoundAnalyzer-members.html |    1 +
 ...classtvm_1_1arith_1_1ConstIntBoundAnalyzer.html |    1 +
 ...stvm_1_1arith_1_1ConstIntBoundNode-members.html |    1 +
 .../classtvm_1_1arith_1_1ConstIntBoundNode.html    |    1 +
 ...stvm_1_1arith_1_1ConstraintContext-members.html |    1 +
 .../classtvm_1_1arith_1_1ConstraintContext.html    |    1 +
 ...lasstvm_1_1arith_1_1IntConstraints-members.html |    1 +
 .../classtvm_1_1arith_1_1IntConstraints.html       |    1 +
 ...tvm_1_1arith_1_1IntConstraintsNode-members.html |    1 +
 .../classtvm_1_1arith_1_1IntConstraintsNode.html   |    1 +
 ..._1arith_1_1IntConstraintsTransform-members.html |    1 +
 ...asstvm_1_1arith_1_1IntConstraintsTransform.html |    1 +
 ...ith_1_1IntConstraintsTransformNode-members.html |    1 +
 ...vm_1_1arith_1_1IntConstraintsTransformNode.html |    1 +
 ...lasstvm_1_1arith_1_1IntGroupBounds-members.html |    1 +
 .../classtvm_1_1arith_1_1IntGroupBounds.html       |    1 +
 ...tvm_1_1arith_1_1IntGroupBoundsNode-members.html |    1 +
 .../classtvm_1_1arith_1_1IntGroupBoundsNode.html   |    1 +
 .../classtvm_1_1arith_1_1IntSet-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1arith_1_1IntSet.html  |    1 +
 ...lasstvm_1_1arith_1_1IntSetAnalyzer-members.html |    1 +
 .../classtvm_1_1arith_1_1IntSetAnalyzer.html       |    1 +
 .../classtvm_1_1arith_1_1IntSetNode-members.html   |    1 +
 .../doxygen/classtvm_1_1arith_1_1IntSetNode.html   |    1 +
 .../classtvm_1_1arith_1_1IterMapExpr-members.html  |    1 +
 .../doxygen/classtvm_1_1arith_1_1IterMapExpr.html  |    1 +
 ...asstvm_1_1arith_1_1IterMapExprNode-members.html |    1 +
 .../classtvm_1_1arith_1_1IterMapExprNode.html      |    1 +
 .../classtvm_1_1arith_1_1IterMark-members.html     |    1 +
 .../api/doxygen/classtvm_1_1arith_1_1IterMark.html |    1 +
 .../classtvm_1_1arith_1_1IterMarkNode-members.html |    1 +
 .../doxygen/classtvm_1_1arith_1_1IterMarkNode.html |    1 +
 ...classtvm_1_1arith_1_1IterSplitExpr-members.html |    1 +
 .../classtvm_1_1arith_1_1IterSplitExpr.html        |    1 +
 ...stvm_1_1arith_1_1IterSplitExprNode-members.html |    1 +
 .../classtvm_1_1arith_1_1IterSplitExprNode.html    |    1 +
 .../classtvm_1_1arith_1_1IterSumExpr-members.html  |    1 +
 .../doxygen/classtvm_1_1arith_1_1IterSumExpr.html  |    1 +
 ...asstvm_1_1arith_1_1IterSumExprNode-members.html |    1 +
 .../classtvm_1_1arith_1_1IterSumExprNode.html      |    1 +
 .../classtvm_1_1arith_1_1ModularSet-members.html   |    1 +
 .../doxygen/classtvm_1_1arith_1_1ModularSet.html   |    1 +
 ...tvm_1_1arith_1_1ModularSetAnalyzer-members.html |    1 +
 .../classtvm_1_1arith_1_1ModularSetAnalyzer.html   |    1 +
 ...lasstvm_1_1arith_1_1ModularSetNode-members.html |    1 +
 .../classtvm_1_1arith_1_1ModularSetNode.html       |    1 +
 ...stvm_1_1arith_1_1RewriteSimplifier-members.html |    1 +
 .../classtvm_1_1arith_1_1RewriteSimplifier.html    |    1 +
 ...1auto__scheduler_1_1AccessAnalyzer-members.html |    1 +
 ...sstvm_1_1auto__scheduler_1_1AccessAnalyzer.html |    1 +
 ...o__scheduler_1_1AccessAnalyzerNode-members.html |    1 +
 ...m_1_1auto__scheduler_1_1AccessAnalyzerNode.html |    1 +
 ...1auto__scheduler_1_1AnnotationStep-members.html |    1 +
 ...sstvm_1_1auto__scheduler_1_1AnnotationStep.html |    1 +
 ...o__scheduler_1_1AnnotationStepNode-members.html |    1 +
 ...m_1_1auto__scheduler_1_1AnnotationStepNode.html |    1 +
 ...vm_1_1auto__scheduler_1_1AttachMap-members.html |    1 +
 .../classtvm_1_1auto__scheduler_1_1AttachMap.html  |    1 +
 ..._1auto__scheduler_1_1AttachMapNode-members.html |    1 +
 ...asstvm_1_1auto__scheduler_1_1AttachMapNode.html |    1 +
 ..._1_1auto__scheduler_1_1BuildResult-members.html |    1 +
 ...classtvm_1_1auto__scheduler_1_1BuildResult.html |    1 +
 ...auto__scheduler_1_1BuildResultNode-members.html |    1 +
 ...stvm_1_1auto__scheduler_1_1BuildResultNode.html |    1 +
 ..._1auto__scheduler_1_1CacheReadStep-members.html |    1 +
 ...asstvm_1_1auto__scheduler_1_1CacheReadStep.html |    1 +
 ...to__scheduler_1_1CacheReadStepNode-members.html |    1 +
 ...vm_1_1auto__scheduler_1_1CacheReadStepNode.html |    1 +
 ...1auto__scheduler_1_1CacheWriteStep-members.html |    1 +
 ...sstvm_1_1auto__scheduler_1_1CacheWriteStep.html |    1 +
 ...o__scheduler_1_1CacheWriteStepNode-members.html |    1 +
 ...m_1_1auto__scheduler_1_1CacheWriteStepNode.html |    1 +
 ..._1auto__scheduler_1_1ComputeAtStep-members.html |    1 +
 ...asstvm_1_1auto__scheduler_1_1ComputeAtStep.html |    1 +
 ...to__scheduler_1_1ComputeAtStepNode-members.html |    1 +
 ...vm_1_1auto__scheduler_1_1ComputeAtStepNode.html |    1 +
 ...m_1_1auto__scheduler_1_1ComputeDAG-members.html |    1 +
 .../classtvm_1_1auto__scheduler_1_1ComputeDAG.html |    1 +
 ...1auto__scheduler_1_1ComputeDAGNode-members.html |    1 +
 ...sstvm_1_1auto__scheduler_1_1ComputeDAGNode.html |    1 +
 ...to__scheduler_1_1ComputeInlineStep-members.html |    1 +
 ...vm_1_1auto__scheduler_1_1ComputeInlineStep.html |    1 +
 ...scheduler_1_1ComputeInlineStepNode-members.html |    1 +
 ..._1auto__scheduler_1_1ComputeInlineStepNode.html |    1 +
 ...auto__scheduler_1_1ComputeRootStep-members.html |    1 +
 ...stvm_1_1auto__scheduler_1_1ComputeRootStep.html |    1 +
 ...__scheduler_1_1ComputeRootStepNode-members.html |    1 +
 ..._1_1auto__scheduler_1_1ComputeRootStepNode.html |    1 +
 ...vm_1_1auto__scheduler_1_1CostModel-members.html |    1 +
 .../classtvm_1_1auto__scheduler_1_1CostModel.html  |    1 +
 ..._1auto__scheduler_1_1CostModelNode-members.html |    1 +
 ...asstvm_1_1auto__scheduler_1_1CostModelNode.html |    1 +
 ..._scheduler_1_1FollowFusedSplitStep-members.html |    1 +
 ...1_1auto__scheduler_1_1FollowFusedSplitStep.html |    1 +
 ...eduler_1_1FollowFusedSplitStepNode-members.html |    1 +
 ...uto__scheduler_1_1FollowFusedSplitStepNode.html |    1 +
 ...auto__scheduler_1_1FollowSplitStep-members.html |    1 +
 ...stvm_1_1auto__scheduler_1_1FollowSplitStep.html |    1 +
 ...__scheduler_1_1FollowSplitStepNode-members.html |    1 +
 ..._1_1auto__scheduler_1_1FollowSplitStepNode.html |    1 +
 ...tvm_1_1auto__scheduler_1_1FuseStep-members.html |    1 +
 .../classtvm_1_1auto__scheduler_1_1FuseStep.html   |    1 +
 ...1_1auto__scheduler_1_1FuseStepNode-members.html |    1 +
 ...lasstvm_1_1auto__scheduler_1_1FuseStepNode.html |    1 +
 ...1auto__scheduler_1_1HardwareParams-members.html |    1 +
 ...sstvm_1_1auto__scheduler_1_1HardwareParams.html |    1 +
 ...o__scheduler_1_1HardwareParamsNode-members.html |    1 +
 ...m_1_1auto__scheduler_1_1HardwareParamsNode.html |    1 +
 ...tvm_1_1auto__scheduler_1_1Iterator-members.html |    1 +
 .../classtvm_1_1auto__scheduler_1_1Iterator.html   |    1 +
 ...1_1auto__scheduler_1_1IteratorNode-members.html |    1 +
 ...lasstvm_1_1auto__scheduler_1_1IteratorNode.html |    1 +
 ...1_1auto__scheduler_1_1LocalBuilder-members.html |    1 +
 ...lasstvm_1_1auto__scheduler_1_1LocalBuilder.html |    1 +
 ...uto__scheduler_1_1LocalBuilderNode-members.html |    1 +
 ...tvm_1_1auto__scheduler_1_1LocalBuilderNode.html |    1 +
 ..._1_1auto__scheduler_1_1LocalRunner-members.html |    1 +
 ...classtvm_1_1auto__scheduler_1_1LocalRunner.html |    1 +
 ...auto__scheduler_1_1LocalRunnerNode-members.html |    1 +
 ...stvm_1_1auto__scheduler_1_1LocalRunnerNode.html |    1 +
 ...auto__scheduler_1_1MeasureCallback-members.html |    1 +
 ...stvm_1_1auto__scheduler_1_1MeasureCallback.html |    1 +
 ...__scheduler_1_1MeasureCallbackNode-members.html |    1 +
 ..._1_1auto__scheduler_1_1MeasureCallbackNode.html |    1 +
 ...1_1auto__scheduler_1_1MeasureInput-members.html |    1 +
 ...lasstvm_1_1auto__scheduler_1_1MeasureInput.html |    1 +
 ...uto__scheduler_1_1MeasureInputNode-members.html |    1 +
 ...tvm_1_1auto__scheduler_1_1MeasureInputNode.html |    1 +
 ..._1auto__scheduler_1_1MeasureResult-members.html |    1 +
 ...asstvm_1_1auto__scheduler_1_1MeasureResult.html |    1 +
 ...to__scheduler_1_1MeasureResultNode-members.html |    1 +
 ...vm_1_1auto__scheduler_1_1MeasureResultNode.html |    1 +
 ...m_1_1auto__scheduler_1_1PragmaStep-members.html |    1 +
 .../classtvm_1_1auto__scheduler_1_1PragmaStep.html |    1 +
 ...1auto__scheduler_1_1PragmaStepNode-members.html |    1 +
 ...sstvm_1_1auto__scheduler_1_1PragmaStepNode.html |    1 +
 ...scheduler_1_1PreloadMeasuredStates-members.html |    1 +
 ..._1auto__scheduler_1_1PreloadMeasuredStates.html |    1 +
 ...duler_1_1PreloadMeasuredStatesNode-members.html |    1 +
 ...to__scheduler_1_1PreloadMeasuredStatesNode.html |    1 +
 ...1auto__scheduler_1_1ProgramBuilder-members.html |    1 +
 ...sstvm_1_1auto__scheduler_1_1ProgramBuilder.html |    1 +
 ...o__scheduler_1_1ProgramBuilderNode-members.html |    1 +
 ...m_1_1auto__scheduler_1_1ProgramBuilderNode.html |    1 +
 ...auto__scheduler_1_1ProgramMeasurer-members.html |    1 +
 ...stvm_1_1auto__scheduler_1_1ProgramMeasurer.html |    1 +
 ...__scheduler_1_1ProgramMeasurerNode-members.html |    1 +
 ..._1_1auto__scheduler_1_1ProgramMeasurerNode.html |    1 +
 ..._1auto__scheduler_1_1ProgramRunner-members.html |    1 +
 ...asstvm_1_1auto__scheduler_1_1ProgramRunner.html |    1 +
 ...to__scheduler_1_1ProgramRunnerNode-members.html |    1 +
 ...vm_1_1auto__scheduler_1_1ProgramRunnerNode.html |    1 +
 ...uler_1_1PythonBasedMeasureCallback-members.html |    1 +
 ...o__scheduler_1_1PythonBasedMeasureCallback.html |    1 +
 ..._1_1PythonBasedMeasureCallbackNode-members.html |    1 +
 ...cheduler_1_1PythonBasedMeasureCallbackNode.html |    1 +
 ...uto__scheduler_1_1PythonBasedModel-members.html |    1 +
 ...tvm_1_1auto__scheduler_1_1PythonBasedModel.html |    1 +
 ..._scheduler_1_1PythonBasedModelNode-members.html |    1 +
 ...1_1auto__scheduler_1_1PythonBasedModelNode.html |    1 +
 ...vm_1_1auto__scheduler_1_1RPCRunner-members.html |    1 +
 .../classtvm_1_1auto__scheduler_1_1RPCRunner.html  |    1 +
 ..._1auto__scheduler_1_1RPCRunnerNode-members.html |    1 +
 ...asstvm_1_1auto__scheduler_1_1RPCRunnerNode.html |    1 +
 ..._1_1auto__scheduler_1_1RandomModel-members.html |    1 +
 ...classtvm_1_1auto__scheduler_1_1RandomModel.html |    1 +
 ...auto__scheduler_1_1RandomModelNode-members.html |    1 +
 ...stvm_1_1auto__scheduler_1_1RandomModelNode.html |    1 +
 ...1_1auto__scheduler_1_1RecordReader-members.html |    1 +
 ...lasstvm_1_1auto__scheduler_1_1RecordReader.html |    1 +
 ...uto__scheduler_1_1RecordReaderNode-members.html |    1 +
 ...tvm_1_1auto__scheduler_1_1RecordReaderNode.html |    1 +
 ...1_1auto__scheduler_1_1RecordToFile-members.html |    1 +
 ...lasstvm_1_1auto__scheduler_1_1RecordToFile.html |    1 +
 ...uto__scheduler_1_1RecordToFileNode-members.html |    1 +
 ...tvm_1_1auto__scheduler_1_1RecordToFileNode.html |    1 +
 ..._1_1auto__scheduler_1_1ReorderStep-members.html |    1 +
 ...classtvm_1_1auto__scheduler_1_1ReorderStep.html |    1 +
 ...auto__scheduler_1_1ReorderStepNode-members.html |    1 +
 ...stvm_1_1auto__scheduler_1_1ReorderStepNode.html |    1 +
 ..._1_1auto__scheduler_1_1RfactorStep-members.html |    1 +
 ...classtvm_1_1auto__scheduler_1_1RfactorStep.html |    1 +
 ...auto__scheduler_1_1RfactorStepNode-members.html |    1 +
 ...stvm_1_1auto__scheduler_1_1RfactorStepNode.html |    1 +
 ...1auto__scheduler_1_1SearchCallback-members.html |    1 +
 ...sstvm_1_1auto__scheduler_1_1SearchCallback.html |    1 +
 ...o__scheduler_1_1SearchCallbackNode-members.html |    1 +
 ...m_1_1auto__scheduler_1_1SearchCallbackNode.html |    1 +
 ...1_1auto__scheduler_1_1SearchPolicy-members.html |    1 +
 ...lasstvm_1_1auto__scheduler_1_1SearchPolicy.html |    1 +
 ...uto__scheduler_1_1SearchPolicyNode-members.html |    1 +
 ...tvm_1_1auto__scheduler_1_1SearchPolicyNode.html |    1 +
 ...m_1_1auto__scheduler_1_1SearchTask-members.html |    1 +
 .../classtvm_1_1auto__scheduler_1_1SearchTask.html |    1 +
 ...1auto__scheduler_1_1SearchTaskNode-members.html |    1 +
 ...sstvm_1_1auto__scheduler_1_1SearchTaskNode.html |    1 +
 ...vm_1_1auto__scheduler_1_1SplitStep-members.html |    1 +
 .../classtvm_1_1auto__scheduler_1_1SplitStep.html  |    1 +
 ..._1auto__scheduler_1_1SplitStepNode-members.html |    1 +
 ...asstvm_1_1auto__scheduler_1_1SplitStepNode.html |    1 +
 ...asstvm_1_1auto__scheduler_1_1Stage-members.html |    1 +
 .../classtvm_1_1auto__scheduler_1_1Stage.html      |    1 +
 ...vm_1_1auto__scheduler_1_1StageNode-members.html |    1 +
 .../classtvm_1_1auto__scheduler_1_1StageNode.html  |    1 +
 ...asstvm_1_1auto__scheduler_1_1State-members.html |    1 +
 .../classtvm_1_1auto__scheduler_1_1State.html      |    1 +
 ...vm_1_1auto__scheduler_1_1StateNode-members.html |    1 +
 .../classtvm_1_1auto__scheduler_1_1StateNode.html  |    1 +
 ...lasstvm_1_1auto__scheduler_1_1Step-members.html |    1 +
 .../classtvm_1_1auto__scheduler_1_1Step.html       |    1 +
 ...tvm_1_1auto__scheduler_1_1StepNode-members.html |    1 +
 .../classtvm_1_1auto__scheduler_1_1StepNode.html   |    1 +
 ...uto__scheduler_1_1StorageAlignStep-members.html |    1 +
 ...tvm_1_1auto__scheduler_1_1StorageAlignStep.html |    1 +
 ..._scheduler_1_1StorageAlignStepNode-members.html |    1 +
 ...1_1auto__scheduler_1_1StorageAlignStepNode.html |    1 +
 ..._1auto__scheduler_1_1TuningOptions-members.html |    1 +
 ...asstvm_1_1auto__scheduler_1_1TuningOptions.html |    1 +
 ...to__scheduler_1_1TuningOptionsNode-members.html |    1 +
 ...vm_1_1auto__scheduler_1_1TuningOptionsNode.html |    1 +
 ...classtvm_1_1detail_1_1AttrDocEntry-members.html |    1 +
 .../classtvm_1_1detail_1_1AttrDocEntry.html        |    1 +
 ...asstvm_1_1detail_1_1AttrDocVisitor-members.html |    1 +
 .../classtvm_1_1detail_1_1AttrDocVisitor.html      |    1 +
 ...stvm_1_1detail_1_1AttrExistVisitor-members.html |    1 +
 .../classtvm_1_1detail_1_1AttrExistVisitor.html    |    1 +
 ...sstvm_1_1detail_1_1AttrInitVisitor-members.html |    1 +
 .../classtvm_1_1detail_1_1AttrInitVisitor.html     |    1 +
 ...1_1detail_1_1AttrNonDefaultVisitor-members.html |    1 +
 ...lasstvm_1_1detail_1_1AttrNonDefaultVisitor.html |    1 +
 ...tvm_1_1detail_1_1AttrNormalVisitor-members.html |    1 +
 .../classtvm_1_1detail_1_1AttrNormalVisitor.html   |    1 +
 ...vm_1_1detail_1_1AttrsSEqualVisitor-members.html |    1 +
 .../classtvm_1_1detail_1_1AttrsSEqualVisitor.html  |    1 +
 ...tvm_1_1detail_1_1AttrsSHashVisitor-members.html |    1 +
 .../classtvm_1_1detail_1_1AttrsSHashVisitor.html   |    1 +
 .../classtvm_1_1parser_1_1Source-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1parser_1_1Source.html |    1 +
 .../classtvm_1_1parser_1_1SourceMap-members.html   |    1 +
 .../doxygen/classtvm_1_1parser_1_1SourceMap.html   |    1 +
 ...lasstvm_1_1parser_1_1SourceMapNode-members.html |    1 +
 .../classtvm_1_1parser_1_1SourceMapNode.html       |    1 +
 .../classtvm_1_1parser_1_1SourceNode-members.html  |    1 +
 .../doxygen/classtvm_1_1parser_1_1SourceNode.html  |    1 +
 .../classtvm_1_1relay_1_1AltPattern-members.html   |    1 +
 .../doxygen/classtvm_1_1relay_1_1AltPattern.html   |    1 +
 ...lasstvm_1_1relay_1_1AltPatternNode-members.html |    1 +
 .../classtvm_1_1relay_1_1AltPatternNode.html       |    1 +
 .../classtvm_1_1relay_1_1AttrPattern-members.html  |    1 +
 .../doxygen/classtvm_1_1relay_1_1AttrPattern.html  |    1 +
 ...asstvm_1_1relay_1_1AttrPatternNode-members.html |    1 +
 .../classtvm_1_1relay_1_1AttrPatternNode.html      |    1 +
 .../doxygen/classtvm_1_1relay_1_1Call-members.html |    1 +
 docs/api/doxygen/classtvm_1_1relay_1_1Call.html    |    1 +
 .../classtvm_1_1relay_1_1CallNode-members.html     |    1 +
 .../api/doxygen/classtvm_1_1relay_1_1CallNode.html |    1 +
 .../classtvm_1_1relay_1_1CallPattern-members.html  |    1 +
 .../doxygen/classtvm_1_1relay_1_1CallPattern.html  |    1 +
 ...asstvm_1_1relay_1_1CallPatternNode-members.html |    1 +
 .../classtvm_1_1relay_1_1CallPatternNode.html      |    1 +
 .../classtvm_1_1relay_1_1Clause-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1relay_1_1Clause.html  |    1 +
 .../classtvm_1_1relay_1_1ClauseNode-members.html   |    1 +
 .../doxygen/classtvm_1_1relay_1_1ClauseNode.html   |    1 +
 .../classtvm_1_1relay_1_1Constant-members.html     |    1 +
 .../api/doxygen/classtvm_1_1relay_1_1Constant.html |    1 +
 .../classtvm_1_1relay_1_1ConstantNode-members.html |    1 +
 .../doxygen/classtvm_1_1relay_1_1ConstantNode.html |    1 +
 ...asstvm_1_1relay_1_1ConstantPattern-members.html |    1 +
 .../classtvm_1_1relay_1_1ConstantPattern.html      |    1 +
 ...vm_1_1relay_1_1ConstantPatternNode-members.html |    1 +
 .../classtvm_1_1relay_1_1ConstantPatternNode.html  |    1 +
 ...sstvm_1_1relay_1_1ConstructorValue-members.html |    1 +
 .../classtvm_1_1relay_1_1ConstructorValue.html     |    1 +
 .../classtvm_1_1relay_1_1DFPattern-members.html    |    1 +
 .../doxygen/classtvm_1_1relay_1_1DFPattern.html    |    1 +
 ...stvm_1_1relay_1_1DFPatternCallback-members.html |    1 +
 .../classtvm_1_1relay_1_1DFPatternCallback.html    |    1 +
 ..._1_1relay_1_1DFPatternCallbackNode-members.html |    1 +
 ...classtvm_1_1relay_1_1DFPatternCallbackNode.html |    1 +
 .../classtvm_1_1relay_1_1DFPatternFunctor.html     |    1 +
 ...Pattern_01_6n_00_01Args_8_8_8_08_4-members.html |    1 +
 ...nst_01DFPattern_01_6n_00_01Args_8_8_8_08_4.html |    1 +
 ...classtvm_1_1relay_1_1DFPatternNode-members.html |    1 +
 .../classtvm_1_1relay_1_1DFPatternNode.html        |    1 +
 ...sstvm_1_1relay_1_1DFPatternVisitor-members.html |    1 +
 .../classtvm_1_1relay_1_1DFPatternVisitor.html     |    1 +
 ...asstvm_1_1relay_1_1DataTypePattern-members.html |    1 +
 .../classtvm_1_1relay_1_1DataTypePattern.html      |    1 +
 ...vm_1_1relay_1_1DataTypePatternNode-members.html |    1 +
 .../classtvm_1_1relay_1_1DataTypePatternNode.html  |    1 +
 ...sstvm_1_1relay_1_1DominatorPattern-members.html |    1 +
 .../classtvm_1_1relay_1_1DominatorPattern.html     |    1 +
 ...m_1_1relay_1_1DominatorPatternNode-members.html |    1 +
 .../classtvm_1_1relay_1_1DominatorPatternNode.html |    1 +
 .../doxygen/classtvm_1_1relay_1_1ExprFunctor.html  |    1 +
 ..._01Expr_01_6n_00_01Args_8_8_8_08_4-members.html |    1 +
 ..._07const_01Expr_01_6n_00_01Args_8_8_8_08_4.html |    1 +
 .../classtvm_1_1relay_1_1ExprMutator-members.html  |    1 +
 .../doxygen/classtvm_1_1relay_1_1ExprMutator.html  |    1 +
 .../classtvm_1_1relay_1_1ExprPattern-members.html  |    1 +
 .../doxygen/classtvm_1_1relay_1_1ExprPattern.html  |    1 +
 ...asstvm_1_1relay_1_1ExprPatternNode-members.html |    1 +
 .../classtvm_1_1relay_1_1ExprPatternNode.html      |    1 +
 .../classtvm_1_1relay_1_1ExprRewriter-members.html |    1 +
 .../doxygen/classtvm_1_1relay_1_1ExprRewriter.html |    1 +
 .../classtvm_1_1relay_1_1ExprVisitor-members.html  |    1 +
 .../doxygen/classtvm_1_1relay_1_1ExprVisitor.html  |    1 +
 .../classtvm_1_1relay_1_1FeatureSet-members.html   |    1 +
 .../doxygen/classtvm_1_1relay_1_1FeatureSet.html   |    1 +
 .../classtvm_1_1relay_1_1Function-members.html     |    1 +
 .../api/doxygen/classtvm_1_1relay_1_1Function.html |    1 +
 .../classtvm_1_1relay_1_1FunctionNode-members.html |    1 +
 .../doxygen/classtvm_1_1relay_1_1FunctionNode.html |    1 +
 ...asstvm_1_1relay_1_1FunctionPattern-members.html |    1 +
 .../classtvm_1_1relay_1_1FunctionPattern.html      |    1 +
 ...vm_1_1relay_1_1FunctionPatternNode-members.html |    1 +
 .../classtvm_1_1relay_1_1FunctionPatternNode.html  |    1 +
 .../doxygen/classtvm_1_1relay_1_1Id-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1relay_1_1Id.html      |    1 +
 .../classtvm_1_1relay_1_1IdNode-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1relay_1_1IdNode.html  |    1 +
 .../doxygen/classtvm_1_1relay_1_1If-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1relay_1_1If.html      |    1 +
 .../classtvm_1_1relay_1_1IfNode-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1relay_1_1IfNode.html  |    1 +
 .../classtvm_1_1relay_1_1IfPattern-members.html    |    1 +
 .../doxygen/classtvm_1_1relay_1_1IfPattern.html    |    1 +
 ...classtvm_1_1relay_1_1IfPatternNode-members.html |    1 +
 .../classtvm_1_1relay_1_1IfPatternNode.html        |    1 +
 ...tvm_1_1relay_1_1InterpreterClosure-members.html |    1 +
 .../classtvm_1_1relay_1_1InterpreterClosure.html   |    1 +
 ..._1_1relay_1_1InterpreterClosureObj-members.html |    1 +
 ...classtvm_1_1relay_1_1InterpreterClosureObj.html |    1 +
 .../doxygen/classtvm_1_1relay_1_1Let-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1relay_1_1Let.html     |    1 +
 .../classtvm_1_1relay_1_1LetNode-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1relay_1_1LetNode.html |    1 +
 .../classtvm_1_1relay_1_1LetPattern-members.html   |    1 +
 .../doxygen/classtvm_1_1relay_1_1LetPattern.html   |    1 +
 ...lasstvm_1_1relay_1_1LetPatternNode-members.html |    1 +
 .../classtvm_1_1relay_1_1LetPatternNode.html       |    1 +
 .../classtvm_1_1relay_1_1Match-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1relay_1_1Match.html   |    1 +
 .../classtvm_1_1relay_1_1MatchNode-members.html    |    1 +
 .../doxygen/classtvm_1_1relay_1_1MatchNode.html    |    1 +
 ...sstvm_1_1relay_1_1MixedModeMutator-members.html |    1 +
 .../classtvm_1_1relay_1_1MixedModeMutator.html     |    1 +
 ...sstvm_1_1relay_1_1MixedModeVisitor-members.html |    1 +
 .../classtvm_1_1relay_1_1MixedModeVisitor.html     |    1 +
 ...sstvm_1_1relay_1_1OpImplementation-members.html |    1 +
 .../classtvm_1_1relay_1_1OpImplementation.html     |    1 +
 ...m_1_1relay_1_1OpImplementationNode-members.html |    1 +
 .../classtvm_1_1relay_1_1OpImplementationNode.html |    1 +
 ...sstvm_1_1relay_1_1OpSpecialization-members.html |    1 +
 .../classtvm_1_1relay_1_1OpSpecialization.html     |    1 +
 ...m_1_1relay_1_1OpSpecializationNode-members.html |    1 +
 .../classtvm_1_1relay_1_1OpSpecializationNode.html |    1 +
 .../classtvm_1_1relay_1_1OpStrategy-members.html   |    1 +
 .../doxygen/classtvm_1_1relay_1_1OpStrategy.html   |    1 +
 ...lasstvm_1_1relay_1_1OpStrategyNode-members.html |    1 +
 .../classtvm_1_1relay_1_1OpStrategyNode.html       |    1 +
 .../classtvm_1_1relay_1_1Pattern-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1relay_1_1Pattern.html |    1 +
 ...tvm_1_1relay_1_1PatternConstructor-members.html |    1 +
 .../classtvm_1_1relay_1_1PatternConstructor.html   |    1 +
 ...1_1relay_1_1PatternConstructorNode-members.html |    1 +
 ...lasstvm_1_1relay_1_1PatternConstructorNode.html |    1 +
 .../classtvm_1_1relay_1_1PatternFunctor.html       |    1 +
 ...Pattern_01_6n_00_01Args_8_8_8_08_4-members.html |    1 +
 ...const_01Pattern_01_6n_00_01Args_8_8_8_08_4.html |    1 +
 ...lasstvm_1_1relay_1_1PatternMutator-members.html |    1 +
 .../classtvm_1_1relay_1_1PatternMutator.html       |    1 +
 .../classtvm_1_1relay_1_1PatternNode-members.html  |    1 +
 .../doxygen/classtvm_1_1relay_1_1PatternNode.html  |    1 +
 .../classtvm_1_1relay_1_1PatternTuple-members.html |    1 +
 .../doxygen/classtvm_1_1relay_1_1PatternTuple.html |    1 +
 ...sstvm_1_1relay_1_1PatternTupleNode-members.html |    1 +
 .../classtvm_1_1relay_1_1PatternTupleNode.html     |    1 +
 .../classtvm_1_1relay_1_1PatternVar-members.html   |    1 +
 .../doxygen/classtvm_1_1relay_1_1PatternVar.html   |    1 +
 ...lasstvm_1_1relay_1_1PatternVarNode-members.html |    1 +
 .../classtvm_1_1relay_1_1PatternVarNode.html       |    1 +
 ...lasstvm_1_1relay_1_1PatternVisitor-members.html |    1 +
 .../classtvm_1_1relay_1_1PatternVisitor.html       |    1 +
 ...asstvm_1_1relay_1_1PatternWildcard-members.html |    1 +
 .../classtvm_1_1relay_1_1PatternWildcard.html      |    1 +
 ...vm_1_1relay_1_1PatternWildcardNode-members.html |    1 +
 .../classtvm_1_1relay_1_1PatternWildcardNode.html  |    1 +
 .../classtvm_1_1relay_1_1RecClosure-members.html   |    1 +
 .../doxygen/classtvm_1_1relay_1_1RecClosure.html   |    1 +
 ...classtvm_1_1relay_1_1RecClosureObj-members.html |    1 +
 .../classtvm_1_1relay_1_1RecClosureObj.html        |    1 +
 .../classtvm_1_1relay_1_1RefCreate-members.html    |    1 +
 .../doxygen/classtvm_1_1relay_1_1RefCreate.html    |    1 +
 ...classtvm_1_1relay_1_1RefCreateNode-members.html |    1 +
 .../classtvm_1_1relay_1_1RefCreateNode.html        |    1 +
 .../classtvm_1_1relay_1_1RefRead-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1relay_1_1RefRead.html |    1 +
 .../classtvm_1_1relay_1_1RefReadNode-members.html  |    1 +
 .../doxygen/classtvm_1_1relay_1_1RefReadNode.html  |    1 +
 .../classtvm_1_1relay_1_1RefValue-members.html     |    1 +
 .../api/doxygen/classtvm_1_1relay_1_1RefValue.html |    1 +
 .../classtvm_1_1relay_1_1RefWrite-members.html     |    1 +
 .../api/doxygen/classtvm_1_1relay_1_1RefWrite.html |    1 +
 .../classtvm_1_1relay_1_1RefWriteNode-members.html |    1 +
 .../doxygen/classtvm_1_1relay_1_1RefWriteNode.html |    1 +
 .../classtvm_1_1relay_1_1RelayNode-members.html    |    1 +
 .../doxygen/classtvm_1_1relay_1_1RelayNode.html    |    1 +
 .../classtvm_1_1relay_1_1ShapePattern-members.html |    1 +
 .../doxygen/classtvm_1_1relay_1_1ShapePattern.html |    1 +
 ...sstvm_1_1relay_1_1ShapePatternNode-members.html |    1 +
 .../classtvm_1_1relay_1_1ShapePatternNode.html     |    1 +
 .../classtvm_1_1relay_1_1TempExpr-members.html     |    1 +
 .../api/doxygen/classtvm_1_1relay_1_1TempExpr.html |    1 +
 .../classtvm_1_1relay_1_1TempExprNode-members.html |    1 +
 .../doxygen/classtvm_1_1relay_1_1TempExprNode.html |    1 +
 .../classtvm_1_1relay_1_1Tuple-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1relay_1_1Tuple.html   |    1 +
 .../classtvm_1_1relay_1_1TupleGetItem-members.html |    1 +
 .../doxygen/classtvm_1_1relay_1_1TupleGetItem.html |    1 +
 ...sstvm_1_1relay_1_1TupleGetItemNode-members.html |    1 +
 .../classtvm_1_1relay_1_1TupleGetItemNode.html     |    1 +
 ...vm_1_1relay_1_1TupleGetItemPattern-members.html |    1 +
 .../classtvm_1_1relay_1_1TupleGetItemPattern.html  |    1 +
 ..._1relay_1_1TupleGetItemPatternNode-members.html |    1 +
 ...asstvm_1_1relay_1_1TupleGetItemPatternNode.html |    1 +
 .../classtvm_1_1relay_1_1TupleNode-members.html    |    1 +
 .../doxygen/classtvm_1_1relay_1_1TupleNode.html    |    1 +
 .../classtvm_1_1relay_1_1TuplePattern-members.html |    1 +
 .../doxygen/classtvm_1_1relay_1_1TuplePattern.html |    1 +
 ...sstvm_1_1relay_1_1TuplePatternNode-members.html |    1 +
 .../classtvm_1_1relay_1_1TuplePatternNode.html     |    1 +
 .../classtvm_1_1relay_1_1TypePattern-members.html  |    1 +
 .../doxygen/classtvm_1_1relay_1_1TypePattern.html  |    1 +
 ...asstvm_1_1relay_1_1TypePatternNode-members.html |    1 +
 .../classtvm_1_1relay_1_1TypePatternNode.html      |    1 +
 .../doxygen/classtvm_1_1relay_1_1Var-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1relay_1_1Var.html     |    1 +
 .../classtvm_1_1relay_1_1VarNode-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1relay_1_1VarNode.html |    1 +
 .../classtvm_1_1relay_1_1VarPattern-members.html   |    1 +
 .../doxygen/classtvm_1_1relay_1_1VarPattern.html   |    1 +
 ...lasstvm_1_1relay_1_1VarPatternNode-members.html |    1 +
 .../classtvm_1_1relay_1_1VarPatternNode.html       |    1 +
 ...asstvm_1_1relay_1_1WildcardPattern-members.html |    1 +
 .../classtvm_1_1relay_1_1WildcardPattern.html      |    1 +
 ...vm_1_1relay_1_1WildcardPatternNode-members.html |    1 +
 .../classtvm_1_1relay_1_1WildcardPatternNode.html  |    1 +
 .../classtvm_1_1runtime_1_1ADT-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1runtime_1_1ADT.html   |    1 +
 .../classtvm_1_1runtime_1_1ADTObj-members.html     |    1 +
 .../api/doxygen/classtvm_1_1runtime_1_1ADTObj.html |    1 +
 .../classtvm_1_1runtime_1_1Array-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1runtime_1_1Array.html |    1 +
 .../classtvm_1_1runtime_1_1ArrayNode-members.html  |    1 +
 .../doxygen/classtvm_1_1runtime_1_1ArrayNode.html  |    1 +
 .../classtvm_1_1runtime_1_1Closure-members.html    |    1 +
 .../doxygen/classtvm_1_1runtime_1_1Closure.html    |    1 +
 .../classtvm_1_1runtime_1_1ClosureObj-members.html |    1 +
 .../doxygen/classtvm_1_1runtime_1_1ClosureObj.html |    1 +
 .../classtvm_1_1runtime_1_1DataType-members.html   |    1 +
 .../doxygen/classtvm_1_1runtime_1_1DataType.html   |    1 +
 ...lasstvm_1_1runtime_1_1DenseMapNode-members.html |    1 +
 .../classtvm_1_1runtime_1_1DenseMapNode.html       |    1 +
 .../classtvm_1_1runtime_1_1DeviceAPI-members.html  |    1 +
 .../doxygen/classtvm_1_1runtime_1_1DeviceAPI.html  |    1 +
 ...tvm_1_1runtime_1_1InplaceArrayBase-members.html |    1 +
 .../classtvm_1_1runtime_1_1InplaceArrayBase.html   |    1 +
 ...classtvm_1_1runtime_1_1IterAdapter-members.html |    1 +
 .../classtvm_1_1runtime_1_1IterAdapter.html        |    1 +
 .../classtvm_1_1runtime_1_1Map-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1runtime_1_1Map.html   |    1 +
 .../classtvm_1_1runtime_1_1MapNode-members.html    |    1 +
 .../doxygen/classtvm_1_1runtime_1_1MapNode.html    |    1 +
 ..._1_1runtime_1_1MapNode_1_1iterator-members.html |    1 +
 ...classtvm_1_1runtime_1_1MapNode_1_1iterator.html |    1 +
 ...stvm_1_1runtime_1_1Map_1_1iterator-members.html |    1 +
 .../classtvm_1_1runtime_1_1Map_1_1iterator.html    |    1 +
 .../classtvm_1_1runtime_1_1Module-members.html     |    1 +
 .../api/doxygen/classtvm_1_1runtime_1_1Module.html |    1 +
 .../classtvm_1_1runtime_1_1ModuleNode-members.html |    1 +
 .../doxygen/classtvm_1_1runtime_1_1ModuleNode.html |    1 +
 .../classtvm_1_1runtime_1_1NDArray-members.html    |    1 +
 .../doxygen/classtvm_1_1runtime_1_1NDArray.html    |    1 +
 ...1_1runtime_1_1NDArray_1_1Container-members.html |    1 +
 ...lasstvm_1_1runtime_1_1NDArray_1_1Container.html |    1 +
 ...untime_1_1NDArray_1_1ContainerBase-members.html |    1 +
 ...tvm_1_1runtime_1_1NDArray_1_1ContainerBase.html |    1 +
 ...tvm_1_1runtime_1_1ObjAllocatorBase-members.html |    1 +
 .../classtvm_1_1runtime_1_1ObjAllocatorBase.html   |    1 +
 .../classtvm_1_1runtime_1_1Object-members.html     |    1 +
 .../api/doxygen/classtvm_1_1runtime_1_1Object.html |    1 +
 .../classtvm_1_1runtime_1_1ObjectPtr-members.html  |    1 +
 .../doxygen/classtvm_1_1runtime_1_1ObjectPtr.html  |    1 +
 .../classtvm_1_1runtime_1_1ObjectRef-members.html  |    1 +
 .../doxygen/classtvm_1_1runtime_1_1ObjectRef.html  |    1 +
 .../classtvm_1_1runtime_1_1Optional-members.html   |    1 +
 .../doxygen/classtvm_1_1runtime_1_1Optional.html   |    1 +
 .../classtvm_1_1runtime_1_1PackedFunc-members.html |    1 +
 .../doxygen/classtvm_1_1runtime_1_1PackedFunc.html |    1 +
 .../classtvm_1_1runtime_1_1Registry-members.html   |    1 +
 .../doxygen/classtvm_1_1runtime_1_1Registry.html   |    1 +
 ...m_1_1runtime_1_1ReverseIterAdapter-members.html |    1 +
 .../classtvm_1_1runtime_1_1ReverseIterAdapter.html |    1 +
 ...m_1_1runtime_1_1SimpleObjAllocator-members.html |    1 +
 .../classtvm_1_1runtime_1_1SimpleObjAllocator.html |    1 +
 ...SimpleObjAllocator_1_1ArrayHandler-members.html |    1 +
 ...time_1_1SimpleObjAllocator_1_1ArrayHandler.html |    1 +
 ...e_1_1SimpleObjAllocator_1_1Handler-members.html |    1 +
 ..._1runtime_1_1SimpleObjAllocator_1_1Handler.html |    1 +
 ...lasstvm_1_1runtime_1_1SmallMapNode-members.html |    1 +
 .../classtvm_1_1runtime_1_1SmallMapNode.html       |    1 +
 .../classtvm_1_1runtime_1_1String-members.html     |    1 +
 .../api/doxygen/classtvm_1_1runtime_1_1String.html |    1 +
 .../classtvm_1_1runtime_1_1StringObj-members.html  |    1 +
 .../doxygen/classtvm_1_1runtime_1_1StringObj.html  |    1 +
 ...1_1runtime_1_1StringObj_1_1FromStd-members.html |    1 +
 ...lasstvm_1_1runtime_1_1StringObj_1_1FromStd.html |    1 +
 ...classtvm_1_1runtime_1_1TVMArgValue-members.html |    1 +
 .../classtvm_1_1runtime_1_1TVMArgValue.html        |    1 +
 .../classtvm_1_1runtime_1_1TVMArgs-members.html    |    1 +
 .../doxygen/classtvm_1_1runtime_1_1TVMArgs.html    |    1 +
 ...asstvm_1_1runtime_1_1TVMArgsSetter-members.html |    1 +
 .../classtvm_1_1runtime_1_1TVMArgsSetter.html      |    1 +
 ...1_1TVMMovableArgValueWithContext__-members.html |    1 +
 ...runtime_1_1TVMMovableArgValueWithContext__.html |    1 +
 ...1_1runtime_1_1TVMMovableArgValue__-members.html |    1 +
 ...lasstvm_1_1runtime_1_1TVMMovableArgValue__.html |    1 +
 ...asstvm_1_1runtime_1_1TVMPODValue__-members.html |    1 +
 .../classtvm_1_1runtime_1_1TVMPODValue__.html      |    1 +
 ...classtvm_1_1runtime_1_1TVMRetValue-members.html |    1 +
 .../classtvm_1_1runtime_1_1TVMRetValue.html        |    1 +
 .../classtvm_1_1runtime_1_1Timer-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1runtime_1_1Timer.html |    1 +
 .../classtvm_1_1runtime_1_1TimerNode-members.html  |    1 +
 .../doxygen/classtvm_1_1runtime_1_1TimerNode.html  |    1 +
 .../classtvm_1_1runtime_1_1TypedPackedFunc.html    |    1 +
 ...PackedFunc_3_01R_07Args_8_8_8_08_4-members.html |    1 +
 ...1_1TypedPackedFunc_3_01R_07Args_8_8_8_08_4.html |    1 +
 ...ntime_1_1micro__rpc_1_1FrameBuffer-members.html |    1 +
 ...vm_1_1runtime_1_1micro__rpc_1_1FrameBuffer.html |    1 +
 ...1_1runtime_1_1micro__rpc_1_1Framer-members.html |    1 +
 ...lasstvm_1_1runtime_1_1micro__rpc_1_1Framer.html |    1 +
 ...micro__rpc_1_1PacketFieldSizeBytes-members.html |    1 +
 ...time_1_1micro__rpc_1_1PacketFieldSizeBytes.html |    1 +
 ..._1runtime_1_1micro__rpc_1_1Session-members.html |    1 +
 ...asstvm_1_1runtime_1_1micro__rpc_1_1Session.html |    1 +
 ...1runtime_1_1micro__rpc_1_1Unframer-members.html |    1 +
 ...sstvm_1_1runtime_1_1micro__rpc_1_1Unframer.html |    1 +
 ...ntime_1_1micro__rpc_1_1WriteStream-members.html |    1 +
 ...vm_1_1runtime_1_1micro__rpc_1_1WriteStream.html |    1 +
 ...1runtime_1_1profiling_1_1CountNode-members.html |    1 +
 ...sstvm_1_1runtime_1_1profiling_1_1CountNode.html |    1 +
 ...ntime_1_1profiling_1_1DurationNode-members.html |    1 +
 ...vm_1_1runtime_1_1profiling_1_1DurationNode.html |    1 +
 ...untime_1_1profiling_1_1PercentNode-members.html |    1 +
 ...tvm_1_1runtime_1_1profiling_1_1PercentNode.html |    1 +
 ..._1runtime_1_1profiling_1_1Profiler-members.html |    1 +
 ...asstvm_1_1runtime_1_1profiling_1_1Profiler.html |    1 +
 ..._1_1runtime_1_1profiling_1_1Report-members.html |    1 +
 ...classtvm_1_1runtime_1_1profiling_1_1Report.html |    1 +
 ...runtime_1_1profiling_1_1ReportNode-members.html |    1 +
 ...stvm_1_1runtime_1_1profiling_1_1ReportNode.html |    1 +
 ...untime_1_1threading_1_1ThreadGroup-members.html |    1 +
 ...tvm_1_1runtime_1_1threading_1_1ThreadGroup.html |    1 +
 ...stvm_1_1runtime_1_1vm_1_1Allocator-members.html |    1 +
 .../classtvm_1_1runtime_1_1vm_1_1Allocator.html    |    1 +
 ...tvm_1_1runtime_1_1vm_1_1Executable-members.html |    1 +
 .../classtvm_1_1runtime_1_1vm_1_1Executable.html   |    1 +
 ..._1_1runtime_1_1vm_1_1MemoryManager-members.html |    1 +
 ...classtvm_1_1runtime_1_1vm_1_1MemoryManager.html |    1 +
 ...asstvm_1_1runtime_1_1vm_1_1Storage-members.html |    1 +
 .../classtvm_1_1runtime_1_1vm_1_1Storage.html      |    1 +
 ...tvm_1_1runtime_1_1vm_1_1StorageObj-members.html |    1 +
 .../classtvm_1_1runtime_1_1vm_1_1StorageObj.html   |    1 +
 ...stvm_1_1runtime_1_1vm_1_1VMClosure-members.html |    1 +
 .../classtvm_1_1runtime_1_1vm_1_1VMClosure.html    |    1 +
 ...m_1_1runtime_1_1vm_1_1VMClosureObj-members.html |    1 +
 .../classtvm_1_1runtime_1_1vm_1_1VMClosureObj.html |    1 +
 ...1_1runtime_1_1vm_1_1VirtualMachine-members.html |    1 +
 ...lasstvm_1_1runtime_1_1vm_1_1VirtualMachine.html |    1 +
 ...lasstvm_1_1te_1_1BaseComputeOpNode-members.html |    1 +
 .../classtvm_1_1te_1_1BaseComputeOpNode.html       |    1 +
 .../classtvm_1_1te_1_1ComputeOp-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1te_1_1ComputeOp.html  |    1 +
 .../classtvm_1_1te_1_1ComputeOpNode-members.html   |    1 +
 .../doxygen/classtvm_1_1te_1_1ComputeOpNode.html   |    1 +
 .../classtvm_1_1te_1_1ExternOp-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1te_1_1ExternOp.html   |    1 +
 .../classtvm_1_1te_1_1ExternOpNode-members.html    |    1 +
 .../doxygen/classtvm_1_1te_1_1ExternOpNode.html    |    1 +
 .../doxygen/classtvm_1_1te_1_1Fuse-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1te_1_1Fuse.html       |    1 +
 .../classtvm_1_1te_1_1FuseNode-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1te_1_1FuseNode.html   |    1 +
 .../classtvm_1_1te_1_1HybridOp-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1te_1_1HybridOp.html   |    1 +
 .../classtvm_1_1te_1_1HybridOpNode-members.html    |    1 +
 .../doxygen/classtvm_1_1te_1_1HybridOpNode.html    |    1 +
 .../classtvm_1_1te_1_1IterVarAttr-members.html     |    1 +
 .../api/doxygen/classtvm_1_1te_1_1IterVarAttr.html |    1 +
 .../classtvm_1_1te_1_1IterVarAttrNode-members.html |    1 +
 .../doxygen/classtvm_1_1te_1_1IterVarAttrNode.html |    1 +
 .../classtvm_1_1te_1_1IterVarRelation-members.html |    1 +
 .../doxygen/classtvm_1_1te_1_1IterVarRelation.html |    1 +
 ...sstvm_1_1te_1_1IterVarRelationNode-members.html |    1 +
 .../classtvm_1_1te_1_1IterVarRelationNode.html     |    1 +
 .../classtvm_1_1te_1_1Operation-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1te_1_1Operation.html  |    1 +
 .../classtvm_1_1te_1_1OperationNode-members.html   |    1 +
 .../doxygen/classtvm_1_1te_1_1OperationNode.html   |    1 +
 .../classtvm_1_1te_1_1PlaceholderOp-members.html   |    1 +
 .../doxygen/classtvm_1_1te_1_1PlaceholderOp.html   |    1 +
 ...lasstvm_1_1te_1_1PlaceholderOpNode-members.html |    1 +
 .../classtvm_1_1te_1_1PlaceholderOpNode.html       |    1 +
 .../doxygen/classtvm_1_1te_1_1Rebase-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1te_1_1Rebase.html     |    1 +
 .../classtvm_1_1te_1_1RebaseNode-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1te_1_1RebaseNode.html |    1 +
 .../doxygen/classtvm_1_1te_1_1ScanOp-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1te_1_1ScanOp.html     |    1 +
 .../classtvm_1_1te_1_1ScanOpNode-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1te_1_1ScanOpNode.html |    1 +
 .../classtvm_1_1te_1_1Schedule-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1te_1_1Schedule.html   |    1 +
 .../classtvm_1_1te_1_1ScheduleNode-members.html    |    1 +
 .../doxygen/classtvm_1_1te_1_1ScheduleNode.html    |    1 +
 .../classtvm_1_1te_1_1Singleton-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1te_1_1Singleton.html  |    1 +
 .../classtvm_1_1te_1_1SingletonNode-members.html   |    1 +
 .../doxygen/classtvm_1_1te_1_1SingletonNode.html   |    1 +
 ...stvm_1_1te_1_1SpecializedCondition-members.html |    1 +
 .../classtvm_1_1te_1_1SpecializedCondition.html    |    1 +
 ..._1_1te_1_1SpecializedConditionNode-members.html |    1 +
 ...classtvm_1_1te_1_1SpecializedConditionNode.html |    1 +
 .../doxygen/classtvm_1_1te_1_1Split-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1te_1_1Split.html      |    1 +
 .../classtvm_1_1te_1_1SplitNode-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1te_1_1SplitNode.html  |    1 +
 .../doxygen/classtvm_1_1te_1_1Stage-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1te_1_1Stage.html      |    1 +
 .../classtvm_1_1te_1_1StageNode-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1te_1_1StageNode.html  |    1 +
 .../doxygen/classtvm_1_1te_1_1Tensor-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1te_1_1Tensor.html     |    1 +
 .../classtvm_1_1te_1_1TensorComputeOp-members.html |    1 +
 .../doxygen/classtvm_1_1te_1_1TensorComputeOp.html |    1 +
 ...sstvm_1_1te_1_1TensorComputeOpNode-members.html |    1 +
 .../classtvm_1_1te_1_1TensorComputeOpNode.html     |    1 +
 .../classtvm_1_1te_1_1TensorIntrin-members.html    |    1 +
 .../doxygen/classtvm_1_1te_1_1TensorIntrin.html    |    1 +
 ...classtvm_1_1te_1_1TensorIntrinCall-members.html |    1 +
 .../classtvm_1_1te_1_1TensorIntrinCall.html        |    1 +
 ...stvm_1_1te_1_1TensorIntrinCallNode-members.html |    1 +
 .../classtvm_1_1te_1_1TensorIntrinCallNode.html    |    1 +
 ...classtvm_1_1te_1_1TensorIntrinNode-members.html |    1 +
 .../classtvm_1_1te_1_1TensorIntrinNode.html        |    1 +
 .../classtvm_1_1te_1_1TensorNode-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1te_1_1TensorNode.html |    1 +
 .../classtvm_1_1te_1_1Tensor_1_1Slice-members.html |    1 +
 .../doxygen/classtvm_1_1te_1_1Tensor_1_1Slice.html |    1 +
 .../doxygen/classtvm_1_1tir_1_1Add-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Add.html       |    1 +
 .../classtvm_1_1tir_1_1AddNode-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1AddNode.html   |    1 +
 .../classtvm_1_1tir_1_1Allocate-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Allocate.html  |    1 +
 .../classtvm_1_1tir_1_1AllocateNode-members.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1AllocateNode.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1And-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1And.html       |    1 +
 .../classtvm_1_1tir_1_1AndNode-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1AndNode.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1Any-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Any.html       |    1 +
 .../classtvm_1_1tir_1_1AnyNode-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1AnyNode.html   |    1 +
 .../classtvm_1_1tir_1_1AssertStmt-members.html     |    1 +
 .../api/doxygen/classtvm_1_1tir_1_1AssertStmt.html |    1 +
 .../classtvm_1_1tir_1_1AssertStmtNode-members.html |    1 +
 .../doxygen/classtvm_1_1tir_1_1AssertStmtNode.html |    1 +
 .../classtvm_1_1tir_1_1AttrStmt-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1AttrStmt.html  |    1 +
 .../classtvm_1_1tir_1_1AttrStmtNode-members.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1AttrStmtNode.html   |    1 +
 ...classtvm_1_1tir_1_1BijectiveLayout-members.html |    1 +
 .../classtvm_1_1tir_1_1BijectiveLayout.html        |    1 +
 ...stvm_1_1tir_1_1BijectiveLayoutNode-members.html |    1 +
 .../classtvm_1_1tir_1_1BijectiveLayoutNode.html    |    1 +
 .../classtvm_1_1tir_1_1BinaryOpNode-members.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1BinaryOpNode.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1Block-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Block.html     |    1 +
 .../classtvm_1_1tir_1_1BlockNode-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1BlockNode.html |    1 +
 .../classtvm_1_1tir_1_1BlockRV-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1BlockRV.html   |    1 +
 .../classtvm_1_1tir_1_1BlockRVNode-members.html    |    1 +
 .../doxygen/classtvm_1_1tir_1_1BlockRVNode.html    |    1 +
 .../classtvm_1_1tir_1_1BlockRealize-members.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1BlockRealize.html   |    1 +
 ...lasstvm_1_1tir_1_1BlockRealizeNode-members.html |    1 +
 .../classtvm_1_1tir_1_1BlockRealizeNode.html       |    1 +
 .../classtvm_1_1tir_1_1BlockScope-members.html     |    1 +
 .../api/doxygen/classtvm_1_1tir_1_1BlockScope.html |    1 +
 .../classtvm_1_1tir_1_1BlockScopeNode-members.html |    1 +
 .../doxygen/classtvm_1_1tir_1_1BlockScopeNode.html |    1 +
 .../classtvm_1_1tir_1_1Broadcast-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Broadcast.html |    1 +
 .../classtvm_1_1tir_1_1BroadcastNode-members.html  |    1 +
 .../doxygen/classtvm_1_1tir_1_1BroadcastNode.html  |    1 +
 .../doxygen/classtvm_1_1tir_1_1Buffer-members.html |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Buffer.html    |    1 +
 .../classtvm_1_1tir_1_1BufferLoad-members.html     |    8 +-
 .../api/doxygen/classtvm_1_1tir_1_1BufferLoad.html |   23 +-
 .../classtvm_1_1tir_1_1BufferLoadNode-members.html |    1 +
 .../doxygen/classtvm_1_1tir_1_1BufferLoadNode.html |    1 +
 .../classtvm_1_1tir_1_1BufferLoad__coll__graph.svg |   82 +-
 ...asstvm_1_1tir_1_1BufferLoad__inherit__graph.svg |   82 +-
 .../classtvm_1_1tir_1_1BufferNode-members.html     |    1 +
 .../api/doxygen/classtvm_1_1tir_1_1BufferNode.html |    1 +
 .../classtvm_1_1tir_1_1BufferRealize-members.html  |    1 +
 .../doxygen/classtvm_1_1tir_1_1BufferRealize.html  |    1 +
 ...asstvm_1_1tir_1_1BufferRealizeNode-members.html |    1 +
 .../classtvm_1_1tir_1_1BufferRealizeNode.html      |    1 +
 .../classtvm_1_1tir_1_1BufferRegion-members.html   |    5 +-
 .../doxygen/classtvm_1_1tir_1_1BufferRegion.html   |   74 +-
 ...lasstvm_1_1tir_1_1BufferRegionNode-members.html |    1 +
 .../classtvm_1_1tir_1_1BufferRegionNode.html       |    1 +
 ...lasstvm_1_1tir_1_1BufferRegion__coll__graph.svg |   45 +-
 ...stvm_1_1tir_1_1BufferRegion__inherit__graph.svg |   45 +-
 .../classtvm_1_1tir_1_1BufferStore-members.html    |    6 +-
 .../doxygen/classtvm_1_1tir_1_1BufferStore.html    |   23 +-
 ...classtvm_1_1tir_1_1BufferStoreNode-members.html |    1 +
 .../classtvm_1_1tir_1_1BufferStoreNode.html        |    1 +
 ...classtvm_1_1tir_1_1BufferStore__coll__graph.svg |   58 +-
 ...sstvm_1_1tir_1_1BufferStore__inherit__graph.svg |   58 +-
 .../doxygen/classtvm_1_1tir_1_1Call-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Call.html      |    1 +
 .../classtvm_1_1tir_1_1CallNode-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1CallNode.html  |    1 +
 .../doxygen/classtvm_1_1tir_1_1Cast-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Cast.html      |    1 +
 .../classtvm_1_1tir_1_1CastNode-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1CastNode.html  |    1 +
 .../classtvm_1_1tir_1_1CmpOpNode-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1CmpOpNode.html |    1 +
 .../classtvm_1_1tir_1_1CommReducer-members.html    |    1 +
 .../doxygen/classtvm_1_1tir_1_1CommReducer.html    |    1 +
 ...classtvm_1_1tir_1_1CommReducerNode-members.html |    1 +
 .../classtvm_1_1tir_1_1CommReducerNode.html        |    1 +
 .../classtvm_1_1tir_1_1DataProducer-members.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1DataProducer.html   |    1 +
 ...lasstvm_1_1tir_1_1DataProducerNode-members.html |    1 +
 .../classtvm_1_1tir_1_1DataProducerNode.html       |    1 +
 .../classtvm_1_1tir_1_1Dependency-members.html     |    1 +
 .../api/doxygen/classtvm_1_1tir_1_1Dependency.html |    1 +
 .../classtvm_1_1tir_1_1DependencyNode-members.html |    1 +
 .../doxygen/classtvm_1_1tir_1_1DependencyNode.html |    1 +
 .../doxygen/classtvm_1_1tir_1_1Div-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Div.html       |    1 +
 .../classtvm_1_1tir_1_1DivNode-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1DivNode.html   |    1 +
 .../api/doxygen/classtvm_1_1tir_1_1EQ-members.html |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1EQ.html        |    1 +
 .../doxygen/classtvm_1_1tir_1_1EQNode-members.html |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1EQNode.html    |    1 +
 .../classtvm_1_1tir_1_1Evaluate-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Evaluate.html  |    1 +
 .../classtvm_1_1tir_1_1EvaluateNode-members.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1EvaluateNode.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1ExprFunctor.html    |    1 +
 ...rimExpr_01_6n_00_01Args_8_8_8_08_4-members.html |    1 +
 ...onst_01PrimExpr_01_6n_00_01Args_8_8_8_08_4.html |    1 +
 .../classtvm_1_1tir_1_1ExprMutator-members.html    |    1 +
 .../doxygen/classtvm_1_1tir_1_1ExprMutator.html    |    1 +
 .../classtvm_1_1tir_1_1ExprVisitor-members.html    |    1 +
 .../doxygen/classtvm_1_1tir_1_1ExprVisitor.html    |    1 +
 .../classtvm_1_1tir_1_1FloorDiv-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1FloorDiv.html  |    1 +
 .../classtvm_1_1tir_1_1FloorDivNode-members.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1FloorDivNode.html   |    1 +
 .../classtvm_1_1tir_1_1FloorMod-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1FloorMod.html  |    1 +
 .../classtvm_1_1tir_1_1FloorModNode-members.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1FloorModNode.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1For-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1For.html       |    1 +
 .../classtvm_1_1tir_1_1ForNode-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1ForNode.html   |    1 +
 .../api/doxygen/classtvm_1_1tir_1_1GE-members.html |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1GE.html        |    1 +
 .../doxygen/classtvm_1_1tir_1_1GENode-members.html |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1GENode.html    |    1 +
 .../api/doxygen/classtvm_1_1tir_1_1GT-members.html |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1GT.html        |    1 +
 .../doxygen/classtvm_1_1tir_1_1GTNode-members.html |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1GTNode.html    |    1 +
 .../classtvm_1_1tir_1_1IfThenElse-members.html     |    1 +
 .../api/doxygen/classtvm_1_1tir_1_1IfThenElse.html |    1 +
 .../classtvm_1_1tir_1_1IfThenElseNode-members.html |    1 +
 .../doxygen/classtvm_1_1tir_1_1IfThenElseNode.html |    1 +
 .../classtvm_1_1tir_1_1IterVar-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1IterVar.html   |    1 +
 .../classtvm_1_1tir_1_1IterVarNode-members.html    |    1 +
 .../doxygen/classtvm_1_1tir_1_1IterVarNode.html    |    1 +
 .../api/doxygen/classtvm_1_1tir_1_1LE-members.html |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1LE.html        |    1 +
 .../api/doxygen/classtvm_1_1tir_1_1LT-members.html |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1LT.html        |    1 +
 .../doxygen/classtvm_1_1tir_1_1LTNode-members.html |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1LTNode.html    |    1 +
 .../doxygen/classtvm_1_1tir_1_1Layout-members.html |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Layout.html    |    1 +
 .../classtvm_1_1tir_1_1LayoutAxis-members.html     |    1 +
 .../api/doxygen/classtvm_1_1tir_1_1LayoutAxis.html |    1 +
 .../classtvm_1_1tir_1_1LayoutNode-members.html     |    1 +
 .../api/doxygen/classtvm_1_1tir_1_1LayoutNode.html |    1 +
 .../doxygen/classtvm_1_1tir_1_1Let-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Let.html       |    1 +
 .../classtvm_1_1tir_1_1LetNode-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1LetNode.html   |    1 +
 .../classtvm_1_1tir_1_1LetStmt-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1LetStmt.html   |    1 +
 .../classtvm_1_1tir_1_1LetStmtNode-members.html    |    1 +
 .../doxygen/classtvm_1_1tir_1_1LetStmtNode.html    |    1 +
 .../classtvm_1_1tir_1_1LinkedParam-members.html    |    1 +
 .../doxygen/classtvm_1_1tir_1_1LinkedParam.html    |    1 +
 ...classtvm_1_1tir_1_1LinkedParamNode-members.html |    1 +
 .../classtvm_1_1tir_1_1LinkedParamNode.html        |    1 +
 .../doxygen/classtvm_1_1tir_1_1Load-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Load.html      |    1 +
 .../classtvm_1_1tir_1_1LoadNode-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1LoadNode.html  |    1 +
 .../doxygen/classtvm_1_1tir_1_1LoopRV-members.html |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1LoopRV.html    |    1 +
 .../classtvm_1_1tir_1_1LoopRVNode-members.html     |    1 +
 .../api/doxygen/classtvm_1_1tir_1_1LoopRVNode.html |    1 +
 ...asstvm_1_1tir_1_1MatchBufferRegion-members.html |    1 +
 .../classtvm_1_1tir_1_1MatchBufferRegion.html      |    1 +
 ...vm_1_1tir_1_1MatchBufferRegionNode-members.html |    1 +
 .../classtvm_1_1tir_1_1MatchBufferRegionNode.html  |    1 +
 ..._1tir_1_1MatchBufferRegionNode__coll__graph.svg |   27 +-
 .../doxygen/classtvm_1_1tir_1_1Max-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Max.html       |    1 +
 .../classtvm_1_1tir_1_1MaxNode-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1MaxNode.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1Min-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Min.html       |    1 +
 .../classtvm_1_1tir_1_1MinNode-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1MinNode.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1Mod-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Mod.html       |    1 +
 .../classtvm_1_1tir_1_1ModNode-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1ModNode.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1Mul-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Mul.html       |    1 +
 .../classtvm_1_1tir_1_1MulNode-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1MulNode.html   |    1 +
 .../api/doxygen/classtvm_1_1tir_1_1NE-members.html |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1NE.html        |    1 +
 .../doxygen/classtvm_1_1tir_1_1NENode-members.html |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1NENode.html    |    1 +
 .../doxygen/classtvm_1_1tir_1_1Not-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Not.html       |    1 +
 .../classtvm_1_1tir_1_1NotNode-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1NotNode.html   |    1 +
 .../api/doxygen/classtvm_1_1tir_1_1Or-members.html |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Or.html        |    1 +
 .../doxygen/classtvm_1_1tir_1_1OrNode-members.html |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1OrNode.html    |    1 +
 .../classtvm_1_1tir_1_1Prefetch-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Prefetch.html  |    1 +
 .../classtvm_1_1tir_1_1PrefetchNode-members.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1PrefetchNode.html   |    1 +
 .../classtvm_1_1tir_1_1PrimFunc-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1PrimFunc.html  |    1 +
 .../classtvm_1_1tir_1_1PrimFuncNode-members.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1PrimFuncNode.html   |    1 +
 .../classtvm_1_1tir_1_1ProducerLoad-members.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1ProducerLoad.html   |    1 +
 ...lasstvm_1_1tir_1_1ProducerLoadNode-members.html |    1 +
 .../classtvm_1_1tir_1_1ProducerLoadNode.html       |    1 +
 ...classtvm_1_1tir_1_1ProducerRealize-members.html |    1 +
 .../classtvm_1_1tir_1_1ProducerRealize.html        |    1 +
 ...stvm_1_1tir_1_1ProducerRealizeNode-members.html |    1 +
 .../classtvm_1_1tir_1_1ProducerRealizeNode.html    |    1 +
 .../classtvm_1_1tir_1_1ProducerStore-members.html  |    1 +
 .../doxygen/classtvm_1_1tir_1_1ProducerStore.html  |    1 +
 ...asstvm_1_1tir_1_1ProducerStoreNode-members.html |    1 +
 .../classtvm_1_1tir_1_1ProducerStoreNode.html      |    1 +
 .../doxygen/classtvm_1_1tir_1_1Ramp-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Ramp.html      |    1 +
 .../classtvm_1_1tir_1_1RampNode-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1RampNode.html  |    1 +
 .../doxygen/classtvm_1_1tir_1_1Reduce-members.html |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Reduce.html    |    1 +
 .../classtvm_1_1tir_1_1ReduceNode-members.html     |    1 +
 .../api/doxygen/classtvm_1_1tir_1_1ReduceNode.html |    1 +
 .../classtvm_1_1tir_1_1Schedule-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Schedule.html  |    1 +
 .../classtvm_1_1tir_1_1ScheduleNode-members.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1ScheduleNode.html   |    1 +
 .../classtvm_1_1tir_1_1ScheduleState-members.html  |    1 +
 .../doxygen/classtvm_1_1tir_1_1ScheduleState.html  |    1 +
 ...asstvm_1_1tir_1_1ScheduleStateNode-members.html |    1 +
 .../classtvm_1_1tir_1_1ScheduleStateNode.html      |    1 +
 .../doxygen/classtvm_1_1tir_1_1Select-members.html |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Select.html    |    1 +
 .../classtvm_1_1tir_1_1SelectNode-members.html     |    1 +
 .../api/doxygen/classtvm_1_1tir_1_1SelectNode.html |    1 +
 .../classtvm_1_1tir_1_1SeqStmt-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1SeqStmt.html   |    1 +
 .../classtvm_1_1tir_1_1SeqStmtNode-members.html    |    1 +
 .../doxygen/classtvm_1_1tir_1_1SeqStmtNode.html    |    1 +
 ...tvm_1_1tir_1_1SeqStmt_1_1Flattener-members.html |    1 +
 .../classtvm_1_1tir_1_1SeqStmt_1_1Flattener.html   |    1 +
 .../classtvm_1_1tir_1_1Shuffle-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Shuffle.html   |    1 +
 .../classtvm_1_1tir_1_1ShuffleNode-members.html    |    1 +
 .../doxygen/classtvm_1_1tir_1_1ShuffleNode.html    |    1 +
 .../classtvm_1_1tir_1_1SizeVar-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1SizeVar.html   |    1 +
 .../classtvm_1_1tir_1_1SizeVarNode-members.html    |    1 +
 .../doxygen/classtvm_1_1tir_1_1SizeVarNode.html    |    1 +
 .../doxygen/classtvm_1_1tir_1_1Stmt-members.html   |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Stmt.html      |    1 +
 ...classtvm_1_1tir_1_1StmtExprMutator-members.html |    1 +
 .../classtvm_1_1tir_1_1StmtExprMutator.html        |    1 +
 ...classtvm_1_1tir_1_1StmtExprVisitor-members.html |    1 +
 .../classtvm_1_1tir_1_1StmtExprVisitor.html        |    1 +
 .../doxygen/classtvm_1_1tir_1_1StmtFunctor.html    |    1 +
 ...tmt_01_6n_00_01Args_8_8_8args_08_4-members.html |    1 +
 ...onst_01Stmt_01_6n_00_01Args_8_8_8args_08_4.html |    1 +
 .../classtvm_1_1tir_1_1StmtMutator-members.html    |    1 +
 .../doxygen/classtvm_1_1tir_1_1StmtMutator.html    |    1 +
 .../classtvm_1_1tir_1_1StmtNode-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1StmtNode.html  |    1 +
 .../classtvm_1_1tir_1_1StmtSRef-members.html       |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1StmtSRef.html  |    1 +
 .../classtvm_1_1tir_1_1StmtSRefNode-members.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1StmtSRefNode.html   |    1 +
 .../classtvm_1_1tir_1_1StmtVisitor-members.html    |    1 +
 .../doxygen/classtvm_1_1tir_1_1StmtVisitor.html    |    1 +
 .../classtvm_1_1tir_1_1Stmt__inherit__graph.svg    |   20 +-
 .../doxygen/classtvm_1_1tir_1_1Store-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Store.html     |    1 +
 .../classtvm_1_1tir_1_1StoreNode-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1StoreNode.html |    1 +
 .../classtvm_1_1tir_1_1StringImm-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1StringImm.html |    1 +
 .../classtvm_1_1tir_1_1StringImmNode-members.html  |    1 +
 .../doxygen/classtvm_1_1tir_1_1StringImmNode.html  |    1 +
 .../doxygen/classtvm_1_1tir_1_1Sub-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Sub.html       |    1 +
 .../classtvm_1_1tir_1_1SubNode-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1SubNode.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1Var-members.html    |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1Var.html       |    1 +
 .../classtvm_1_1tir_1_1VarNode-members.html        |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1VarNode.html   |    1 +
 .../doxygen/classtvm_1_1tir_1_1While-members.html  |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1While.html     |    1 +
 .../classtvm_1_1tir_1_1WhileNode-members.html      |    1 +
 docs/api/doxygen/classtvm_1_1tir_1_1WhileNode.html |    1 +
 .../classtvm_1_1transform_1_1Pass-members.html     |    1 +
 .../api/doxygen/classtvm_1_1transform_1_1Pass.html |    1 +
 ...asstvm_1_1transform_1_1PassContext-members.html |    1 +
 .../classtvm_1_1transform_1_1PassContext.html      |    1 +
 ...vm_1_1transform_1_1PassContextNode-members.html |    1 +
 .../classtvm_1_1transform_1_1PassContextNode.html  |    1 +
 .../classtvm_1_1transform_1_1PassInfo-members.html |    1 +
 .../doxygen/classtvm_1_1transform_1_1PassInfo.html |    1 +
 ...sstvm_1_1transform_1_1PassInfoNode-members.html |    1 +
 .../classtvm_1_1transform_1_1PassInfoNode.html     |    1 +
 .../classtvm_1_1transform_1_1PassNode-members.html |    1 +
 .../doxygen/classtvm_1_1transform_1_1PassNode.html |    1 +
 ...lasstvm_1_1transform_1_1Sequential-members.html |    1 +
 .../classtvm_1_1transform_1_1Sequential.html       |    1 +
 docs/api/doxygen/codegen_8h.html                   |    1 +
 docs/api/doxygen/codegen_8h_source.html            |    1 +
 docs/api/doxygen/compute__dag_8h.html              |    1 +
 docs/api/doxygen/compute__dag_8h_source.html       |    1 +
 docs/api/doxygen/constant__utils_8h.html           |    1 +
 docs/api/doxygen/constant__utils_8h_source.html    |    1 +
 docs/api/doxygen/container_8h.html                 |    1 +
 docs/api/doxygen/container_8h_source.html          |    1 +
 docs/api/doxygen/cost__model_8h.html               |    1 +
 docs/api/doxygen/cost__model_8h_source.html        |    1 +
 docs/api/doxygen/crt_2packed__func_8h.html         |    1 +
 docs/api/doxygen/crt_2packed__func_8h_source.html  |    1 +
 docs/api/doxygen/crt_8h.html                       |    1 +
 docs/api/doxygen/crt_8h_source.html                |    1 +
 docs/api/doxygen/cublas_8h.html                    |    1 +
 docs/api/doxygen/cublas_8h_source.html             |    1 +
 docs/api/doxygen/cuda_2dense_8h.html               |    1 +
 docs/api/doxygen/cuda_2dense_8h_source.html        |    1 +
 docs/api/doxygen/cuda_2injective_8h.html           |    1 +
 docs/api/doxygen/cuda_2injective_8h_source.html    |    1 +
 docs/api/doxygen/cuda_2normalization_8h.html       |    1 +
 .../api/doxygen/cuda_2normalization_8h_source.html |    1 +
 docs/api/doxygen/cuda_2pooling_8h.html             |    1 +
 docs/api/doxygen/cuda_2pooling_8h_source.html      |    4 +-
 docs/api/doxygen/cuda_2reduction_8h.html           |    1 +
 docs/api/doxygen/cuda_2reduction_8h_source.html    |    1 +
 docs/api/doxygen/cuda_2softmax_8h.html             |    1 +
 docs/api/doxygen/cuda_2softmax_8h_source.html      |    1 +
 docs/api/doxygen/data__layout_8h.html              |    1 +
 docs/api/doxygen/data__layout_8h_source.html       |    3 +-
 docs/api/doxygen/data__type_8h.html                |    1 +
 docs/api/doxygen/data__type_8h_source.html         |    1 +
 docs/api/doxygen/dataflow__matcher_8h.html         |    1 +
 docs/api/doxygen/dataflow__matcher_8h_source.html  |    1 +
 docs/api/doxygen/dataflow__pattern_8h.html         |    1 +
 docs/api/doxygen/dataflow__pattern_8h_source.html  |    3 +-
 .../api/doxygen/dataflow__pattern__functor_8h.html |    1 +
 .../dataflow__pattern__functor_8h_source.html      |    1 +
 docs/api/doxygen/debug_8h.html                     |    1 +
 docs/api/doxygen/debug_8h_source.html              |    1 +
 docs/api/doxygen/detail_2broadcast_8h.html         |    1 +
 docs/api/doxygen/detail_2broadcast_8h_source.html  |    3 +-
 docs/api/doxygen/detail_2extern_8h.html            |    1 +
 docs/api/doxygen/detail_2extern_8h_source.html     |    7 +-
 docs/api/doxygen/device__api_8h.html               |    1 +
 docs/api/doxygen/device__api_8h_source.html        |    5 +-
 docs/api/doxygen/device__copy_8h.html              |    1 +
 docs/api/doxygen/device__copy_8h_source.html       |    1 +
 docs/api/doxygen/diagnostic_8h.html                |    1 +
 docs/api/doxygen/diagnostic_8h_source.html         |    1 +
 docs/api/doxygen/dilate_8h.html                    |    1 +
 docs/api/doxygen/dilate_8h_source.html             |    1 +
 docs/api/doxygen/dir_000002_000006.html            |    1 +
 docs/api/doxygen/dir_000002_000008.html            |    1 +
 docs/api/doxygen/dir_000002_000009.html            |    1 +
 docs/api/doxygen/dir_000002_000019.html            |    1 +
 docs/api/doxygen/dir_000003_000009.html            |    1 +
 docs/api/doxygen/dir_000003_000013.html            |    1 +
 docs/api/doxygen/dir_000003_000020.html            |    1 +
 docs/api/doxygen/dir_000003_000021.html            |    1 +
 docs/api/doxygen/dir_000004_000006.html            |    1 +
 docs/api/doxygen/dir_000004_000008.html            |    1 +
 docs/api/doxygen/dir_000004_000009.html            |    1 +
 docs/api/doxygen/dir_000004_000011.html            |    1 +
 docs/api/doxygen/dir_000004_000013.html            |    1 +
 docs/api/doxygen/dir_000004_000020.html            |    1 +
 docs/api/doxygen/dir_000004_000021.html            |    1 +
 docs/api/doxygen/dir_000005_000006.html            |    1 +
 docs/api/doxygen/dir_000005_000009.html            |    1 +
 docs/api/doxygen/dir_000005_000019.html            |    1 +
 docs/api/doxygen/dir_000005_000020.html            |    1 +
 docs/api/doxygen/dir_000005_000021.html            |    1 +
 docs/api/doxygen/dir_000006_000004.html            |    1 +
 docs/api/doxygen/dir_000006_000009.html            |    1 +
 docs/api/doxygen/dir_000006_000013.html            |    1 +
 docs/api/doxygen/dir_000006_000014.html            |    1 +
 docs/api/doxygen/dir_000006_000019.html            |    1 +
 docs/api/doxygen/dir_000007_000006.html            |    1 +
 docs/api/doxygen/dir_000007_000009.html            |    1 +
 docs/api/doxygen/dir_000008_000006.html            |    1 +
 docs/api/doxygen/dir_000008_000009.html            |    1 +
 docs/api/doxygen/dir_000008_000013.html            |    1 +
 docs/api/doxygen/dir_000011_000006.html            |    1 +
 docs/api/doxygen/dir_000011_000009.html            |    1 +
 docs/api/doxygen/dir_000012_000002.html            |    1 +
 docs/api/doxygen/dir_000012_000006.html            |    1 +
 docs/api/doxygen/dir_000012_000008.html            |    1 +
 docs/api/doxygen/dir_000012_000009.html            |    1 +
 docs/api/doxygen/dir_000012_000020.html            |    1 +
 docs/api/doxygen/dir_000012_000021.html            |    1 +
 docs/api/doxygen/dir_000013_000009.html            |    1 +
 docs/api/doxygen/dir_000014_000006.html            |    1 +
 docs/api/doxygen/dir_000014_000009.html            |    1 +
 docs/api/doxygen/dir_000019_000009.html            |    1 +
 docs/api/doxygen/dir_000020_000006.html            |    1 +
 docs/api/doxygen/dir_000020_000008.html            |    1 +
 docs/api/doxygen/dir_000020_000009.html            |    1 +
 docs/api/doxygen/dir_000020_000013.html            |    1 +
 docs/api/doxygen/dir_000020_000019.html            |    1 +
 docs/api/doxygen/dir_000021_000002.html            |    1 +
 docs/api/doxygen/dir_000021_000008.html            |    1 +
 docs/api/doxygen/dir_000021_000009.html            |    1 +
 docs/api/doxygen/dir_000021_000019.html            |    1 +
 docs/api/doxygen/dir_000022_000006.html            |    1 +
 docs/api/doxygen/dir_000023_000002.html            |    1 +
 docs/api/doxygen/dir_000023_000008.html            |    1 +
 docs/api/doxygen/dir_000023_000021.html            |    1 +
 docs/api/doxygen/dir_000024_000021.html            |    1 +
 docs/api/doxygen/dir_000024_000023.html            |    1 +
 docs/api/doxygen/dir_000025_000020.html            |    1 +
 docs/api/doxygen/dir_000025_000021.html            |    1 +
 docs/api/doxygen/dir_000025_000023.html            |    1 +
 docs/api/doxygen/dir_000025_000024.html            |    1 +
 docs/api/doxygen/dir_000025_000026.html            |    1 +
 docs/api/doxygen/dir_000025_000028.html            |    1 +
 docs/api/doxygen/dir_000026_000002.html            |    1 +
 docs/api/doxygen/dir_000026_000021.html            |    1 +
 docs/api/doxygen/dir_000026_000023.html            |    1 +
 docs/api/doxygen/dir_000027_000020.html            |    1 +
 docs/api/doxygen/dir_000027_000021.html            |    1 +
 docs/api/doxygen/dir_000027_000023.html            |    1 +
 docs/api/doxygen/dir_000027_000024.html            |    1 +
 docs/api/doxygen/dir_000027_000025.html            |    1 +
 docs/api/doxygen/dir_000027_000026.html            |    1 +
 docs/api/doxygen/dir_000027_000028.html            |    1 +
 docs/api/doxygen/dir_000028_000020.html            |    1 +
 docs/api/doxygen/dir_000028_000021.html            |    1 +
 docs/api/doxygen/dir_000028_000023.html            |    1 +
 docs/api/doxygen/dir_000029_000020.html            |    1 +
 docs/api/doxygen/dir_000029_000021.html            |    1 +
 docs/api/doxygen/dir_000029_000023.html            |    1 +
 docs/api/doxygen/dir_000030_000021.html            |    1 +
 docs/api/doxygen/dir_000030_000023.html            |    1 +
 .../dir_006b1f4ac353a18abb55f74cc4796db6.html      |    1 +
 .../dir_02be2c9d68e402f80df60bd528724ee5.html      |    1 +
 .../dir_0a768efda19551002dc8866436c9ffae.html      |    1 +
 .../dir_194ecda214f05a38134392ac6a69b970.html      |    1 +
 .../dir_1f1b12d204a071c9e67e47fcbb552b86.html      |    1 +
 .../dir_2b0ef9f1c86b565a92e96353e1195b2c.html      |    1 +
 .../dir_3a038e7bfa2370c6aee2a5aecd5d3ef1.html      |    1 +
 .../dir_404558507ed35459f0d7a6d81d8c508d.html      |    1 +
 .../dir_519be2d4a83a987dbf989f1de527b870.html      |    1 +
 .../dir_54983dd6d74c59f67ee9e8e5a50aafc4.html      |    1 +
 .../dir_58d8b9f286105d83d91913f543711c4c.html      |    1 +
 .../dir_5baffeed82c1190bfdf7a4f918ab5ac6.html      |    1 +
 .../dir_5da96592f3a7c442b838b075c58254c2.html      |    1 +
 .../dir_63946bee875c6d52bce55e72a67a86ad.html      |    1 +
 .../dir_6cd4295f6ad5aa17e5b568d0e5b190e5.html      |    1 +
 .../dir_72c2f11201cd7636dc7624de0754daa5.html      |    1 +
 .../dir_8395ded0a3205c0748976a0d4487d38d.html      |    1 +
 .../dir_8e4e25e66b8623d88c5b5dd2040bca97.html      |    1 +
 .../dir_a2900df4deca8dd2bcded616f0fe650a.html      |    1 +
 .../dir_a98464176f1216e334ac3bbacd433085.html      |    1 +
 .../dir_ac57496531ccbad72f774fa62e6de987.html      |    1 +
 .../dir_b4c7d8e826c599ba55146c099a14beb5.html      |    1 +
 .../dir_d331277d4303e21ded95616eb56c1a9e.html      |    1 +
 .../dir_d3953cf7eb33eca56fc6850c0e98447d.html      |    1 +
 .../dir_d44c64559bbebec7f509842c48db8b23.html      |    1 +
 .../dir_d523279167051dc3aad9a40981221f4d.html      |    1 +
 .../dir_dc867ff9a37cad1764f1670dc7eba6c1.html      |    1 +
 .../dir_f13f0b82f2bd345d2d28ad76dc90e0ea.html      |    1 +
 .../dir_f97d855a3173728370e632aa77170e34.html      |    1 +
 .../dir_fafc18f54a755f417c55c769623cbfef.html      |    1 +
 .../dir_fb1b1bc11a768ab8cf63a96a73170118.html      |    1 +
 docs/api/doxygen/driver__api_8h.html               |    1 +
 docs/api/doxygen/driver__api_8h_source.html        |    1 +
 docs/api/doxygen/einsum_8h.html                    |    1 +
 docs/api/doxygen/einsum_8h_source.html             |    3 +-
 docs/api/doxygen/elemwise_8h.html                  |    1 +
 docs/api/doxygen/elemwise_8h_source.html           |    3 +-
 docs/api/doxygen/env__func_8h.html                 |    1 +
 docs/api/doxygen/env__func_8h_source.html          |    1 +
 docs/api/doxygen/error_8h.html                     |    1 +
 docs/api/doxygen/error_8h_source.html              |    1 +
 docs/api/doxygen/error__codes_8h.html              |    1 +
 docs/api/doxygen/error__codes_8h_source.html       |    1 +
 docs/api/doxygen/{index.html => examples.html}     |   11 +-
 docs/api/doxygen/executable_8h.html                |    1 +
 docs/api/doxygen/executable_8h_source.html         |    1 +
 docs/api/doxygen/files.html                        |    1 +
 docs/api/doxygen/flatten_8h.html                   |    1 +
 docs/api/doxygen/flatten_8h_source.html            |    1 +
 docs/api/doxygen/frame__buffer_8h.html             |    1 +
 docs/api/doxygen/frame__buffer_8h_source.html      |    1 +
 docs/api/doxygen/framing_8h.html                   |    1 +
 docs/api/doxygen/framing_8h_source.html            |    1 +
 docs/api/doxygen/func__registry_8h.html            |    1 +
 docs/api/doxygen/func__registry_8h_source.html     |    1 +
 docs/api/doxygen/functions.html                    |    1 +
 docs/api/doxygen/functions_0x7e.html               |    1 +
 docs/api/doxygen/functions_a.html                  |    1 +
 docs/api/doxygen/functions_b.html                  |    8 +-
 docs/api/doxygen/functions_c.html                  |    1 +
 docs/api/doxygen/functions_d.html                  |    9 +-
 docs/api/doxygen/functions_e.html                  |    1 +
 docs/api/doxygen/functions_enum.html               |    1 +
 docs/api/doxygen/functions_eval.html               |    1 +
 docs/api/doxygen/functions_f.html                  |    8 +-
 docs/api/doxygen/functions_func.html               |    1 +
 docs/api/doxygen/functions_func_0x7e.html          |    1 +
 docs/api/doxygen/functions_func_a.html             |    1 +
 docs/api/doxygen/functions_func_b.html             |    1 +
 docs/api/doxygen/functions_func_c.html             |    1 +
 docs/api/doxygen/functions_func_d.html             |    1 +
 docs/api/doxygen/functions_func_e.html             |    1 +
 docs/api/doxygen/functions_func_f.html             |    4 +
 docs/api/doxygen/functions_func_g.html             |    1 +
 docs/api/doxygen/functions_func_h.html             |    1 +
 docs/api/doxygen/functions_func_i.html             |    1 +
 docs/api/doxygen/functions_func_j.html             |    1 +
 docs/api/doxygen/functions_func_l.html             |    1 +
 docs/api/doxygen/functions_func_m.html             |    1 +
 docs/api/doxygen/functions_func_n.html             |    1 +
 docs/api/doxygen/functions_func_o.html             |    1 +
 docs/api/doxygen/functions_func_p.html             |    1 +
 docs/api/doxygen/functions_func_r.html             |    1 +
 docs/api/doxygen/functions_func_s.html             |    7 +-
 docs/api/doxygen/functions_func_t.html             |   14 +-
 docs/api/doxygen/functions_func_u.html             |    1 +
 docs/api/doxygen/functions_func_v.html             |    1 +
 docs/api/doxygen/functions_func_w.html             |    1 +
 docs/api/doxygen/functions_g.html                  |    1 +
 docs/api/doxygen/functions_h.html                  |    1 +
 docs/api/doxygen/functions_i.html                  |    1 +
 docs/api/doxygen/functions_j.html                  |    1 +
 docs/api/doxygen/functions_k.html                  |    1 +
 docs/api/doxygen/functions_l.html                  |    1 +
 docs/api/doxygen/functions_m.html                  |    4 +-
 docs/api/doxygen/functions_n.html                  |    1 +
 docs/api/doxygen/functions_o.html                  |   14 +-
 docs/api/doxygen/functions_p.html                  |    1 +
 docs/api/doxygen/functions_r.html                  |    5 +-
 docs/api/doxygen/functions_rela.html               |    1 +
 docs/api/doxygen/functions_s.html                  |    5 +-
 docs/api/doxygen/functions_t.html                  |   14 +-
 docs/api/doxygen/functions_type.html               |    1 +
 docs/api/doxygen/functions_u.html                  |    3 +-
 docs/api/doxygen/functions_v.html                  |    1 +
 docs/api/doxygen/functions_vars.html               |    1 +
 docs/api/doxygen/functions_vars_a.html             |    1 +
 docs/api/doxygen/functions_vars_b.html             |    4 +
 docs/api/doxygen/functions_vars_c.html             |    1 +
 docs/api/doxygen/functions_vars_d.html             |    9 +-
 docs/api/doxygen/functions_vars_e.html             |    1 +
 docs/api/doxygen/functions_vars_f.html             |    1 +
 docs/api/doxygen/functions_vars_g.html             |    1 +
 docs/api/doxygen/functions_vars_h.html             |    1 +
 docs/api/doxygen/functions_vars_i.html             |    1 +
 docs/api/doxygen/functions_vars_k.html             |    1 +
 docs/api/doxygen/functions_vars_l.html             |    1 +
 docs/api/doxygen/functions_vars_m.html             |    2 +
 docs/api/doxygen/functions_vars_n.html             |    1 +
 docs/api/doxygen/functions_vars_o.html             |    4 +-
 docs/api/doxygen/functions_vars_p.html             |    1 +
 docs/api/doxygen/functions_vars_r.html             |    1 +
 docs/api/doxygen/functions_vars_s.html             |    1 +
 docs/api/doxygen/functions_vars_t.html             |    1 +
 docs/api/doxygen/functions_vars_u.html             |    1 +
 docs/api/doxygen/functions_vars_v.html             |    1 +
 docs/api/doxygen/functions_vars_w.html             |    1 +
 docs/api/doxygen/functions_w.html                  |    1 +
 docs/api/doxygen/functor_8h.html                   |    1 +
 docs/api/doxygen/functor_8h_source.html            |    1 +
 docs/api/doxygen/fuse_8h.html                      |    1 +
 docs/api/doxygen/fuse_8h_source.html               |    1 +
 docs/api/doxygen/generic_2default_8h.html          |    1 +
 docs/api/doxygen/generic_2default_8h_source.html   |    1 +
 docs/api/doxygen/generic_2extern_8h.html           |    1 +
 docs/api/doxygen/generic_2extern_8h_source.html    |    1 +
 docs/api/doxygen/generic_2injective_8h.html        |    1 +
 docs/api/doxygen/generic_2injective_8h_source.html |    1 +
 docs/api/doxygen/generic__func_8h.html             |    1 +
 docs/api/doxygen/generic__func_8h_source.html      |    1 +
 docs/api/doxygen/globals.html                      |    1 +
 docs/api/doxygen/globals_defs.html                 |    1 +
 docs/api/doxygen/globals_e.html                    |    1 +
 docs/api/doxygen/globals_enum.html                 |    1 +
 docs/api/doxygen/globals_eval.html                 |    1 +
 docs/api/doxygen/globals_f.html                    |    1 +
 docs/api/doxygen/globals_func.html                 |    1 +
 docs/api/doxygen/globals_g.html                    |    1 +
 docs/api/doxygen/globals_i.html                    |    1 +
 docs/api/doxygen/globals_k.html                    |    1 +
 docs/api/doxygen/globals_l.html                    |    1 +
 docs/api/doxygen/globals_m.html                    |    1 +
 docs/api/doxygen/globals_n.html                    |    1 +
 docs/api/doxygen/globals_p.html                    |    1 +
 docs/api/doxygen/globals_r.html                    |    1 +
 docs/api/doxygen/globals_s.html                    |    1 +
 docs/api/doxygen/globals_t.html                    |    1 +
 docs/api/doxygen/globals_type.html                 |    1 +
 docs/api/doxygen/globals_u.html                    |    1 +
 docs/api/doxygen/globals_v.html                    |    1 +
 docs/api/doxygen/globals_vars.html                 |    1 +
 docs/api/doxygen/graph__executor_8h.html           |    1 +
 docs/api/doxygen/graph__executor_8h_source.html    |    1 +
 docs/api/doxygen/graph__executor__module_8h.html   |    1 +
 .../doxygen/graph__executor__module_8h_source.html |    1 +
 docs/api/doxygen/graph_legend.html                 |    1 +
 docs/api/doxygen/hierarchy.html                    |    1 +
 docs/api/doxygen/image_8h.html                     |    1 +
 docs/api/doxygen/image_8h_source.html              |    1 +
 docs/api/doxygen/index.html                        |    1 +
 docs/api/doxygen/inherits.html                     |    1 +
 docs/api/doxygen/int__set_8h.html                  |    1 +
 docs/api/doxygen/int__set_8h_source.html           |    1 +
 docs/api/doxygen/int__solver_8h.html               |    1 +
 docs/api/doxygen/int__solver_8h_source.html        |    1 +
 docs/api/doxygen/interpreter_8h.html               |    1 +
 docs/api/doxygen/interpreter_8h_source.html        |    1 +
 docs/api/doxygen/ir_2adt_8h.html                   |    1 +
 docs/api/doxygen/ir_2adt_8h_source.html            |    1 +
 docs/api/doxygen/ir_2attrs_8h.html                 |    1 +
 docs/api/doxygen/ir_2attrs_8h_source.html          |    1 +
 docs/api/doxygen/ir_2expr_8h.html                  |    1 +
 docs/api/doxygen/ir_2expr_8h_source.html           |    1 +
 docs/api/doxygen/ir_2function_8h.html              |    1 +
 docs/api/doxygen/ir_2function_8h_source.html       |    1 +
 docs/api/doxygen/ir_2module_8h.html                |    1 +
 docs/api/doxygen/ir_2module_8h_source.html         |    1 +
 docs/api/doxygen/ir_2op_8h.html                    |    1 +
 docs/api/doxygen/ir_2op_8h_source.html             |    1 +
 docs/api/doxygen/ir_2transform_8h.html             |    1 +
 docs/api/doxygen/ir_2transform_8h_source.html      |    1 +
 docs/api/doxygen/ir_2type_8h.html                  |    1 +
 docs/api/doxygen/ir_2type_8h_source.html           |    1 +
 docs/api/doxygen/iter__affine__map_8h.html         |    1 +
 docs/api/doxygen/iter__affine__map_8h_source.html  |    1 +
 docs/api/doxygen/local__response__norm_8h.html     |    1 +
 .../doxygen/local__response__norm_8h_source.html   |    1 +
 docs/api/doxygen/loop__state_8h.html               |    1 +
 docs/api/doxygen/loop__state_8h_source.html        |    1 +
 docs/api/doxygen/mapping_8h.html                   |    1 +
 docs/api/doxygen/mapping_8h_source.html            |    1 +
 docs/api/doxygen/measure_8h.html                   |    1 +
 docs/api/doxygen/measure_8h_source.html            |    3 +-
 docs/api/doxygen/measure__record_8h.html           |    1 +
 docs/api/doxygen/measure__record_8h_source.html    |    1 +
 docs/api/doxygen/memory__manager_8h.html           |    1 +
 docs/api/doxygen/memory__manager_8h_source.html    |    3 +-
 docs/api/doxygen/namespacedmlc.html                |    1 +
 docs/api/doxygen/namespacedmlc_1_1serializer.html  |    1 +
 docs/api/doxygen/namespacellvm.html                |    1 +
 docs/api/doxygen/namespacemembers.html             |    1 +
 docs/api/doxygen/namespacemembers_b.html           |    1 +
 docs/api/doxygen/namespacemembers_c.html           |    9 +-
 docs/api/doxygen/namespacemembers_d.html           |    1 +
 docs/api/doxygen/namespacemembers_e.html           |    1 +
 docs/api/doxygen/namespacemembers_enum.html        |    1 +
 docs/api/doxygen/namespacemembers_eval.html        |    1 +
 docs/api/doxygen/namespacemembers_f.html           |    8 +-
 docs/api/doxygen/namespacemembers_func.html        |    1 +
 docs/api/doxygen/namespacemembers_func_b.html      |    1 +
 docs/api/doxygen/namespacemembers_func_c.html      |    9 +-
 docs/api/doxygen/namespacemembers_func_d.html      |    1 +
 docs/api/doxygen/namespacemembers_func_e.html      |    1 +
 docs/api/doxygen/namespacemembers_func_f.html      |   10 +-
 docs/api/doxygen/namespacemembers_func_g.html      |    1 +
 docs/api/doxygen/namespacemembers_func_h.html      |    1 +
 docs/api/doxygen/namespacemembers_func_i.html      |    1 +
 docs/api/doxygen/namespacemembers_func_j.html      |    1 +
 docs/api/doxygen/namespacemembers_func_k.html      |    1 +
 docs/api/doxygen/namespacemembers_func_l.html      |    1 +
 docs/api/doxygen/namespacemembers_func_m.html      |    1 +
 docs/api/doxygen/namespacemembers_func_n.html      |    1 +
 docs/api/doxygen/namespacemembers_func_o.html      |    1 +
 docs/api/doxygen/namespacemembers_func_p.html      |   16 +-
 docs/api/doxygen/namespacemembers_func_q.html      |    1 +
 docs/api/doxygen/namespacemembers_func_r.html      |    1 +
 docs/api/doxygen/namespacemembers_func_s.html      |    7 +-
 docs/api/doxygen/namespacemembers_func_t.html      |    3 +-
 docs/api/doxygen/namespacemembers_func_u.html      |    1 +
 docs/api/doxygen/namespacemembers_func_v.html      |    1 +
 docs/api/doxygen/namespacemembers_func_w.html      |    1 +
 docs/api/doxygen/namespacemembers_func_y.html      |    1 +
 docs/api/doxygen/namespacemembers_g.html           |    1 +
 docs/api/doxygen/namespacemembers_h.html           |    1 +
 docs/api/doxygen/namespacemembers_i.html           |    1 +
 docs/api/doxygen/namespacemembers_j.html           |    1 +
 docs/api/doxygen/namespacemembers_k.html           |    6 +-
 docs/api/doxygen/namespacemembers_l.html           |    1 +
 docs/api/doxygen/namespacemembers_m.html           |    1 +
 docs/api/doxygen/namespacemembers_n.html           |    1 +
 docs/api/doxygen/namespacemembers_o.html           |    1 +
 docs/api/doxygen/namespacemembers_p.html           |   16 +-
 docs/api/doxygen/namespacemembers_q.html           |    1 +
 docs/api/doxygen/namespacemembers_r.html           |    1 +
 docs/api/doxygen/namespacemembers_s.html           |    7 +-
 docs/api/doxygen/namespacemembers_t.html           |    6 +-
 docs/api/doxygen/namespacemembers_type.html        |    4 +
 docs/api/doxygen/namespacemembers_u.html           |    1 +
 docs/api/doxygen/namespacemembers_v.html           |    1 +
 docs/api/doxygen/namespacemembers_vars.html        |    4 +
 docs/api/doxygen/namespacemembers_w.html           |    1 +
 docs/api/doxygen/namespacemembers_y.html           |    1 +
 docs/api/doxygen/namespaces.html                   |    1 +
 docs/api/doxygen/namespacetvm.html                 |    1 +
 docs/api/doxygen/namespacetvm_1_1arith.html        |    1 +
 docs/api/doxygen/namespacetvm_1_1attr.html         |    1 +
 .../doxygen/namespacetvm_1_1auto__scheduler.html   |    1 +
 docs/api/doxygen/namespacetvm_1_1codegen.html      |    1 +
 docs/api/doxygen/namespacetvm_1_1detail.html       |    1 +
 docs/api/doxygen/namespacetvm_1_1parser.html       |    1 +
 docs/api/doxygen/namespacetvm_1_1relay.html        |   18 +
 .../api/doxygen/namespacetvm_1_1relay_1_1attr.html |   18 +
 docs/api/doxygen/namespacetvm_1_1relay_1_1qnn.html |    1 +
 .../namespacetvm_1_1relay_1_1qnn_1_1transform.html |    1 +
 .../namespacetvm_1_1relay_1_1transform.html        |   22 +-
 docs/api/doxygen/namespacetvm_1_1runtime.html      |    1 +
 .../namespacetvm_1_1runtime_1_1micro__rpc.html     |    1 +
 .../namespacetvm_1_1runtime_1_1profiling.html      |    1 +
 .../doxygen/namespacetvm_1_1runtime_1_1symbol.html |    1 +
 .../namespacetvm_1_1runtime_1_1threading.html      |    1 +
 .../api/doxygen/namespacetvm_1_1runtime_1_1vm.html |    1 +
 docs/api/doxygen/namespacetvm_1_1support.html      |    1 +
 docs/api/doxygen/namespacetvm_1_1te.html           |    1 +
 docs/api/doxygen/namespacetvm_1_1tir.html          |    1 +
 docs/api/doxygen/namespacetvm_1_1tir_1_1attr.html  |    4 +-
 .../doxygen/namespacetvm_1_1tir_1_1builtin.html    |    9 +-
 .../doxygen/namespacetvm_1_1tir_1_1transform.html  |  167 ++-
 docs/api/doxygen/namespacetvm_1_1topi.html         |   31 +-
 .../doxygen/namespacetvm_1_1topi_1_1contrib.html   |    1 +
 docs/api/doxygen/namespacetvm_1_1topi_1_1cuda.html |    1 +
 .../doxygen/namespacetvm_1_1topi_1_1generic.html   |    1 +
 docs/api/doxygen/namespacetvm_1_1topi_1_1nn.html   |  177 +--
 docs/api/doxygen/namespacetvm_1_1topi_1_1rocm.html |    1 +
 .../doxygen/namespacetvm_1_1topi_1_1vision.html    |    1 +
 docs/api/doxygen/namespacetvm_1_1topi_1_1x86.html  |    1 +
 docs/api/doxygen/namespacetvm_1_1transform.html    |    1 +
 docs/api/doxygen/ndarray_8h.html                   |    1 +
 docs/api/doxygen/ndarray_8h_source.html            |    3 +-
 docs/api/doxygen/nn_2bnn_8h.html                   |    1 +
 docs/api/doxygen/nn_2bnn_8h_source.html            |    3 +-
 docs/api/doxygen/nn_2dense_8h.html                 |    1 +
 docs/api/doxygen/nn_2dense_8h_source.html          |    3 +-
 docs/api/doxygen/nn_2pooling_8h.html               |   28 +-
 docs/api/doxygen/nn_2pooling_8h_source.html        |   36 +-
 docs/api/doxygen/nn_2softmax_8h.html               |    1 +
 docs/api/doxygen/nn_2softmax_8h_source.html        |    1 +
 docs/api/doxygen/node_8h.html                      |    1 +
 docs/api/doxygen/node_8h_source.html               |    1 +
 docs/api/doxygen/object_8h.html                    |    1 +
 docs/api/doxygen/object_8h_source.html             |    1 +
 docs/api/doxygen/op__strategy_8h.html              |    1 +
 docs/api/doxygen/op__strategy_8h_source.html       |    1 +
 docs/api/doxygen/operation_8h.html                 |    1 +
 docs/api/doxygen/operation_8h_source.html          |    3 +-
 docs/api/doxygen/packed__func_8h.html              |    1 +
 docs/api/doxygen/packed__func_8h_source.html       |    1 +
 docs/api/doxygen/pad__utils_8h.html                |    1 +
 docs/api/doxygen/pad__utils_8h_source.html         |    1 +
 docs/api/doxygen/parallel__for_8h.html             |    1 +
 docs/api/doxygen/parallel__for_8h_source.html      |    1 +
 docs/api/doxygen/parser_8h.html                    |    1 +
 docs/api/doxygen/parser_8h_source.html             |    1 +
 docs/api/doxygen/pattern_8h.html                   |    1 +
 docs/api/doxygen/pattern_8h_source.html            |    1 +
 docs/api/doxygen/pattern__functor_8h.html          |    1 +
 docs/api/doxygen/pattern__functor_8h_source.html   |    1 +
 docs/api/doxygen/platform_8h.html                  |    1 +
 docs/api/doxygen/platform_8h_source.html           |    1 +
 docs/api/doxygen/profiling_8h.html                 |    1 +
 docs/api/doxygen/profiling_8h_source.html          |    1 +
 docs/api/doxygen/random_8h.html                    |    1 +
 docs/api/doxygen/random_8h_source.html             |    1 +
 docs/api/doxygen/ravel__unravel_8h.html            |    1 +
 docs/api/doxygen/ravel__unravel_8h_source.html     |    3 +-
 docs/api/doxygen/reduce_8h.html                    |    1 +
 docs/api/doxygen/reduce_8h_source.html             |    1 +
 docs/api/doxygen/reduction_8h.html                 |    1 +
 docs/api/doxygen/reduction_8h_source.html          |    5 +-
 docs/api/doxygen/reflection_8h.html                |    1 +
 docs/api/doxygen/reflection_8h_source.html         |    1 +
 docs/api/doxygen/registry_8h.html                  |    1 +
 docs/api/doxygen/registry_8h_source.html           |    1 +
 docs/api/doxygen/relay_2adt_8h.html                |    1 +
 docs/api/doxygen/relay_2adt_8h_source.html         |    1 +
 docs/api/doxygen/relay_2analysis_8h.html           |    1 +
 docs/api/doxygen/relay_2analysis_8h_source.html    |    1 +
 docs/api/doxygen/relay_2attrs_2memory_8h.html      |    1 +
 .../doxygen/relay_2attrs_2memory_8h_source.html    |    1 +
 docs/api/doxygen/relay_2attrs_2nn_8h.html          |    1 +
 docs/api/doxygen/relay_2attrs_2nn_8h_source.html   |  363 +++---
 docs/api/doxygen/relay_2attrs_2transform_8h.html   |    1 +
 .../doxygen/relay_2attrs_2transform_8h_source.html |  207 ++--
 docs/api/doxygen/relay_2attrs_2vm_8h.html          |    1 +
 docs/api/doxygen/relay_2attrs_2vm_8h_source.html   |    1 +
 docs/api/doxygen/relay_2expr_8h.html               |    1 +
 docs/api/doxygen/relay_2expr_8h_source.html        |    1 +
 docs/api/doxygen/relay_2expr__functor_8h.html      |    1 +
 .../doxygen/relay_2expr__functor_8h_source.html    |    1 +
 docs/api/doxygen/relay_2feature_8h.html            |    1 +
 docs/api/doxygen/relay_2feature_8h_source.html     |    1 +
 docs/api/doxygen/relay_2function_8h.html           |    4 +
 docs/api/doxygen/relay_2function_8h_source.html    |    4 +-
 docs/api/doxygen/relay_2op_8h.html                 |    1 +
 docs/api/doxygen/relay_2op_8h_source.html          |    1 +
 docs/api/doxygen/relay_2op__attr__types_8h.html    |    4 +
 .../doxygen/relay_2op__attr__types_8h_source.html  |   12 +-
 docs/api/doxygen/relay_2qnn_2attrs_8h.html         |    1 +
 docs/api/doxygen/relay_2qnn_2attrs_8h_source.html  |    1 +
 docs/api/doxygen/relay_2qnn_2transform_8h.html     |    1 +
 .../doxygen/relay_2qnn_2transform_8h_source.html   |    1 +
 docs/api/doxygen/relay_2transform_8h.html          |    1 +
 docs/api/doxygen/relay_2transform_8h_source.html   |    1 +
 docs/api/doxygen/relay_2type_8h.html               |    1 +
 docs/api/doxygen/relay_2type_8h_source.html        |    5 +-
 docs/api/doxygen/reorg_8h.html                     |    1 +
 docs/api/doxygen/reorg_8h_source.html              |    1 +
 docs/api/doxygen/repr__printer_8h.html             |    1 +
 docs/api/doxygen/repr__printer_8h_source.html      |    1 +
 docs/api/doxygen/rocblas_8h.html                   |    1 +
 docs/api/doxygen/rocblas_8h_source.html            |    1 +
 docs/api/doxygen/rocm_2dense_8h.html               |    1 +
 docs/api/doxygen/rocm_2dense_8h_source.html        |    1 +
 docs/api/doxygen/rocm_2injective_8h.html           |    1 +
 docs/api/doxygen/rocm_2injective_8h_source.html    |    1 +
 docs/api/doxygen/rocm_2normalization_8h.html       |    1 +
 .../api/doxygen/rocm_2normalization_8h_source.html |    1 +
 docs/api/doxygen/rocm_2pooling_8h.html             |    1 +
 docs/api/doxygen/rocm_2pooling_8h_source.html      |    1 +
 docs/api/doxygen/rocm_2reduction_8h.html           |    1 +
 docs/api/doxygen/rocm_2reduction_8h_source.html    |    1 +
 docs/api/doxygen/rocm_2softmax_8h.html             |    1 +
 docs/api/doxygen/rocm_2softmax_8h_source.html      |    1 +
 docs/api/doxygen/runtime_2crt_2memory_8h.html      |    1 +
 .../doxygen/runtime_2crt_2memory_8h_source.html    |    1 +
 docs/api/doxygen/runtime_2crt_2module_8h.html      |    1 +
 .../doxygen/runtime_2crt_2module_8h_source.html    |    1 +
 docs/api/doxygen/runtime_2memory_8h.html           |    1 +
 docs/api/doxygen/runtime_2memory_8h_source.html    |    1 +
 docs/api/doxygen/runtime_2module_8h.html           |    1 +
 docs/api/doxygen/runtime_2module_8h_source.html    |    1 +
 docs/api/doxygen/runtime_2vm_2vm_8h.html           |    1 +
 docs/api/doxygen/runtime_2vm_2vm_8h_source.html    |    3 +-
 docs/api/doxygen/schedule__pass_8h.html            |    1 +
 docs/api/doxygen/schedule__pass_8h_source.html     |    1 +
 docs/api/doxygen/search/all_10.js                  |   17 +-
 docs/api/doxygen/search/all_12.js                  |    2 +-
 docs/api/doxygen/search/all_13.js                  |    8 +-
 docs/api/doxygen/search/all_14.js                  |   25 +-
 docs/api/doxygen/search/all_15.js                  |    2 +-
 docs/api/doxygen/search/all_2.js                   |   13 +-
 docs/api/doxygen/search/all_3.js                   |   12 +-
 docs/api/doxygen/search/all_4.js                   |    2 +-
 docs/api/doxygen/search/all_6.js                   |   18 +-
 docs/api/doxygen/search/all_b.js                   |    1 +
 docs/api/doxygen/search/all_d.js                   |    4 +-
 docs/api/doxygen/search/all_f.js                   |    2 +-
 docs/api/doxygen/search/functions_10.js            |    9 +-
 docs/api/doxygen/search/functions_13.js            |    4 +-
 docs/api/doxygen/search/functions_14.js            |    4 +-
 docs/api/doxygen/search/functions_3.js             |    2 +
 docs/api/doxygen/search/functions_6.js             |    2 +
 docs/api/doxygen/search/functions_d.js             |    2 +-
 docs/api/doxygen/search/typedefs_e.js              |    1 +
 docs/api/doxygen/search/variables_2.js             |    1 +
 docs/api/doxygen/search/variables_4.js             |    2 +-
 docs/api/doxygen/search/variables_a.js             |    1 +
 docs/api/doxygen/search/variables_c.js             |    2 +-
 docs/api/doxygen/search/variables_e.js             |    2 +-
 docs/api/doxygen/search__policy_8h.html            |    1 +
 docs/api/doxygen/search__policy_8h_source.html     |    1 +
 docs/api/doxygen/search__task_8h.html              |    1 +
 docs/api/doxygen/search__task_8h_source.html       |    1 +
 docs/api/doxygen/serialization_8h.html             |    1 +
 docs/api/doxygen/serialization_8h_source.html      |    1 +
 docs/api/doxygen/serializer_8h.html                |    1 +
 docs/api/doxygen/serializer_8h_source.html         |    3 +-
 docs/api/doxygen/session_8h.html                   |    1 +
 docs/api/doxygen/session_8h_source.html            |    1 +
 docs/api/doxygen/source__map_8h.html               |    1 +
 docs/api/doxygen/source__map_8h_source.html        |    1 +
 docs/api/doxygen/span_8h.html                      |    1 +
 docs/api/doxygen/span_8h_source.html               |    1 +
 docs/api/doxygen/state_8h.html                     |    1 +
 docs/api/doxygen/state_8h_source.html              |    1 +
 docs/api/doxygen/stmt_8h.html                      |    1 +
 docs/api/doxygen/stmt_8h_source.html               |  343 +++---
 docs/api/doxygen/stmt__functor_8h.html             |    1 +
 docs/api/doxygen/stmt__functor_8h_source.html      |   25 +-
 .../structMemoryManagerInterface-members.html      |    1 +
 docs/api/doxygen/structMemoryManagerInterface.html |    1 +
 docs/api/doxygen/structTVMArgs-members.html        |    1 +
 docs/api/doxygen/structTVMArgs.html                |    1 +
 docs/api/doxygen/structTVMByteArray-members.html   |    1 +
 docs/api/doxygen/structTVMByteArray.html           |    1 +
 .../api/doxygen/structTVMFuncRegistry-members.html |    1 +
 docs/api/doxygen/structTVMFuncRegistry.html        |    1 +
 .../structTVMGraphExecutorGraphAttr-members.html   |    1 +
 .../doxygen/structTVMGraphExecutorGraphAttr.html   |    1 +
 docs/api/doxygen/structTVMModule-members.html      |    1 +
 docs/api/doxygen/structTVMModule.html              |    1 +
 .../structTVMMutableFuncRegistry-members.html      |    1 +
 docs/api/doxygen/structTVMMutableFuncRegistry.html |    1 +
 docs/api/doxygen/structTVMOpParam-members.html     |    1 +
 docs/api/doxygen/structTVMOpParam.html             |    1 +
 docs/api/doxygen/structTVMPackedFunc-members.html  |    1 +
 docs/api/doxygen/structTVMPackedFunc.html          |    1 +
 .../doxygen/structTVMParallelGroupEnv-members.html |    1 +
 docs/api/doxygen/structTVMParallelGroupEnv.html    |    1 +
 ...zer_1_1Handler_3_01DLDataType_01_4-members.html |    1 +
 ...1serializer_1_1Handler_3_01DLDataType_01_4.html |    1 +
 ...lizer_1_1Handler_3_01DLDevice_01_4-members.html |    1 +
 ...1_1serializer_1_1Handler_3_01DLDevice_01_4.html |    1 +
 .../doxygen/structtvm_1_1AttrError-members.html    |    1 +
 docs/api/doxygen/structtvm_1_1AttrError.html       |    1 +
 .../doxygen/structtvm_1_1ErrorBuilder-members.html |    1 +
 docs/api/doxygen/structtvm_1_1ErrorBuilder.html    |    1 +
 ...er_1_1AttachMapNode_1_1IterKeyHash-members.html |    1 +
 ..._scheduler_1_1AttachMapNode_1_1IterKeyHash.html |    1 +
 ...auto__scheduler_1_1SearchPolicyKey-members.html |    1 +
 ...ttvm_1_1auto__scheduler_1_1SearchPolicyKey.html |    1 +
 ...auto__scheduler_1_1StageAttributes-members.html |    1 +
 ...ttvm_1_1auto__scheduler_1_1StageAttributes.html |    1 +
 ...ructtvm_1_1detail_1_1AttrInitEntry-members.html |    1 +
 .../structtvm_1_1detail_1_1AttrInitEntry.html      |    1 +
 ...tructtvm_1_1detail_1_1AttrNopEntry-members.html |    1 +
 .../structtvm_1_1detail_1_1AttrNopEntry.html       |    1 +
 ...tail_1_1AttrTriggerNonDefaultEntry-members.html |    1 +
 ...vm_1_1detail_1_1AttrTriggerNonDefaultEntry.html |    1 +
 ...ttvm_1_1detail_1_1ImplSEqualReduce-members.html |    1 +
 .../structtvm_1_1detail_1_1ImplSEqualReduce.html   |    1 +
 ...lSEqualReduce_3_01T_00_01true_01_4-members.html |    1 +
 ...l_1_1ImplSEqualReduce_3_01T_00_01true_01_4.html |    1 +
 ...cttvm_1_1detail_1_1ImplSHashReduce-members.html |    1 +
 .../structtvm_1_1detail_1_1ImplSHashReduce.html    |    1 +
 ...plSHashReduce_3_01T_00_01true_01_4-members.html |    1 +
 ...il_1_1ImplSHashReduce_3_01T_00_01true_01_4.html |    1 +
 ...ucttvm_1_1detail_1_1ImplVisitAttrs-members.html |    1 +
 .../structtvm_1_1detail_1_1ImplVisitAttrs.html     |    1 +
 ...mplVisitAttrs_3_01T_00_01true_01_4-members.html |    1 +
 ...ail_1_1ImplVisitAttrs_3_01T_00_01true_01_4.html |    1 +
 ...cttvm_1_1detail_1_1ReflectionTrait-members.html |    1 +
 .../structtvm_1_1detail_1_1ReflectionTrait.html    |    1 +
 ...vm_1_1detail_1_1SelectSEqualReduce-members.html |    1 +
 .../structtvm_1_1detail_1_1SelectSEqualReduce.html |    1 +
 ...01T_00_01TraitName_00_01false_01_4-members.html |    1 +
 ...educe_3_01T_00_01TraitName_00_01false_01_4.html |    1 +
 ...tvm_1_1detail_1_1SelectSHashReduce-members.html |    1 +
 .../structtvm_1_1detail_1_1SelectSHashReduce.html  |    1 +
 ...01T_00_01TraitName_00_01false_01_4-members.html |    1 +
 ...educe_3_01T_00_01TraitName_00_01false_01_4.html |    1 +
 ...ttvm_1_1detail_1_1SelectVisitAttrs-members.html |    1 +
 .../structtvm_1_1detail_1_1SelectVisitAttrs.html   |    1 +
 ...01T_00_01TraitName_00_01false_01_4-members.html |    1 +
 ...Attrs_3_01T_00_01TraitName_00_01false_01_4.html |    1 +
 .../structtvm_1_1detail_1_1TypeName-members.html   |    1 +
 .../doxygen/structtvm_1_1detail_1_1TypeName.html   |    1 +
 ...tail_1_1TypeName_3_01DataType_01_4-members.html |    1 +
 ...vm_1_1detail_1_1TypeName_3_01DataType_01_4.html |    1 +
 ..._1detail_1_1TypeName_3_01bool_01_4-members.html |    1 +
 ...ucttvm_1_1detail_1_1TypeName_3_01bool_01_4.html |    1 +
 ...detail_1_1TypeName_3_01double_01_4-members.html |    1 +
 ...ttvm_1_1detail_1_1TypeName_3_01double_01_4.html |    1 +
 ...tail_1_1TypeName_3_01int64__t_01_4-members.html |    1 +
 ...vm_1_1detail_1_1TypeName_3_01int64__t_01_4.html |    1 +
 ...1_1detail_1_1TypeName_3_01int_01_4-members.html |    1 +
 ...ructtvm_1_1detail_1_1TypeName_3_01int_01_4.html |    1 +
 ...ail_1_1TypeName_3_01uint64__t_01_4-members.html |    1 +
 ...m_1_1detail_1_1TypeName_3_01uint64__t_01_4.html |    1 +
 ...ail_1_1TypeName_3_01void_01_5_01_4-members.html |    1 +
 ...m_1_1detail_1_1TypeName_3_01void_01_5_01_4.html |    1 +
 .../structtvm_1_1detail_1_1ValueTypeInfoMaker.html |    1 +
 ...cttvm_1_1detail_1_1is__specialized-members.html |    1 +
 .../structtvm_1_1detail_1_1is__specialized.html    |    1 +
 ...rgs_8_8_8_01_4_00_01Container_01_4-members.html |    1 +
 ...er_3_01Args_8_8_8_01_4_00_01Container_01_4.html |    1 +
 ...vm_1_1relay_1_1AdaptivePool1DAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1AdaptivePool1DAttrs.html |    1 +
 ...vm_1_1relay_1_1AdaptivePool2DAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1AdaptivePool2DAttrs.html |    1 +
 ...vm_1_1relay_1_1AdaptivePool3DAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1AdaptivePool3DAttrs.html |    1 +
 ...ucttvm_1_1relay_1_1AffineGridAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1AffineGridAttrs.html     |    1 +
 ...AllClassNonMaximumSuppressionAttrs-members.html |    1 +
 ...elay_1_1AllClassNonMaximumSuppressionAttrs.html |    1 +
 ...ttvm_1_1relay_1_1AllocStorageAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1AllocStorageAttrs.html   |    1 +
 ...cttvm_1_1relay_1_1AllocTensorAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1AllocTensorAttrs.html    |    1 +
 .../structtvm_1_1relay_1_1ArangeAttrs-members.html |    1 +
 .../doxygen/structtvm_1_1relay_1_1ArangeAttrs.html |    1 +
 ...structtvm_1_1relay_1_1ArgsortAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1ArgsortAttrs.html        |    1 +
 ...1AutoSchedulerLayoutTransformAttrs-members.html |    1 +
 ...relay_1_1AutoSchedulerLayoutTransformAttrs.html |    1 +
 ...ructtvm_1_1relay_1_1AvgPool1DAttrs-members.html |   36 +-
 .../structtvm_1_1relay_1_1AvgPool1DAttrs.html      |   19 +-
 ...tvm_1_1relay_1_1AvgPool1DAttrs__coll__graph.svg |  113 +-
 ..._1_1relay_1_1AvgPool1DAttrs__inherit__graph.svg |  101 +-
 ...ructtvm_1_1relay_1_1AvgPool2DAttrs-members.html |   36 +-
 .../structtvm_1_1relay_1_1AvgPool2DAttrs.html      |   19 +-
 ...tvm_1_1relay_1_1AvgPool2DAttrs__coll__graph.svg |  217 ++--
 ..._1_1relay_1_1AvgPool2DAttrs__inherit__graph.svg |  103 +-
 ...ructtvm_1_1relay_1_1AvgPool3DAttrs-members.html |   36 +-
 .../structtvm_1_1relay_1_1AvgPool3DAttrs.html      |   19 +-
 ...tvm_1_1relay_1_1AvgPool3DAttrs__coll__graph.svg |  113 +-
 ..._1_1relay_1_1AvgPool3DAttrs__inherit__graph.svg |  101 +-
 ...cttvm_1_1relay_1_1BatchMatmulAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1BatchMatmulAttrs.html    |    1 +
 ...ructtvm_1_1relay_1_1BatchNormAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1BatchNormAttrs.html      |    1 +
 ...vm_1_1relay_1_1BatchToSpaceNDAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1BatchToSpaceNDAttrs.html |    1 +
 ...structtvm_1_1relay_1_1BiasAddAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1BiasAddAttrs.html        |    1 +
 ...ttvm_1_1relay_1_1BinaryConv2DAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1BinaryConv2DAttrs.html   |    1 +
 ...cttvm_1_1relay_1_1BinaryDenseAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1BinaryDenseAttrs.html    |    1 +
 ...structtvm_1_1relay_1_1BitPackAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1BitPackAttrs.html        |    1 +
 .../structtvm_1_1relay_1_1CastAttrs-members.html   |    1 +
 .../doxygen/structtvm_1_1relay_1_1CastAttrs.html   |    1 +
 ...tructtvm_1_1relay_1_1CastHintAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1CastHintAttrs.html       |    1 +
 .../structtvm_1_1relay_1_1ClipAttrs-members.html   |    1 +
 .../doxygen/structtvm_1_1relay_1_1ClipAttrs.html   |    1 +
 ...tructtvm_1_1relay_1_1CompilerAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1CompilerAttrs.html       |    1 +
 ...cttvm_1_1relay_1_1ConcatenateAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1ConcatenateAttrs.html    |    1 +
 ...vm_1_1relay_1_1ConstructorValueObj-members.html |    1 +
 .../structtvm_1_1relay_1_1ConstructorValueObj.html |    1 +
 .../structtvm_1_1relay_1_1Conv1DAttrs-members.html |    1 +
 .../doxygen/structtvm_1_1relay_1_1Conv1DAttrs.html |    1 +
 ...m_1_1relay_1_1Conv1DTransposeAttrs-members.html |    1 +
 ...structtvm_1_1relay_1_1Conv1DTransposeAttrs.html |    1 +
 .../structtvm_1_1relay_1_1Conv2DAttrs-members.html |    1 +
 .../doxygen/structtvm_1_1relay_1_1Conv2DAttrs.html |    1 +
 ...m_1_1relay_1_1Conv2DTransposeAttrs-members.html |    1 +
 ...structtvm_1_1relay_1_1Conv2DTransposeAttrs.html |    1 +
 ...vm_1_1relay_1_1Conv2DWinogradAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1Conv2DWinogradAttrs.html |    1 +
 ...WinogradNNPACKWeightTransformAttrs-members.html |    1 +
 ..._1Conv2DWinogradNNPACKWeightTransformAttrs.html |    1 +
 .../structtvm_1_1relay_1_1Conv3DAttrs-members.html |    1 +
 .../doxygen/structtvm_1_1relay_1_1Conv3DAttrs.html |    1 +
 ...m_1_1relay_1_1Conv3DTransposeAttrs-members.html |    1 +
 ...structtvm_1_1relay_1_1Conv3DTransposeAttrs.html |    1 +
 ...vm_1_1relay_1_1Conv3DWinogradAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1Conv3DWinogradAttrs.html |    1 +
 ...ay_1_1ConvGemmWeightTransformAttrs-members.html |    1 +
 ...m_1_1relay_1_1ConvGemmWeightTransformAttrs.html |    1 +
 ..._1ConvWinogradWeightTransformAttrs-members.html |    1 +
 ...1relay_1_1ConvWinogradWeightTransformAttrs.html |    1 +
 ...cttvm_1_1relay_1_1CorrelationAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1CorrelationAttrs.html    |    1 +
 ...tvm_1_1relay_1_1CropAndResizeAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1CropAndResizeAttrs.html  |    1 +
 .../structtvm_1_1relay_1_1DebugAttrs-members.html  |    1 +
 .../doxygen/structtvm_1_1relay_1_1DebugAttrs.html  |    1 +
 ..._1_1relay_1_1DeformableConv2DAttrs-members.html |    1 +
 ...tructtvm_1_1relay_1_1DeformableConv2DAttrs.html |    1 +
 .../structtvm_1_1relay_1_1DenseAttrs-members.html  |    1 +
 .../doxygen/structtvm_1_1relay_1_1DenseAttrs.html  |    1 +
 ...ucttvm_1_1relay_1_1DeviceCopyAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1DeviceCopyAttrs.html     |    1 +
 .../structtvm_1_1relay_1_1DilateAttrs-members.html |    1 +
 .../doxygen/structtvm_1_1relay_1_1DilateAttrs.html |    1 +
 ...ucttvm_1_1relay_1_1Dilation2DAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1Dilation2DAttrs.html     |    1 +
 ...structtvm_1_1relay_1_1DropoutAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1DropoutAttrs.html        |    1 +
 ...ucttvm_1_1relay_1_1ExpandDimsAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1ExpandDimsAttrs.html     |    1 +
 ...ucttvm_1_1relay_1_1FIFOBufferAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1FIFOBufferAttrs.html     |    1 +
 ..._1relay_1_1FixedPointMultiplyAttrs-members.html |    1 +
 ...ucttvm_1_1relay_1_1FixedPointMultiplyAttrs.html |    1 +
 .../structtvm_1_1relay_1_1GatherAttrs-members.html |    1 +
 .../doxygen/structtvm_1_1relay_1_1GatherAttrs.html |    1 +
 ...vm_1_1relay_1_1GetValidCountsAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1GetValidCountsAttrs.html |    1 +
 ...ttvm_1_1relay_1_1GlobalPool2DAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1GlobalPool2DAttrs.html   |    1 +
 ...ucttvm_1_1relay_1_1GridSampleAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1GridSampleAttrs.html     |    1 +
 ...ructtvm_1_1relay_1_1GroupNormAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1GroupNormAttrs.html      |    1 +
 .../structtvm_1_1relay_1_1InitOpAttrs-members.html |    1 +
 .../doxygen/structtvm_1_1relay_1_1InitOpAttrs.html |    1 +
 ...ttvm_1_1relay_1_1InstanceNormAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1InstanceNormAttrs.html   |    1 +
 ...cttvm_1_1relay_1_1L2NormalizeAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1L2NormalizeAttrs.html    |    1 +
 .../structtvm_1_1relay_1_1LRNAttrs-members.html    |    1 +
 .../doxygen/structtvm_1_1relay_1_1LRNAttrs.html    |    1 +
 ...ructtvm_1_1relay_1_1LayerNormAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1LayerNormAttrs.html      |    1 +
 ...m_1_1relay_1_1LayoutTransformAttrs-members.html |    1 +
 ...structtvm_1_1relay_1_1LayoutTransformAttrs.html |    1 +
 ...ructtvm_1_1relay_1_1LeakyReluAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1LeakyReluAttrs.html      |    1 +
 ...tvm_1_1relay_1_1MatrixSetDiagAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1MatrixSetDiagAttrs.html  |    1 +
 ...ructtvm_1_1relay_1_1MaxPool1DAttrs-members.html |   36 +-
 .../structtvm_1_1relay_1_1MaxPool1DAttrs.html      |   19 +-
 ...tvm_1_1relay_1_1MaxPool1DAttrs__coll__graph.svg |  113 +-
 ..._1_1relay_1_1MaxPool1DAttrs__inherit__graph.svg |  101 +-
 ...ructtvm_1_1relay_1_1MaxPool2DAttrs-members.html |   36 +-
 .../structtvm_1_1relay_1_1MaxPool2DAttrs.html      |   19 +-
 ...tvm_1_1relay_1_1MaxPool2DAttrs__coll__graph.svg |  217 ++--
 ..._1_1relay_1_1MaxPool2DAttrs__inherit__graph.svg |  103 +-
 ...ructtvm_1_1relay_1_1MaxPool3DAttrs-members.html |   36 +-
 .../structtvm_1_1relay_1_1MaxPool3DAttrs.html      |   19 +-
 ...tvm_1_1relay_1_1MaxPool3DAttrs__coll__graph.svg |  113 +-
 ..._1_1relay_1_1MaxPool3DAttrs__inherit__graph.svg |  101 +-
 ...tructtvm_1_1relay_1_1MeshgridAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1MeshgridAttrs.html       |    1 +
 ...ructtvm_1_1relay_1_1MirrorPadAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1MirrorPadAttrs.html      |    1 +
 ...tvm_1_1relay_1_1MultiBoxPriorAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1MultiBoxPriorAttrs.html  |    1 +
 ...relay_1_1MultiBoxTransformLocAttrs-members.html |    1 +
 ...ttvm_1_1relay_1_1MultiBoxTransformLocAttrs.html |    1 +
 ...cttvm_1_1relay_1_1NdarraySizeAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1NdarraySizeAttrs.html    |    1 +
 ...elay_1_1NonMaximumSuppressionAttrs-members.html |    1 +
 ...tvm_1_1relay_1_1NonMaximumSuppressionAttrs.html |    1 +
 ...tructtvm_1_1relay_1_1OnDeviceAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1OnDeviceAttrs.html       |    1 +
 .../structtvm_1_1relay_1_1OneHotAttrs-members.html |    1 +
 .../doxygen/structtvm_1_1relay_1_1OneHotAttrs.html |    1 +
 .../structtvm_1_1relay_1_1PReluAttrs-members.html  |    1 +
 .../doxygen/structtvm_1_1relay_1_1PReluAttrs.html  |    1 +
 .../structtvm_1_1relay_1_1PadAttrs-members.html    |    1 +
 .../doxygen/structtvm_1_1relay_1_1PadAttrs.html    |    1 +
 ...tructtvm_1_1relay_1_1ProposalAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1ProposalAttrs.html       |    1 +
 ...tructtvm_1_1relay_1_1ROIAlignAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1ROIAlignAttrs.html       |    1 +
 ...structtvm_1_1relay_1_1ROIPoolAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1ROIPoolAttrs.html        |    1 +
 .../structtvm_1_1relay_1_1ReduceAttrs-members.html |    1 +
 .../doxygen/structtvm_1_1relay_1_1ReduceAttrs.html |    1 +
 .../structtvm_1_1relay_1_1RefValueObj-members.html |    1 +
 .../doxygen/structtvm_1_1relay_1_1RefValueObj.html |    1 +
 .../structtvm_1_1relay_1_1RepeatAttrs-members.html |    1 +
 .../doxygen/structtvm_1_1relay_1_1RepeatAttrs.html |    1 +
 ...structtvm_1_1relay_1_1ReshapeAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1ReshapeAttrs.html        |    1 +
 ...cttvm_1_1relay_1_1ReshapeLikeAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1ReshapeLikeAttrs.html    |    1 +
 ...tvm_1_1relay_1_1ReshapeTensorAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1ReshapeTensorAttrs.html  |    1 +
 ...tructtvm_1_1relay_1_1Resize3dAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1Resize3dAttrs.html       |    1 +
 .../structtvm_1_1relay_1_1ResizeAttrs-members.html |    1 +
 .../doxygen/structtvm_1_1relay_1_1ResizeAttrs.html |    1 +
 ...structtvm_1_1relay_1_1ReverseAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1ReverseAttrs.html        |    1 +
 ...m_1_1relay_1_1ReverseSequenceAttrs-members.html |    1 +
 ...structtvm_1_1relay_1_1ReverseSequenceAttrs.html |    1 +
 .../structtvm_1_1relay_1_1ScanopAttrs-members.html |    1 +
 .../doxygen/structtvm_1_1relay_1_1ScanopAttrs.html |    1 +
 ...ucttvm_1_1relay_1_1ScatterAddAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1ScatterAddAttrs.html     |    1 +
 ...structtvm_1_1relay_1_1ScatterAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1ScatterAttrs.html        |    1 +
 ...ructtvm_1_1relay_1_1ScatterNDAttrs-members.html |    3 +-
 .../structtvm_1_1relay_1_1ScatterNDAttrs.html      |   11 +-
 ...tvm_1_1relay_1_1ScatterNDAttrs__coll__graph.svg |  194 ++-
 ..._1_1relay_1_1ScatterNDAttrs__inherit__graph.svg |    2 +-
 ...ttvm_1_1relay_1_1SequenceMaskAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1SequenceMaskAttrs.html   |    1 +
 ...ructtvm_1_1relay_1_1ShapeFuncAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1ShapeFuncAttrs.html      |    1 +
 ...structtvm_1_1relay_1_1ShapeOfAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1ShapeOfAttrs.html        |    1 +
 ...ructtvm_1_1relay_1_1SliceLikeAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1SliceLikeAttrs.html      |    1 +
 ...structtvm_1_1relay_1_1SoftmaxAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1SoftmaxAttrs.html        |    1 +
 ...vm_1_1relay_1_1SpaceToBatchNDAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1SpaceToBatchNDAttrs.html |    1 +
 ...cttvm_1_1relay_1_1SparseDenseAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1SparseDenseAttrs.html    |    1 +
 ...tvm_1_1relay_1_1SparseToDenseAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1SparseToDenseAttrs.html  |    1 +
 ...m_1_1relay_1_1SparseTransposeAttrs-members.html |    1 +
 ...structtvm_1_1relay_1_1SparseTransposeAttrs.html |    1 +
 .../structtvm_1_1relay_1_1SplitAttrs-members.html  |    1 +
 .../doxygen/structtvm_1_1relay_1_1SplitAttrs.html  |    1 +
 ...structtvm_1_1relay_1_1SqueezeAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1SqueezeAttrs.html        |    1 +
 .../structtvm_1_1relay_1_1StackAttrs-members.html  |    1 +
 .../doxygen/structtvm_1_1relay_1_1StackAttrs.html  |    1 +
 ...ttvm_1_1relay_1_1StridedSliceAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1StridedSliceAttrs.html   |    1 +
 ...tructtvm_1_1relay_1_1SubPixelAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1SubPixelAttrs.html       |    1 +
 .../structtvm_1_1relay_1_1TakeAttrs-members.html   |   30 +-
 .../doxygen/structtvm_1_1relay_1_1TakeAttrs.html   |   19 +-
 ...tructtvm_1_1relay_1_1TakeAttrs__coll__graph.svg |  195 +--
 ...cttvm_1_1relay_1_1TakeAttrs__inherit__graph.svg |   95 +-
 ..._1_1relay_1_1ThreefryGenerateAttrs-members.html |    1 +
 ...tructtvm_1_1relay_1_1ThreefryGenerateAttrs.html |    1 +
 .../structtvm_1_1relay_1_1TileAttrs-members.html   |    1 +
 .../doxygen/structtvm_1_1relay_1_1TileAttrs.html   |    1 +
 .../structtvm_1_1relay_1_1TopKAttrs-members.html   |    1 +
 .../doxygen/structtvm_1_1relay_1_1TopKAttrs.html   |    1 +
 ...ructtvm_1_1relay_1_1TransposeAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1TransposeAttrs.html      |    1 +
 .../structtvm_1_1relay_1_1UniqueAttrs-members.html |    1 +
 .../doxygen/structtvm_1_1relay_1_1UniqueAttrs.html |    1 +
 ...ttvm_1_1relay_1_1UpSampling3DAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1UpSampling3DAttrs.html   |    1 +
 ...ucttvm_1_1relay_1_1UpSamplingAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1UpSamplingAttrs.html     |    1 +
 ...tructtvm_1_1relay_1_1VarianceAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1VarianceAttrs.html       |    1 +
 ...ructtvm_1_1relay_1_1YoloReorgAttrs-members.html |    1 +
 .../structtvm_1_1relay_1_1YoloReorgAttrs.html      |    1 +
 ...1_1relay_1_1qnn_1_1DequantizeAttrs-members.html |    1 +
 ...ructtvm_1_1relay_1_1qnn_1_1DequantizeAttrs.html |    1 +
 ...m_1_1relay_1_1qnn_1_1QuantizeAttrs-members.html |    1 +
 ...structtvm_1_1relay_1_1qnn_1_1QuantizeAttrs.html |    1 +
 ...1_1relay_1_1qnn_1_1RequantizeAttrs-members.html |    1 +
 ...ructtvm_1_1relay_1_1qnn_1_1RequantizeAttrs.html |    1 +
 ...y_1_1qnn_1_1SimulatedQuantizeAttrs-members.html |    1 +
 ..._1_1relay_1_1qnn_1_1SimulatedQuantizeAttrs.html |    1 +
 .../structtvm_1_1relay_1_1v__info-members.html     |    1 +
 .../api/doxygen/structtvm_1_1relay_1_1v__info.html |    1 +
 ...runtime_1_1Array_1_1ValueConverter-members.html |    1 +
 ...ttvm_1_1runtime_1_1Array_1_1ValueConverter.html |    1 +
 .../structtvm_1_1runtime_1_1NullOptType.html       |    1 +
 ...tructtvm_1_1runtime_1_1ObjectEqual-members.html |    1 +
 .../structtvm_1_1runtime_1_1ObjectEqual.html       |    1 +
 ...structtvm_1_1runtime_1_1ObjectHash-members.html |    1 +
 .../structtvm_1_1runtime_1_1ObjectHash.html        |    1 +
 ...cttvm_1_1runtime_1_1ObjectPtrEqual-members.html |    1 +
 .../structtvm_1_1runtime_1_1ObjectPtrEqual.html    |    1 +
 ...ucttvm_1_1runtime_1_1ObjectPtrHash-members.html |    1 +
 .../structtvm_1_1runtime_1_1ObjectPtrHash.html     |    1 +
 ...vm_1_1runtime_1_1ObjectTypeChecker-members.html |    1 +
 .../structtvm_1_1runtime_1_1ObjectTypeChecker.html |    1 +
 ...eChecker_3_01Array_3_01T_01_4_01_4-members.html |    1 +
 ...bjectTypeChecker_3_01Array_3_01T_01_4_01_4.html |    1 +
 ...ker_3_01Map_3_01K_00_01V_01_4_01_4-members.html |    1 +
 ...TypeChecker_3_01Map_3_01K_00_01V_01_4_01_4.html |    1 +
 ...untime_1_1PackedFuncValueConverter-members.html |    1 +
 ...tvm_1_1runtime_1_1PackedFuncValueConverter.html |    1 +
 ...erter_3_01Optional_3_01T_01_4_01_4-members.html |    1 +
 ...alueConverter_3_01Optional_3_01T_01_4_01_4.html |    1 +
 ...ncValueConverter_3_01PrimExpr_01_4-members.html |    1 +
 ...PackedFuncValueConverter_3_01PrimExpr_01_4.html |    1 +
 ...alueConverter_3_01tvm_1_1Bool_01_4-members.html |    1 +
 ...kedFuncValueConverter_3_01tvm_1_1Bool_01_4.html |    1 +
 ...eConverter_3_01tvm_1_1Integer_01_4-members.html |    1 +
 ...FuncValueConverter_3_01tvm_1_1Integer_01_4.html |    1 +
 ...3_1_1tvm_1_1runtime_1_1String_01_4-members.html |    1 +
 ...nverter_3_1_1tvm_1_1runtime_1_1String_01_4.html |    1 +
 .../structtvm_1_1runtime_1_1TypeIndex-members.html |    1 +
 .../doxygen/structtvm_1_1runtime_1_1TypeIndex.html |    1 +
 ...ime_1_1micro__rpc_1_1SessionHeader-members.html |    1 +
 ..._1_1runtime_1_1micro__rpc_1_1SessionHeader.html |    1 +
 ...1runtime_1_1profiling_1_1CallFrame-members.html |    1 +
 ...cttvm_1_1runtime_1_1profiling_1_1CallFrame.html |    1 +
 ...ructtvm_1_1runtime_1_1vm_1_1Buffer-members.html |    1 +
 .../structtvm_1_1runtime_1_1vm_1_1Buffer.html      |    1 +
 ...vm_1_1runtime_1_1vm_1_1Instruction-members.html |    1 +
 .../structtvm_1_1runtime_1_1vm_1_1Instruction.html |    1 +
 ...ucttvm_1_1runtime_1_1vm_1_1VMFrame-members.html |    1 +
 .../structtvm_1_1runtime_1_1vm_1_1VMFrame.html     |    1 +
 ...tvm_1_1runtime_1_1vm_1_1VMFunction-members.html |    1 +
 .../structtvm_1_1runtime_1_1vm_1_1VMFunction.html  |    1 +
 .../structtvm_1_1te_1_1TensorDom-members.html      |    1 +
 docs/api/doxygen/structtvm_1_1te_1_1TensorDom.html |    1 +
 .../structtvm_1_1tir_1_1BlockInfo-members.html     |    1 +
 .../api/doxygen/structtvm_1_1tir_1_1BlockInfo.html |    1 +
 .../structtvm_1_1tir_1_1ExprDeepEqual-members.html |    1 +
 .../doxygen/structtvm_1_1tir_1_1ExprDeepEqual.html |    1 +
 .../structtvm_1_1tir_1_1LENode-members.html        |    1 +
 docs/api/doxygen/structtvm_1_1tir_1_1LENode.html   |    1 +
 docs/api/doxygen/structural__equal_8h.html         |    1 +
 docs/api/doxygen/structural__equal_8h_source.html  |    1 +
 docs/api/doxygen/structural__hash_8h.html          |    1 +
 docs/api/doxygen/structural__hash_8h_source.html   |    1 +
 docs/api/doxygen/tag_8h.html                       |    1 +
 docs/api/doxygen/tag_8h_source.html                |    1 +
 docs/api/doxygen/tags_8h.html                      |    1 +
 docs/api/doxygen/tags_8h_source.html               |    1 +
 docs/api/doxygen/target_8h.html                    |    1 +
 docs/api/doxygen/target_8h_source.html             |    1 +
 docs/api/doxygen/target__info_8h.html              |    1 +
 docs/api/doxygen/target__info_8h_source.html       |    1 +
 docs/api/doxygen/target__kind_8h.html              |    1 +
 docs/api/doxygen/target__kind_8h_source.html       |    3 +-
 docs/api/doxygen/te_2schedule_8h.html              |    1 +
 docs/api/doxygen/te_2schedule_8h_source.html       |    1 +
 docs/api/doxygen/tensor_8h.html                    |    1 +
 docs/api/doxygen/tensor_8h_source.html             |    3 +-
 docs/api/doxygen/tensor__intrin_8h.html            |    1 +
 docs/api/doxygen/tensor__intrin_8h_source.html     |    1 +
 docs/api/doxygen/tensor__type_8h.html              |    1 +
 docs/api/doxygen/tensor__type_8h_source.html       |    3 +-
 docs/api/doxygen/tensor__utils_8h.html             |    1 +
 docs/api/doxygen/tensor__utils_8h_source.html      |    1 +
 docs/api/doxygen/threading__backend_8h.html        |    1 +
 docs/api/doxygen/threading__backend_8h_source.html |    1 +
 docs/api/doxygen/tir_2analysis_8h.html             |    1 +
 docs/api/doxygen/tir_2analysis_8h_source.html      |    3 +-
 docs/api/doxygen/tir_2expr_8h.html                 |    1 +
 docs/api/doxygen/tir_2expr_8h_source.html          |  164 +--
 docs/api/doxygen/tir_2expr__functor_8h.html        |    1 +
 docs/api/doxygen/tir_2expr__functor_8h_source.html |   19 +-
 docs/api/doxygen/tir_2function_8h.html             |    1 +
 docs/api/doxygen/tir_2function_8h_source.html      |    1 +
 docs/api/doxygen/tir_2op_8h.html                   |    1 +
 docs/api/doxygen/tir_2op_8h_source.html            |    9 +-
 docs/api/doxygen/tir_2op__attr__types_8h.html      |    1 +
 .../doxygen/tir_2op__attr__types_8h_source.html    |    1 +
 docs/api/doxygen/tir_2schedule_2schedule_8h.html   |    1 +
 .../doxygen/tir_2schedule_2schedule_8h_source.html |    5 +-
 docs/api/doxygen/tir_2transform_8h.html            |    9 +
 docs/api/doxygen/tir_2transform_8h_source.html     |    8 +-
 docs/api/doxygen/topi_2nn_8h.html                  |    1 +
 docs/api/doxygen/topi_2nn_8h_source.html           |    3 +-
 docs/api/doxygen/topi_2transform_8h.html           |   13 +-
 docs/api/doxygen/topi_2transform_8h_source.html    |   46 +-
 docs/api/doxygen/transform__step_8h.html           |    1 +
 docs/api/doxygen/transform__step_8h_source.html    |    1 +
 docs/api/doxygen/type__functor_8h.html             |    1 +
 docs/api/doxygen/type__functor_8h_source.html      |    1 +
 docs/api/doxygen/type__relation_8h.html            |    1 +
 docs/api/doxygen/type__relation_8h_source.html     |    1 +
 docs/api/doxygen/unionTVMValue-members.html        |    1 +
 docs/api/doxygen/unionTVMValue.html                |    1 +
 docs/api/doxygen/utils_8h.html                     |    1 +
 docs/api/doxygen/utils_8h_source.html              |    1 +
 docs/api/doxygen/utvm__rpc__server_8h.html         |    1 +
 docs/api/doxygen/utvm__rpc__server_8h_source.html  |    1 +
 docs/api/doxygen/utvm__runtime_8h.html             |    1 +
 docs/api/doxygen/utvm__runtime_8h_source.html      |    1 +
 docs/api/doxygen/var_8h.html                       |    1 +
 docs/api/doxygen/var_8h_source.html                |    1 +
 docs/api/doxygen/vision_8h.html                    |    1 +
 docs/api/doxygen/vision_8h_source.html             |    1 +
 docs/api/doxygen/with_8h.html                      |    1 +
 docs/api/doxygen/with_8h_source.html               |    1 +
 docs/api/doxygen/write__stream_8h.html             |    1 +
 docs/api/doxygen/write__stream_8h_source.html      |    1 +
 docs/api/doxygen/x86_2bnn_8h.html                  |    1 +
 docs/api/doxygen/x86_2bnn_8h_source.html           |    1 +
 docs/api/doxygen/x86_2default_8h.html              |    1 +
 docs/api/doxygen/x86_2default_8h_source.html       |    1 +
 docs/api/doxygen/x86_2injective_8h.html            |    1 +
 docs/api/doxygen/x86_2injective_8h_source.html     |    1 +
 .../javadoc/org/apache/tvm/class-use/Function.html |   16 +-
 .../javadoc/org/apache/tvm/class-use/Module.html   |    8 +-
 docs/api/links.html                                |    6 +-
 docs/api/python/auto_scheduler.html                |   42 +-
 docs/api/python/autotvm.html                       |    6 +-
 docs/api/python/contrib.html                       |    6 +-
 docs/api/python/driver.html                        |    6 +-
 docs/api/python/error.html                         |    6 +-
 docs/api/python/graph_executor.html                |    6 +-
 docs/api/python/index.html                         |    6 +-
 docs/api/python/ir.html                            |    6 +-
 docs/api/python/micro.html                         |    6 +-
 docs/api/python/ndarray.html                       |    6 +-
 docs/api/python/relay/analysis.html                |    6 +-
 docs/api/python/relay/backend.html                 |    6 +-
 docs/api/python/relay/dataflow_pattern.html        |    6 +-
 docs/api/python/relay/frontend.html                |    6 +-
 docs/api/python/relay/image.html                   |    6 +-
 docs/api/python/relay/index.html                   |   32 +-
 docs/api/python/relay/nn.html                      |   24 +-
 docs/api/python/relay/testing.html                 |    6 +-
 docs/api/python/relay/transform.html               |    6 +-
 docs/api/python/relay/vision.html                  |    6 +-
 docs/api/python/rpc.html                           |   10 +-
 docs/api/python/runtime.html                       |    6 +-
 docs/api/python/target.html                        |    6 +-
 docs/api/python/te.html                            |    6 +-
 docs/api/python/tir.html                           |  165 ++-
 docs/api/python/topi.html                          |   89 +-
 docs/api/python/vta/index.html                     |    6 +-
 docs/api/typedoc/classes/bytestreamreader.html     |   12 +-
 docs/api/typedoc/classes/cachedcallstack.html      |   34 +-
 docs/api/typedoc/classes/dldatatype.html           |   12 +-
 docs/api/typedoc/classes/dldevice.html             |   10 +-
 docs/api/typedoc/classes/environment.html          |   12 +-
 docs/api/typedoc/classes/ffilibrary.html           |   20 +-
 docs/api/typedoc/classes/graphexecutor.html        |   16 +-
 docs/api/typedoc/classes/instance.html             |   40 +-
 docs/api/typedoc/classes/memory.html               |   34 +-
 docs/api/typedoc/classes/module.html               |   10 +-
 docs/api/typedoc/classes/ndarray.html              |   22 +-
 docs/api/typedoc/classes/packedfunccell.html       |    6 +-
 docs/api/typedoc/classes/rpcserver.html            |   14 +-
 docs/api/typedoc/classes/scalar.html               |    6 +-
 docs/api/typedoc/classes/webgpucontext.html        |   12 +-
 docs/api/typedoc/enums/argtypecode.html            |   30 +-
 docs/api/typedoc/enums/aynccallbackcode.html       |    4 +-
 docs/api/typedoc/enums/dldatatypecode.html         |    8 +-
 docs/api/typedoc/enums/rpcserverstate.html         |   12 +-
 docs/api/typedoc/enums/sizeof.html                 |   18 +-
 docs/api/typedoc/index.html                        |  114 +-
 docs/api/typedoc/interfaces/disposable.html        |    2 +-
 docs/api/typedoc/interfaces/functioninfo.html      |    6 +-
 docs/api/typedoc/interfaces/libraryprovider.html   |    4 +-
 docs/contribute/code_guide.html                    |    6 +-
 docs/contribute/code_review.html                   |    6 +-
 docs/contribute/committer_guide.html               |    6 +-
 docs/contribute/community.html                     |    6 +-
 docs/contribute/document.html                      |    6 +-
 docs/contribute/error_handling.html                |    6 +-
 docs/contribute/git_howto.html                     |    6 +-
 docs/contribute/index.html                         |    6 +-
 docs/contribute/pull_request.html                  |    6 +-
 docs/contribute/release_process.html               |    6 +-
 docs/deploy/android.html                           |    6 +-
 docs/deploy/arm_compute_lib.html                   |    8 +-
 docs/deploy/bnns.html                              |    6 +-
 docs/deploy/cpp_deploy.html                        |    6 +-
 docs/deploy/hls.html                               |    6 +-
 docs/deploy/index.html                             |    6 +-
 docs/deploy/integrate.html                         |    6 +-
 docs/deploy/tensorrt.html                          |    6 +-
 docs/deploy/vitis_ai.html                          |    6 +-
 docs/dev/benchmark.html                            |    6 +-
 docs/dev/codebase_walkthrough.html                 |    6 +-
 docs/dev/convert_layout.html                       |    6 +-
 docs/dev/debugger.html                             |    6 +-
 docs/dev/frontend/tensorflow.html                  |    6 +-
 docs/dev/how_to.html                               |    6 +-
 docs/dev/hybrid_script.html                        |    6 +-
 docs/dev/index.html                                |   12 +-
 docs/dev/inferbound.html                           |    6 +-
 docs/dev/introduction_to_module_serialization.html |    6 +-
 docs/dev/microtvm_design.html                      |    6 +-
 docs/dev/pass_infra.html                           |   10 +-
 docs/dev/relay_add_op.html                         |    6 +-
 docs/dev/relay_add_pass.html                       |    6 +-
 docs/dev/relay_bring_your_own_codegen.html         |    6 +-
 docs/dev/relay_intro.html                          |    6 +-
 docs/dev/relay_op_strategy.html                    |    6 +-
 docs/dev/runtime.html                              |   10 +-
 docs/dev/security.html                             |    6 +-
 docs/dev/virtual_machine.html                      |    6 +-
 docs/errors.html                                   |    6 +-
 docs/faq.html                                      |    6 +-
 docs/genindex.html                                 |   16 +-
 docs/index.html                                    |    6 +-
 docs/install/docker.html                           |    6 +-
 docs/install/from_source.html                      |   15 +-
 docs/install/index.html                            |    6 +-
 docs/install/nnpack.html                           |    6 +-
 docs/langref/hybrid_script.html                    |    6 +-
 docs/langref/index.html                            |    6 +-
 docs/langref/relay_adt.html                        |    6 +-
 docs/langref/relay_expr.html                       |    6 +-
 docs/langref/relay_op.html                         |    6 +-
 docs/langref/relay_pattern.html                    |    6 +-
 docs/langref/relay_type.html                       |    6 +-
 docs/microtvm/index.html                           |    6 +-
 docs/objects.inv                                   |  Bin 19057 -> 19009 bytes
 docs/py-modindex.html                              |    6 +-
 docs/search.html                                   |    6 +-
 docs/searchindex.js                                |    2 +-
 .../auto_scheduler/sg_execution_times.html         |   20 +-
 .../auto_scheduler/tune_conv2d_layer_cuda.html     | 1277 ++++++++++----------
 .../tutorials/auto_scheduler/tune_network_arm.html |    8 +-
 .../auto_scheduler/tune_network_cuda.html          |   12 +-
 .../auto_scheduler/tune_network_mali.html          |   92 +-
 .../tutorials/auto_scheduler/tune_network_x86.html |   12 +-
 docs/tutorials/auto_scheduler/tune_sparse_x86.html |   42 +-
 docs/tutorials/autotvm/sg_execution_times.html     |   18 +-
 docs/tutorials/autotvm/tune_conv2d_cuda.html       |   52 +-
 docs/tutorials/autotvm/tune_relay_arm.html         |   10 +-
 docs/tutorials/autotvm/tune_relay_cuda.html        |   12 +-
 docs/tutorials/autotvm/tune_relay_mobile_gpu.html  |   10 +-
 docs/tutorials/autotvm/tune_relay_x86.html         |    6 +-
 docs/tutorials/dev/bring_your_own_datatypes.html   |    6 +-
 docs/tutorials/dev/low_level_custom_pass.html      |   10 +-
 docs/tutorials/dev/sg_execution_times.html         |   14 +-
 docs/tutorials/dev/use_pass_infra.html             |    6 +-
 docs/tutorials/frontend/build_gcn.html             |    6 +-
 .../frontend/deploy_model_on_android.html          |   10 +-
 docs/tutorials/frontend/deploy_model_on_rasp.html  |    6 +-
 .../frontend/deploy_object_detection_pytorch.html  |    8 +-
 docs/tutorials/frontend/deploy_prequantized.html   |    8 +-
 .../frontend/deploy_prequantized_tflite.html       |   10 +-
 docs/tutorials/frontend/deploy_quantized.html      |    6 +-
 docs/tutorials/frontend/deploy_sparse.html         |    8 +-
 docs/tutorials/frontend/deploy_ssd_gluoncv.html    |    8 +-
 docs/tutorials/frontend/from_caffe2.html           |    6 +-
 docs/tutorials/frontend/from_coreml.html           |    6 +-
 docs/tutorials/frontend/from_darknet.html          |    6 +-
 docs/tutorials/frontend/from_keras.html            |    6 +-
 docs/tutorials/frontend/from_mxnet.html            |    6 +-
 docs/tutorials/frontend/from_onnx.html             |    8 +-
 docs/tutorials/frontend/from_pytorch.html          |    6 +-
 docs/tutorials/frontend/from_tensorflow.html       |  148 +--
 docs/tutorials/frontend/from_tflite.html           |    6 +-
 docs/tutorials/frontend/sg_execution_times.html    |   46 +-
 docs/tutorials/frontend/using_external_lib.html    |    6 +-
 .../get_started/auto_tuning_with_python.html       |  159 +--
 docs/tutorials/get_started/autotvm_matmul.html     |    6 +-
 .../get_started/cross_compilation_and_rpc.html     |    8 +-
 docs/tutorials/get_started/install.html            |    6 +-
 docs/tutorials/get_started/introduction.html       |    6 +-
 docs/tutorials/get_started/relay_quick_start.html  |  126 +-
 docs/tutorials/get_started/sg_execution_times.html |   26 +-
 .../get_started/tensor_expr_get_started.html       |   67 +-
 docs/tutorials/get_started/tune_matmul_x86.html    |  128 +-
 .../get_started/tvmc_command_line_driver.html      |    6 +-
 docs/tutorials/index.html                          |   81 +-
 docs/tutorials/language/extern_op.html             |    6 +-
 docs/tutorials/language/intrin_math.html           |    9 +-
 docs/tutorials/language/reduction.html             |    6 +-
 docs/tutorials/language/scan.html                  |    6 +-
 docs/tutorials/language/schedule_primitives.html   |   14 +-
 docs/tutorials/language/sg_execution_times.html    |   24 +-
 docs/tutorials/language/tedd.html                  |    6 +-
 docs/tutorials/language/tensorize.html             |   14 +-
 docs/tutorials/language/tuple_inputs.html          |   20 +-
 docs/tutorials/micro/micro_reference_vm.html       |    6 +-
 docs/tutorials/micro/micro_tflite.html             |    6 +-
 docs/tutorials/micro/sg_execution_times.html       |   12 +-
 docs/tutorials/optimize/opt_conv_cuda.html         |    9 +-
 docs/tutorials/optimize/opt_conv_tensorcore.html   |   13 +-
 docs/tutorials/optimize/opt_gemm.html              |   23 +-
 .../optimize/opt_matmul_auto_tensorcore.html       |  921 --------------
 docs/tutorials/optimize/sg_execution_times.html    |   15 +-
 docs/tutorials/topi/intro_topi.html                |    8 +-
 docs/tutorials/topi/sg_execution_times.html        |   10 +-
 docs/vta/dev/config.html                           |    6 +-
 docs/vta/dev/hardware.html                         |    6 +-
 docs/vta/dev/index.html                            |    6 +-
 docs/vta/index.html                                |    6 +-
 docs/vta/install.html                              |    6 +-
 docs/vta/tutorials/autotvm/sg_execution_times.html |   12 +-
 docs/vta/tutorials/autotvm/tune_alu_vta.html       |    6 +-
 docs/vta/tutorials/autotvm/tune_relay_vta.html     |  194 +--
 .../tutorials/frontend/deploy_classification.html  |   28 +-
 .../vta/tutorials/frontend/sg_execution_times.html |   10 +-
 docs/vta/tutorials/index.html                      |    6 +-
 docs/vta/tutorials/matrix_multiply.html            |    6 +-
 docs/vta/tutorials/optimize/convolution_opt.html   |   14 +-
 .../tutorials/optimize/matrix_multiply_opt.html    |   14 +-
 .../vta/tutorials/optimize/sg_execution_times.html |   12 +-
 docs/vta/tutorials/sg_execution_times.html         |   12 +-
 docs/vta/tutorials/vta_get_started.html            |    6 +-
 2581 files changed, 8383 insertions(+), 7870 deletions(-)

diff --git a/docs/_downloads/0c8b1cb0bb1d1dff7899c341215a0f35/tune_network_mali.ipynb b/docs/_downloads/0c8b1cb0bb1d1dff7899c341215a0f35/tune_network_mali.ipynb
index ab3faab..d649bd4 100644
--- a/docs/_downloads/0c8b1cb0bb1d1dff7899c341215a0f35/tune_network_mali.ipynb
+++ b/docs/_downloads/0c8b1cb0bb1d1dff7899c341215a0f35/tune_network_mali.ipynb
@@ -15,7 +15,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\nAuto-scheduling a Neural Network for mali GPU\n=============================================\n**Author**: `Zhao Wu <https://github.com/FrozenGene>`_\n\nAuto-tuning for specific devices and workloads is critical for getting the\nbest performance. This is a tutorial on how to tune a whole neural\nnetwork for mali GPU with the auto-scheduler.\n\nTo auto-tune a neural network, we partition the network into small subgraphs and \ntune them independently. Each subgraph is treated as  [...]
+        "\nAuto-scheduling a Neural Network for mali GPU\n=============================================\n**Author**: `Zhao Wu <https://github.com/FrozenGene>`_\n\nAuto-tuning for specific devices and workloads is critical for getting the\nbest performance. This is a tutorial on how to tune a whole neural\nnetwork for mali GPU with the auto-scheduler.\n\nTo auto-tune a neural network, we partition the network into small subgraphs and\ntune them independently. Each subgraph is treated as o [...]
       ]
     },
     {
@@ -87,7 +87,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "<div class=\"alert alert-info\"><h4>Note</h4><p>How to get the hardware parameters from remote device\n\n  .. code-block:: python\n\n    from tvm.auto_scheduler.utils import request_remote\n    remote = request_remote(device_key, \"0.0.0.0\", 9190)\n    dev = remote.cl()\n    max_shared_memory_per_block = dev.max_shared_memory_per_block\n    # There is no explicit local memory limition\n    # so we can use INT32_MAX to disalbe the check on local_memory.\n    max_local_memory_per [...]
+        "<div class=\"alert alert-info\"><h4>Note</h4><p>How to get the hardware parameters from remote device\n\n  .. code-block:: python\n\n    from tvm.auto_scheduler.utils import request_remote\n    remote = request_remote(device_key, \"127.0.0.1\", 9190)\n    dev = remote.cl()\n    max_shared_memory_per_block = dev.max_shared_memory_per_block\n    # There is no explicit local memory limition\n    # so we can use INT32_MAX to disable the check on local_memory.\n    max_local_memory_p [...]
       ]
     },
     {
@@ -105,7 +105,7 @@
       },
       "outputs": [],
       "source": [
-        "def tune_and_evaluate():\n    print(\"Begin tuning...\")\n    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)\n    tune_option = auto_scheduler.TuningOptions(\n        num_measure_trials=200,  # change this to 20000 to achieve the best performance\n        builder=auto_scheduler.LocalBuilder(build_func=\"ndk\" if use_ndk else \"default\"),\n        runner=auto_scheduler.RPCRunner(\n            device_key, host=\"0.0.0.0\", port=9190, repeat=3, timeout=50\n        ),\n  [...]
+        "def tune_and_evaluate():\n    print(\"Begin tuning...\")\n    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)\n    tune_option = auto_scheduler.TuningOptions(\n        num_measure_trials=200,  # change this to 20000 to achieve the best performance\n        builder=auto_scheduler.LocalBuilder(build_func=\"ndk\" if use_ndk else \"default\"),\n        runner=auto_scheduler.RPCRunner(\n            device_key, host=\"127.0.0.1\", port=9190, repeat=3, timeout=50\n        ),\ [...]
       ]
     },
     {
diff --git a/docs/_downloads/10e16681be542cc483fa89e9b4678a27/opt_matmul_auto_tensorcore.py b/docs/_downloads/10e16681be542cc483fa89e9b4678a27/opt_matmul_auto_tensorcore.py
deleted file mode 100644
index 03682a0..0000000
--- a/docs/_downloads/10e16681be542cc483fa89e9b4678a27/opt_matmul_auto_tensorcore.py
+++ /dev/null
@@ -1,544 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-.. _opt-matmul-auto-tensorcore:
-
-How to optimize matmul with Auto TensorCore CodeGen
-===================================================
-**Author**: `Minmin Sun <https://github.com/minminsun>`_, \
-            `Lanbo Li <https://github.com/Orion34C>`_, \
-            `Chenfan Jia <https://github.com/jcf94>`_, \
-            `Jun Yang <https://github.com/yangjunpro>`_
-
-In this tutorial, we will demonstrate how to write a high-performance matmul
-schedule on Volta/Turing GPUs with TVM Auto TensorCore CodeGen.
-This is a transparent solution to generate tensorcore kernels
-with most transformations done in ir passes.
-Users can also write schedule with tensorization to generate TensorCore code.
-Both solutions use the same tensorcore intrinsics.
-Please refer to :ref:`opt-conv-tensorcore` tutorial for more details.
-"""
-
-################################################################
-# Preparation and Algorithm
-# -------------------------
-# Two input data types are supported: float16 and int8.
-# For float16, the accumulator is float32.
-# For int8, the accumulator is int32.
-# For data layouts, 'N' means non-transpose while 'T' means transpose.
-
-import logging
-import sys
-
-import numpy as np
-import tvm
-from tvm import te
-
-from tvm import autotvm
-from tvm.contrib import nvcc
-import tvm.testing
-
-
-def matmul_nn(A, B, L, dtype="float16", layout="NN"):
-    k = te.reduce_axis((0, L), name="k")
-    if dtype == "float16":
-        out_type = "float"
-    elif dtype == "int8":
-        out_type = "int"
-    elif dtype == "int4" or dtype == "int1":
-        out_type = "int"
-    if layout == "NN":
-        return te.compute(
-            (N, M), lambda i, j: te.sum(A[i, k].astype(out_type) * B[k, j].astype(out_type), axis=k)
-        )
-    if layout == "NT":
-        return te.compute(
-            (N, M), lambda i, j: te.sum(A[k, i].astype(out_type) * B[k, j].astype(out_type), axis=k)
-        )
-    if layout == "TN":
-        return te.compute(
-            (N, M), lambda i, j: te.sum(A[i, k].astype(out_type) * B[j, k].astype(out_type), axis=k)
-        )
-    if layout == "TT":
-        return te.compute(
-            (N, M), lambda i, j: te.sum(A[k, i].astype(out_type) * B[j, k].astype(out_type), axis=k)
-        )
-
-
-###############################################################################
-# Scheduling the Computation
-# --------------------------
-# This schedule is no different than a non-tensorcore matmul schedule on GPU.
-# Please refer to :ref:`opt-gemm` tutorial for basics of optimizing matmul schedule.
-# When the "tensor_core" pragma is set, the "rewrite for tensorcore" ir pass
-# will automatically transform the schedule for tensorcore codegen;
-# otherwise, normal CUDA code, with lower performance but equal functionality, will be generated.
-#
-# .. note::
-#
-#   *Requirements of TensorCore*
-#
-#   Note that in the following 2 cases, even though the "tensor_core" pragma is set, TVM will still fall back to normal CUDA codegen:
-#   (1) The m, n or k of the input matrices is not a multiple of 16;
-#   (2) The warp tile size is not 16x16x16 on CUDA9, or not one of {16x16x16, 32x8x16, 8x32x16} on CUDA version >= 10.0.
-#
-# In this schedule, storage_align is used to reduce bank conflicts of shared memory. Please refer to this
-# `doc <https://tvm.apache.org/docs/api/python/te.html#tvm.te.Stage.storage_align>`_
-# for the usage of storage_align primitive. In short, we need to add an offset to some shared memory buffer
-# to reduce bank conflicts.
-# According to the `wmma doc <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#wmma-description>`_,
-# the stride of load_matrix_sync must be a multiple of 16 bytes,
-# so we choose 8 as offset for float16 and 16 as offset for int8.
-#
-# We use AutoTVM to search for best configurations in this schedule.
-
-
-@autotvm.template("tutorial/auto_tensorcore/test_gemm")
-def test_gemm(N, L, M, dtype, layout):
-    if layout == "NN":
-        shape_a = (N, L)
-        shape_b = (L, M)
-    elif layout == "NT":
-        shape_a = (L, N)
-        shape_b = (L, M)
-    elif layout == "TN":
-        shape_a = (N, L)
-        shape_b = (M, L)
-    elif layout == "TT":
-        shape_a = (L, N)
-        shape_b = (M, L)
-    else:
-        print("Unsupported layout:", layout)
-        sys.exit(1)
-    A = te.placeholder(shape_a, name="A", dtype=dtype)
-    B = te.placeholder(shape_b, name="B", dtype=dtype)
-    C = matmul_nn(A, B, L, dtype, layout)
-
-    s = te.create_schedule(C.op)
-    y, x = s[C].op.axis
-    k = s[C].op.reduce_axis[0]
-
-    # storage_align params
-    factor = 16
-    offset = 8
-    if dtype == "int8":
-        factor = 32
-        offset = 16
-    elif dtype == "int4":
-        factor = 64
-        offset = 32
-    elif dtype == "int1":
-        factor = 256
-        offset = 128
-
-    # create cache stages
-    AA = s.cache_read(A, "shared", [C])
-    if layout == "NN" or layout == "TN":
-        s[AA].storage_align(AA.op.axis[0], factor, offset)
-    AL = s.cache_read(AA, "local", [C])
-    BB = s.cache_read(B, "shared", [C])
-    if layout == "TT" or layout == "NT":
-        s[BB].storage_align(BB.op.axis[0], factor, offset)
-    BL = s.cache_read(BB, "local", [C])
-    CL = s.cache_write(C, "local")
-
-    # autotvm search space definition
-    cfg = autotvm.get_config()
-
-    cfg.define_knob("bx", [2, 4, 8])
-    cfg.define_knob("by", [8, 16, 32, 64])
-    cfg.define_knob("step_k", [1, 2, 4, 8, 16, 32])
-    cfg.define_knob("v", [4, 8, 16, 32])
-    by = cfg["by"].val
-    bx = cfg["bx"].val
-    step_k = cfg["step_k"].val
-    v = cfg["v"].val
-
-    # thread tile
-    TX = 8
-    TY = 1
-    if dtype == "int4" or dtype == "int1":
-        TX = 2
-    # warp tile
-    warp_tile_m = 16  # it could also be 8 or 32 on CUDA version >= 10.0
-    warp_tile_k = 16  # it must be 16 for fp16/int8 data type
-    if dtype == "int4":
-        warp_tile_m = 8
-        warp_tile_k = 32
-    elif dtype == "int1":
-        warp_tile_m = 8
-        warp_tile_k = 128
-    # block tile
-    tile_x = bx * TX
-    tile_y = by * TY
-
-    yo, ty = s[C].split(y, tile_y)
-    ty, yi = s[C].split(ty, TY)
-
-    # schedule for C stage
-    xo, xi = s[C].split(x, tile_x)
-    WX = min(warp_tile_m, tile_x)
-    tz, xi = s[C].split(xi, WX)
-    tx, xi = s[C].split(xi, TX)
-    s[C].reorder(yo, xo, tz, ty, tx, yi, xi)
-    s[C].bind(yo, te.thread_axis("blockIdx.y"))
-    s[C].bind(xo, te.thread_axis("blockIdx.x"))
-    s[C].bind(ty, te.thread_axis("threadIdx.y"))
-    s[C].bind(tz, te.thread_axis("threadIdx.z"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-
-    # schedule for CL stage
-    ko, ki = s[CL].split(k, step_k * warp_tile_k)
-    kl, ki = s[CL].split(ki, warp_tile_k)
-    s[CL].compute_at(s[C], tx)
-    yo, xo = CL.op.axis
-    s[CL].reorder(ko, kl, ki, yo, xo)
-
-    # schedule for AA stage
-    s[AA].compute_at(s[CL], ko)
-    xo, xi = s[AA].split(s[AA].op.axis[1], factor=bx * v)
-    tz, tx = s[AA].split(xi, factor=(WX // TX) * v)
-    tx, vec = s[AA].split(tx, factor=v)
-    fused = s[AA].fuse(s[AA].op.axis[0], xo)
-    _, ty = s[AA].split(fused, factor=by)
-    s[AA].bind(ty, te.thread_axis("threadIdx.y"))
-    s[AA].bind(tz, te.thread_axis("threadIdx.z"))
-    s[AA].bind(tx, te.thread_axis("threadIdx.x"))
-    # vectorization is very important for float16/int8 inputs
-    s[AA].vectorize(vec)
-
-    # schedule for BB stage
-    s[BB].compute_at(s[CL], ko)
-    xo, xi = s[BB].split(s[BB].op.axis[1], factor=bx * v)
-    tz, tx = s[BB].split(xi, factor=(WX // TX) * v)
-    tx, vec = s[BB].split(tx, factor=v)
-    fused = s[BB].fuse(s[BB].op.axis[0], xo)
-    _, ty = s[BB].split(fused, factor=by)
-    s[BB].bind(ty, te.thread_axis("threadIdx.y"))
-    s[BB].bind(tz, te.thread_axis("threadIdx.z"))
-    s[BB].bind(tx, te.thread_axis("threadIdx.x"))
-    s[BB].vectorize(vec)
-
-    s[AL].compute_at(s[CL], kl)
-    s[BL].compute_at(s[CL], kl)
-
-    # set the 'tensor_core' pragma for tensorcore codegen
-    s[CL].pragma(ko, "tensor_core")
-
-    return s, [A, B, C]
-
-
-###############################################################################
-# AutoTune and Test
-# -----------------
-# Finally, we use a tuner to tune the schedule, generate code with the best config,
-# and run the kernel, comparing its output against numpy to check that the results are correct.
-
-# check whether the gpu has tensorcore
-if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-    raise Exception("skip building this tutorial because cuda is not enabled..")
-
-dev = tvm.gpu()
-if not nvcc.have_tensorcore(dev.compute_version):
-    raise Exception("the gpu has no tensorcore, skipping...")
-
-M, N, L = 512, 32, 512
-dtype = "float16"
-layout = "NN"
-if len(sys.argv) >= 4:
-    M, N, L = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3])
-if len(sys.argv) >= 5:
-    dtype = sys.argv[4]
-if len(sys.argv) >= 6:
-    layout = sys.argv[5]
-
-# check whether the current gpu arch supports wmma codegen for the current dtype
-cuda_compute_capability = tvm.runtime._ffi_api.GetDeviceAttr(2, 0, 4)
-major, minor = nvcc.parse_compute_version(cuda_compute_capability)
-if dtype == "int8":
-    assert major == 7 and minor >= 2
-elif dtype == "int4" or dtype == "int1":
-    # int4/int1 only support layout TN
-    assert major == 7 and minor == 5 and layout == "TN"
-
-
-def tune_and_evaluate(M, N, L, dtype, layout):
-    task = autotvm.task.create(
-        "tutorial/auto_tensorcore/test_gemm", args=(N, L, M, dtype, layout), target="cuda"
-    )
-    print(task.config_space)
-
-    logging.getLogger("autotvm").setLevel(logging.DEBUG)
-    logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout))
-
-    measure_option = autotvm.measure_option(builder="local", runner=autotvm.LocalRunner(number=5))
-
-    tuner = autotvm.tuner.XGBTuner(task)
-    tuner.tune(
-        n_trial=1000,
-        measure_option=measure_option,
-        callbacks=[autotvm.callback.log_to_file("matmul.log")],
-    )
-
-    dispatch_context = autotvm.apply_history_best("matmul.log")
-    best_config = dispatch_context.query(task.target, task.workload)
-    print("\nBest config:")
-    print(best_config)
-    with autotvm.apply_history_best("matmul.log"):
-        with tvm.target.Target("cuda"):
-            s, arg_bufs = test_gemm(N, L, M, dtype, layout)
-            print(tvm.lower(s, arg_bufs, simple_mode=True))
-            func = tvm.build(s, arg_bufs)
-    dev_module = func.imported_modules[0]
-    print(dev_module.get_source())
-
-    # check correctness
-    if layout == "NN":
-        shape_a = (N, L)
-        shape_b = (L, M)
-    elif layout == "NT":
-        shape_a = (L, N)
-        shape_b = (L, M)
-    elif layout == "TN":
-        shape_a = (N, L)
-        shape_b = (M, L)
-    elif layout == "TT":
-        shape_a = (L, N)
-        shape_b = (M, L)
-
-    a_np = None
-    b_np = None
-    c_np = None
-    c_np_type = None
-    if dtype == "float16":
-        c_np_type = np.float32
-        a_np = np.random.uniform(size=shape_a).astype(np.float16)
-        b_np = np.random.uniform(size=shape_b).astype(np.float16)
-        if layout == "NN":
-            c_np = np.dot(a_np, b_np)
-        elif layout == "NT":
-            c_np = np.dot(a_np.T, b_np)
-        elif layout == "TN":
-            c_np = np.dot(a_np, b_np.T)
-        elif layout == "TT":
-            c_np = np.dot(a_np.T, b_np.T)
-    elif dtype == "int8":
-        c_np_type = np.int32
-        a_np = np.random.randint(low=-128, high=127, size=shape_a).astype(np.int8)
-        b_np = np.random.randint(low=-128, high=127, size=shape_b).astype(np.int8)
-        if layout == "NN":
-            c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32))
-        elif layout == "NT":
-            c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32))
-        elif layout == "TN":
-            c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32).T)
-        elif layout == "TT":
-            c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32).T)
-    elif dtype == "int4":
-        c_np_type = np.int32
-        a_np_int = np.random.randint(low=-8, high=7, size=shape_a).astype(np.int32)
-        b_np_int = np.random.randint(low=-8, high=7, size=shape_b).astype(np.int32)
-        # "TN"
-        c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T)
-        a_np = np.zeros(shape=(N, int(L / 8)), dtype=np.int32)
-        b_np = np.zeros(shape=(M, int(L / 8)), dtype=np.int32)
-        # a_np --> col_major
-        for i in range(N):
-            for j in range(int(L / 8)):
-                for k in range(8):
-                    a_np[i, j] = a_np[i, j] | ((a_np_int[i, j * 8 + k] & 0xF) << ((7 - k) * 4))
-
-        # b_np --> row_major
-        for i in range(M):
-            for j in range(int(L / 8)):
-                for k in range(8):
-                    b_np[i, j] = b_np[i, j] | ((b_np_int[i, j * 8 + k] & 0xF) << ((7 - k) * 4))
-    elif dtype == "int1":
-        c_np_type = np.int32
-        a_np_int = np.random.randint(low=0, high=1, size=shape_a).astype(np.int32)
-        b_np_int = np.random.randint(low=0, high=1, size=shape_b).astype(np.int32)
-        # "TN"
-        c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T)
-        a_np = np.zeros(shape=(N, int(L / 32)), dtype=np.int32)
-        b_np = np.zeros(shape=(M, int(L / 32)), dtype=np.int32)
-        for i in range(N):
-            for j in range(int(L / 32)):
-                for k in range(32):
-                    a_np[i, j] = a_np[i, j] | ((a_np_int[i, j * 32 + k] & 0xF) << (31 - k))
-
-        for i in range(M):
-            for j in range(int(L / 32)):
-                for k in range(32):
-                    b_np[i, j] = b_np[i, j] | ((b_np_int[i, j * 32 + k] & 0xF) << (31 - k))
-
-    c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np_type), device=dev)
-    a_tvm = tvm.nd.array(a_np, device=dev)
-    b_tvm = tvm.nd.array(b_np, device=dev)
-    func(a_tvm, b_tvm, c_tvm)
-
-    tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-3)
-
-    evaluator = func.time_evaluator(func.entry_name, dev, number=100)
-    print("Time cost of this operator: %f" % evaluator(a_tvm, b_tvm, c_tvm).mean)
-
-
-# We do not run the tuning on our web page server since it takes some time.
-# Uncomment the following line to run it yourself.
-
-# tune_and_evaluate(M, N, L, dtype, layout)
-
-######################################################################
-# Sample Output
-# -------------
-# .. code-block:: bash
-#
-#    Best config:
-#    [('bx', 4), ('by', 32), ('step_k', 16), ('v', 8)],,None,40
-#    Finish loading 162 records
-#    produce compute {
-#      // attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = 1
-#      // attr [compute.local] storage_scope = "wmma.accumulator"
-#      allocate compute.local[float32 * 256]
-#      // attr [A.shared] storage_scope = "shared"
-#      allocate A.shared[float16 * 8448]
-#      // attr [B.shared] storage_scope = "shared"
-#      allocate B.shared[float16 * 8192]
-#      // attr [A.shared.local] storage_scope = "wmma.matrix_b"
-#      allocate A.shared.local[float16 * 256]
-#      // attr [B.shared.local] storage_scope = "wmma.matrix_a"
-#      allocate B.shared.local[float16 * 256]
-#      // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 16
-#      // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 2
-#      // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 32
-#      // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 2
-#      produce compute.local {
-#        for (j.c.init, 0, 1) {
-#          tvm_fill_fragment(compute.local, 16, 16, 16, 0, 0f)
-#        }
-#        // attr [iter_var(k.outer, )] pragma_tensor_core = 1
-#        for (k.outer, 0, 2) {
-#          produce A.shared {
-#            for (ax0.ax1.outer.fused.outer, 0, 8) {
-#              // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 32
-#              // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 2
-#              // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 2
-#              A.shared[ramp((((((ax0.ax1.outer.fused.outer*1056) + (floordiv(threadIdx.y, 8)*264)) + (floormod(threadIdx.y, 8)*32)) + (threadIdx.z*16)) + (threadIdx.x*8)), 1, 8)] = A[ramp(((((((ax0.ax1.outer.fused.outer*2048) + (floordiv(threadIdx.y, 8)*512)) + (k.outer*256)) + (floormod(threadIdx.y, 8)*32)) + (threadIdx.z*16)) + (threadIdx.x*8)), 1, 8)]
-#            }
-#          }
-#          produce B.shared {
-#            for (ax0.ax1.outer.fused.outer, 0, 8) {
-#              // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 32
-#              // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 2
-#              // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 2
-#              B.shared[ramp(((((ax0.ax1.outer.fused.outer*1024) + (threadIdx.y*32)) + (threadIdx.z*16)) + (threadIdx.x*8)), 1, 8)] = B[ramp(((((((k.outer*131072) + (ax0.ax1.outer.fused.outer*16384)) + (threadIdx.y*512)) + (blockIdx.x*32)) + (threadIdx.z*16)) + (threadIdx.x*8)), 1, 8)]
-#            }
-#          }
-#          for (k.inner.outer, 0, 16) {
-#            produce A.shared.local {
-#              for (ax1, 0, 1) {
-#                tvm_load_matrix_sync(A.shared.local, 16, 16, 16, 0, &(A.shared[(((threadIdx.y/16)*4224) + (k.inner.outer*16))]), 264, "col_major")
-#              }
-#            }
-#            produce B.shared.local {
-#              for (ax0, 0, 1) {
-#                for (ax1, 0, 1) {
-#                  tvm_load_matrix_sync(B.shared.local, 16, 16, 16, 0, &(B.shared[((k.inner.outer*512) + (threadIdx.z*16))]), 32, "col_major")
-#                }
-#              }
-#            }
-#            for (k.inner.inner, 0, 1) {
-#              for (j.c, 0, 1) {
-#                tvm_mma_sync(compute.local, 0, B.shared.local, 0, A.shared.local, 0, compute.local, 0)
-#              }
-#            }
-#          }
-#        }
-#      }
-#      for (j.inner.inner.inner, 0, 1) {
-#        tvm_store_matrix_sync(compute.local, 16, 16, 16, 0, &(compute[((((threadIdx.y/16)*8192) + (blockIdx.x*32)) + (threadIdx.z*16))]), 512, "col_major")
-#      }
-#    }
-#
-#    #include <cuda_fp16.h>
-#    __device__ half max(const half a, const half b)
-#    {
-#      return __hgt(__half(a), __half(b)) ? a : b;
-#    }
-#    __device__ half min(const half a, const half b)
-#    {
-#      return __hlt(__half(a), __half(b)) ? a : b;
-#    }
-#    __device__ half operator+(const volatile __half &a,  const volatile __half &b)
-#    {
-#      return __hadd(a, b);
-#    }
-#    __device__ half operator<=(const volatile __half &a,  const volatile __half &b)
-#    {
-#      return __hlt(a, b);
-#    }
-#    __device__ half operator*(const volatile __half &a,  const volatile __half &b)
-#    {
-#      return __hmul(a, b);
-#    }
-#    #include <mma.h>
-#    extern "C" __global__ void default_function_kernel0( half* __restrict__ A,  half* __restrict__ B,  float* __restrict__ compute) {
-#      nvcuda::wmma::fragment<nvcuda::wmma::accumulator, 16, 16, 16, float> compute_local[1];
-#      __shared__ half A_shared[8448];
-#      __shared__ half B_shared[8192];
-#      nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, 16, 16, 16, half, nvcuda::wmma::col_major> A_shared_local[1];
-#      nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, 16, 16, 16, half, nvcuda::wmma::col_major> B_shared_local[1];
-#      for (int j_c_init = 0; j_c_init < 1; ++j_c_init) {
-#        (void)nvcuda::wmma::fill_fragment(compute_local[0], 0.000000e+00f);
-#      }
-#      for (int k_outer = 0; k_outer < 2; ++k_outer) {
-#        __syncthreads();
-#        for (int ax0_ax1_outer_fused_outer = 0; ax0_ax1_outer_fused_outer < 8; ++ax0_ax1_outer_fused_outer) {
-#          ((__shared__ float4*)(A_shared + (((((ax0_ax1_outer_fused_outer * 1056) + ((((int)threadIdx.y) >> 3) * 264)) + ((((int)threadIdx.y) & 7) * 32)) + (((int)threadIdx.z) * 16)) + (((int)threadIdx.x) * 8))))[0] = (( float4*)(A + ((((((ax0_ax1_outer_fused_outer * 2048) + ((((int)threadIdx.y) >> 3) * 512)) + (k_outer * 256)) + ((((int)threadIdx.y) & 7) * 32)) + (((int)threadIdx.z) * 16)) + (((int)threadIdx.x) * 8))))[0];
-#        }
-#        for (int ax0_ax1_outer_fused_outer1 = 0; ax0_ax1_outer_fused_outer1 < 8; ++ax0_ax1_outer_fused_outer1) {
-#          ((__shared__ float4*)(B_shared + ((((ax0_ax1_outer_fused_outer1 * 1024) + (((int)threadIdx.y) * 32)) + (((int)threadIdx.z) * 16)) + (((int)threadIdx.x) * 8))))[0] = (( float4*)(B + ((((((k_outer * 131072) + (ax0_ax1_outer_fused_outer1 * 16384)) + (((int)threadIdx.y) * 512)) + (((int)blockIdx.x) * 32)) + (((int)threadIdx.z) * 16)) + (((int)threadIdx.x) * 8))))[0];
-#        }
-#        __syncthreads();
-#        for (int k_inner_outer = 0; k_inner_outer < 16; ++k_inner_outer) {
-#          for (int ax1 = 0; ax1 < 1; ++ax1) {
-#            (void)nvcuda::wmma::load_matrix_sync(A_shared_local[0], &(A_shared[(((((int)threadIdx.y) / 16) * 4224) + (k_inner_outer * 16))]), 264);
-#          }
-#          for (int ax0 = 0; ax0 < 1; ++ax0) {
-#            for (int ax11 = 0; ax11 < 1; ++ax11) {
-#              (void)nvcuda::wmma::load_matrix_sync(B_shared_local[0], &(B_shared[((k_inner_outer * 512) + (((int)threadIdx.z) * 16))]), 32);
-#            }
-#          }
-#          for (int k_inner_inner = 0; k_inner_inner < 1; ++k_inner_inner) {
-#            for (int j_c = 0; j_c < 1; ++j_c) {
-#              (void)nvcuda::wmma::mma_sync(compute_local[0], B_shared_local[0], A_shared_local[0], compute_local[0]);
-#            }
-#          }
-#        }
-#      }
-#      for (int j_inner_inner_inner = 0; j_inner_inner_inner < 1; ++j_inner_inner_inner) {
-#        (void)nvcuda::wmma::store_matrix_sync(&(compute[((((((int)threadIdx.y) / 16) * 8192) + (((int)blockIdx.x) * 32)) + (((int)threadIdx.z) * 16))]), compute_local[0], 512, nvcuda::wmma::mem_col_major);
-#      }
-#    }
-#
-#
-#    Time cost of this operator: 0.000008
-
-###############################################################################
-# Summary
-# -------
-# This tutorial demonstrates how to use TVM's Auto TensorCore CodeGen
-# to generate tensorcore kernels.
diff --git a/docs/_downloads/272a5a893d007658546dc0eaf0a7aeed/tune_relay_cuda.py b/docs/_downloads/272a5a893d007658546dc0eaf0a7aeed/tune_relay_cuda.py
index 50485c4..65991cc 100644
--- a/docs/_downloads/272a5a893d007658546dc0eaf0a7aeed/tune_relay_cuda.py
+++ b/docs/_downloads/272a5a893d007658546dc0eaf0a7aeed/tune_relay_cuda.py
@@ -345,13 +345,13 @@ def tune_and_evaluate(tuning_opt):
 #
 # .. code-block:: bash
 #
-#     python -m tvm.exec.rpc_server --tracker=0.0.0.0:9190 --key=1080ti
+#     python -m tvm.exec.rpc_server --tracker=127.0.0.1:9190 --key=1080ti
 #
 # After registering devices, we can confirm it by querying rpc_tracker
 #
 # .. code-block:: bash
 #
-#   python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190
+#   python -m tvm.exec.query_rpc_tracker --host=127.0.0.1 --port=9190
 #
 # For example, if we have four 1080ti, two titanx and one gfx900, the output can be
 #
@@ -378,7 +378,7 @@ tuning_option = {
         builder=autotvm.LocalBuilder(timeout=10),
         runner=autotvm.RPCRunner(
             "1080ti",  # change the device key to your key
-            "0.0.0.0",
+            "127.0.0.1",
             9190,
             number=20,
             repeat=3,
diff --git a/docs/_downloads/2c0ed53a9ebd68caf76cd8235fae2711/tune_relay_mobile_gpu.ipynb b/docs/_downloads/2c0ed53a9ebd68caf76cd8235fae2711/tune_relay_mobile_gpu.ipynb
index 369ce55..39d048a 100644
--- a/docs/_downloads/2c0ed53a9ebd68caf76cd8235fae2711/tune_relay_mobile_gpu.ipynb
+++ b/docs/_downloads/2c0ed53a9ebd68caf76cd8235fae2711/tune_relay_mobile_gpu.ipynb
@@ -90,7 +90,7 @@
       },
       "outputs": [],
       "source": [
-        "#### DEVICE CONFIG ####\n# Replace \"aarch64-linux-gnu\" with the correct target of your board.\n# This target host is used for cross compilation. You can query it by :code:`gcc -v` on your device.\ntarget = tvm.target.Target(\"opencl -device=mali\", host=\"llvm -mtriple=aarch64-linux-gnu\")\n\n# Also replace this with the device key in your tracker\ndevice_key = \"rk3399\"\n\n# Set this to True if you use android phone\nuse_android = False\n\n#### TUNING OPTION ####\nnetwork =  [...]
+        "#### DEVICE CONFIG ####\n# Replace \"aarch64-linux-gnu\" with the correct target of your board.\n# This target host is used for cross compilation. You can query it by :code:`gcc -v` on your device.\ntarget = tvm.target.Target(\"opencl -device=mali\", host=\"llvm -mtriple=aarch64-linux-gnu\")\n\n# Also replace this with the device key in your tracker\ndevice_key = \"rk3399\"\n\n# Set this to True if you use android phone\nuse_android = False\n\n#### TUNING OPTION ####\nnetwork =  [...]
       ]
     },
     {
@@ -133,7 +133,7 @@
       },
       "outputs": [],
       "source": [
-        "def tune_and_evaluate(tuning_opt):\n    # extract workloads from relay program\n    print(\"Extract tasks...\")\n    mod, params, input_shape, _ = get_network(network, batch_size=1)\n    tasks = autotvm.task.extract_from_program(\n        mod[\"main\"],\n        target=target,\n        params=params,\n        ops=(relay.op.get(\"nn.conv2d\"),),\n    )\n\n    # run tuning tasks\n    print(\"Tuning...\")\n    tune_tasks(tasks, **tuning_opt)\n\n    # compile kernels with history be [...]
+        "def tune_and_evaluate(tuning_opt):\n    # extract workloads from relay program\n    print(\"Extract tasks...\")\n    mod, params, input_shape, _ = get_network(network, batch_size=1)\n    tasks = autotvm.task.extract_from_program(\n        mod[\"main\"],\n        target=target,\n        params=params,\n        ops=(relay.op.get(\"nn.conv2d\"),),\n    )\n\n    # run tuning tasks\n    print(\"Tuning...\")\n    tune_tasks(tasks, **tuning_opt)\n\n    # compile kernels with history be [...]
       ]
     },
     {
diff --git a/docs/_downloads/33a19782c8aaf9fc62e565c57df5caca/deploy_sparse.py b/docs/_downloads/33a19782c8aaf9fc62e565c57df5caca/deploy_sparse.py
index 1fcb1b3..92f4511 100644
--- a/docs/_downloads/33a19782c8aaf9fc62e565c57df5caca/deploy_sparse.py
+++ b/docs/_downloads/33a19782c8aaf9fc62e565c57df5caca/deploy_sparse.py
@@ -198,7 +198,7 @@ def import_graphdef(
             with open(os.path.join(abs_path, relay_params), "wb") as fo:
                 fo.write(runtime.save_param_dict(params))
 
-    return mod, params, shape_dict
+    return mod, dict(params.items()), shape_dict
 
 
 ###############################################################################
diff --git a/docs/_downloads/38606228ff7130fbd6473b7c0625ddcd/deploy_model_on_android.ipynb b/docs/_downloads/38606228ff7130fbd6473b7c0625ddcd/deploy_model_on_android.ipynb
index e40b1da..e1fb0a2 100644
--- a/docs/_downloads/38606228ff7130fbd6473b7c0625ddcd/deploy_model_on_android.ipynb
+++ b/docs/_downloads/38606228ff7130fbd6473b7c0625ddcd/deploy_model_on_android.ipynb
@@ -137,7 +137,7 @@
       },
       "outputs": [],
       "source": [
-        "tracker_host = os.environ.get(\"TVM_TRACKER_HOST\", \"0.0.0.0\")\ntracker_port = int(os.environ.get(\"TVM_TRACKER_PORT\", 9190))\nkey = \"android\"\n\nif local_demo:\n    remote = rpc.LocalSession()\nelse:\n    tracker = rpc.connect_tracker(tracker_host, tracker_port)\n    # When running a heavy model, we should increase the `session_timeout`\n    remote = tracker.request(key, priority=0, session_timeout=60)\n\nif local_demo:\n    dev = remote.cpu(0)\nelif test_target == \"openc [...]
+        "tracker_host = os.environ.get(\"TVM_TRACKER_HOST\", \"127.0.0.1\")\ntracker_port = int(os.environ.get(\"TVM_TRACKER_PORT\", 9190))\nkey = \"android\"\n\nif local_demo:\n    remote = rpc.LocalSession()\nelse:\n    tracker = rpc.connect_tracker(tracker_host, tracker_port)\n    # When running a heavy model, we should increase the `session_timeout`\n    remote = tracker.request(key, priority=0, session_timeout=60)\n\nif local_demo:\n    dev = remote.cpu(0)\nelif test_target == \"ope [...]
       ]
     },
     {
diff --git a/docs/_downloads/3961fdfa7abff1b6dc996faa43b4c40f/deploy_model_on_android.py b/docs/_downloads/3961fdfa7abff1b6dc996faa43b4c40f/deploy_model_on_android.py
index 158280f..864e813 100644
--- a/docs/_downloads/3961fdfa7abff1b6dc996faa43b4c40f/deploy_model_on_android.py
+++ b/docs/_downloads/3961fdfa7abff1b6dc996faa43b4c40f/deploy_model_on_android.py
@@ -289,7 +289,7 @@ lib.export_library(lib_fname, fcompile)
 # With RPC, you can deploy the model remotely from your host machine
 # to the remote android device.
 
-tracker_host = os.environ.get("TVM_TRACKER_HOST", "0.0.0.0")
+tracker_host = os.environ.get("TVM_TRACKER_HOST", "127.0.0.1")
 tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))
 key = "android"
 
diff --git a/docs/_downloads/48bd751ebaae08fce134e559f86a25cc/tune_relay_vta.ipynb b/docs/_downloads/48bd751ebaae08fce134e559f86a25cc/tune_relay_vta.ipynb
index f5c8ee8..9b9e62c 100644
--- a/docs/_downloads/48bd751ebaae08fce134e559f86a25cc/tune_relay_vta.ipynb
+++ b/docs/_downloads/48bd751ebaae08fce134e559f86a25cc/tune_relay_vta.ipynb
@@ -83,7 +83,7 @@
       },
       "outputs": [],
       "source": [
-        "# Tracker host and port can be set by your environment\ntracker_host = os.environ.get(\"TVM_TRACKER_HOST\", \"0.0.0.0\")\ntracker_port = int(os.environ.get(\"TVM_TRACKER_PORT\", 9190))\n\n# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file\nenv = vta.get_env()\n\n# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.\n# Set ``device=arm_cpu`` to run inference on the CPU\n# or ``device=vta`` to run inference on the F [...]
+        "# Tracker host and port can be set by your environment\ntracker_host = os.environ.get(\"TVM_TRACKER_HOST\", \"127.0.0.1\")\ntracker_port = int(os.environ.get(\"TVM_TRACKER_PORT\", 9190))\n\n# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file\nenv = vta.get_env()\n\n# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.\n# Set ``device=arm_cpu`` to run inference on the CPU\n# or ``device=vta`` to run inference on the [...]
       ]
     },
     {
diff --git a/docs/_downloads/612f9e42b0247df5c8ab277534e2af65/tune_relay_vta.py b/docs/_downloads/612f9e42b0247df5c8ab277534e2af65/tune_relay_vta.py
index 7deb740..38633b0 100644
--- a/docs/_downloads/612f9e42b0247df5c8ab277534e2af65/tune_relay_vta.py
+++ b/docs/_downloads/612f9e42b0247df5c8ab277534e2af65/tune_relay_vta.py
@@ -180,7 +180,7 @@ def compile_network(env, target, model, start_pack, stop_pack):
 # Here we use a Pynq-Z1 board as an example.
 
 # Tracker host and port can be set by your environment
-tracker_host = os.environ.get("TVM_TRACKER_HOST", "0.0.0.0")
+tracker_host = os.environ.get("TVM_TRACKER_HOST", "127.0.0.1")
 tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))
 
 # Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file
diff --git a/docs/_downloads/739deb9ab034a5315ce6ba6bf7e5ff44/tune_relay_cuda.ipynb b/docs/_downloads/739deb9ab034a5315ce6ba6bf7e5ff44/tune_relay_cuda.ipynb
index 924fe57..13ce133 100644
--- a/docs/_downloads/739deb9ab034a5315ce6ba6bf7e5ff44/tune_relay_cuda.ipynb
+++ b/docs/_downloads/739deb9ab034a5315ce6ba6bf7e5ff44/tune_relay_cuda.ipynb
@@ -140,7 +140,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Scale up measurement by using multiple devices\n----------------------------------------------\nIf you have multiple devices, you can use all of them for measurement.\nTVM uses the RPC Tracker to manage distributed devices.\nThe RPC Tracker is a centralized controller node. We can register all devices to\nthe tracker. For example, if we have 10 GPU cards, we can register all of them\nto the tracker, and run 10 measurements in parallel, accelerating the tuning process.\n\nTo star [...]
+        "Scale up measurement by using multiple devices\n----------------------------------------------\nIf you have multiple devices, you can use all of them for measurement.\nTVM uses the RPC Tracker to manage distributed devices.\nThe RPC Tracker is a centralized controller node. We can register all devices to\nthe tracker. For example, if we have 10 GPU cards, we can register all of them\nto the tracker, and run 10 measurements in parallel, accelerating the tuning process.\n\nTo star [...]
       ]
     },
     {
@@ -151,7 +151,7 @@
       },
       "outputs": [],
       "source": [
-        "tuning_option = {\n    \"log_filename\": log_file,\n    \"tuner\": \"xgb\",\n    \"n_trial\": 2000,\n    \"early_stopping\": 600,\n    \"measure_option\": autotvm.measure_option(\n        builder=autotvm.LocalBuilder(timeout=10),\n        runner=autotvm.RPCRunner(\n            \"1080ti\",  # change the device key to your key\n            \"0.0.0.0\",\n            9190,\n            number=20,\n            repeat=3,\n            timeout=4,\n            min_repeat_ms=150,\n        [...]
+        "tuning_option = {\n    \"log_filename\": log_file,\n    \"tuner\": \"xgb\",\n    \"n_trial\": 2000,\n    \"early_stopping\": 600,\n    \"measure_option\": autotvm.measure_option(\n        builder=autotvm.LocalBuilder(timeout=10),\n        runner=autotvm.RPCRunner(\n            \"1080ti\",  # change the device key to your key\n            \"127.0.0.1\",\n            9190,\n            number=20,\n            repeat=3,\n            timeout=4,\n            min_repeat_ms=150,\n      [...]
       ]
     }
   ],
diff --git a/docs/_downloads/78bebde8ea0f8558ac1a6fe12999f99f/tune_network_mali.py b/docs/_downloads/78bebde8ea0f8558ac1a6fe12999f99f/tune_network_mali.py
index 35751fa..8275f96 100644
--- a/docs/_downloads/78bebde8ea0f8558ac1a6fe12999f99f/tune_network_mali.py
+++ b/docs/_downloads/78bebde8ea0f8558ac1a6fe12999f99f/tune_network_mali.py
@@ -23,7 +23,7 @@ Auto-tuning for specific devices and workloads is critical for getting the
 best performance. This is a tutorial on how to tune a whole neural
 network for mali GPU with the auto-scheduler.
 
-To auto-tune a neural network, we partition the network into small subgraphs and 
+To auto-tune a neural network, we partition the network into small subgraphs and
 tune them independently. Each subgraph is treated as one search task.
 A task scheduler slices the time and dynamically allocates time resources to
 these tasks. The task scheduler predicts the impact of each task on the end-to-end
@@ -180,11 +180,11 @@ for idx, task in enumerate(tasks):
 #   .. code-block:: python
 #
 #     from tvm.auto_scheduler.utils import request_remote
-#     remote = request_remote(device_key, "0.0.0.0", 9190)
+#     remote = request_remote(device_key, "127.0.0.1", 9190)
 #     dev = remote.cl()
 #     max_shared_memory_per_block = dev.max_shared_memory_per_block
 #     # There is no explicit local memory limitation
-#     # so we can use INT32_MAX to disalbe the check on local_memory.
+#     # so we can use INT32_MAX to disable the check on local_memory.
 #     max_local_memory_per_block = 2147483647 # INT32_MAX
 #     max_threads_per_block = dev.max_threads_per_block
 #     max_vthread_extent = int(dev.warp_size / 4) if int(dev.warp_size / 4) > 1 else dev.warp_size
@@ -228,7 +228,7 @@ def tune_and_evaluate():
         num_measure_trials=200,  # change this to 20000 to achieve the best performance
         builder=auto_scheduler.LocalBuilder(build_func="ndk" if use_ndk else "default"),
         runner=auto_scheduler.RPCRunner(
-            device_key, host="0.0.0.0", port=9190, repeat=3, timeout=50
+            device_key, host="127.0.0.1", port=9190, repeat=3, timeout=50
         ),
         measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
     )
@@ -247,7 +247,7 @@ def tune_and_evaluate():
     print("=============== Request Remote ===============")
     from tvm.auto_scheduler.utils import request_remote
 
-    remote = request_remote(device_key, "0.0.0.0", 9190)
+    remote = request_remote(device_key, "127.0.0.1", 9190)
     dev = remote.cl()
     from tvm.contrib import utils, ndk
 
diff --git a/docs/_downloads/870680567a5bf1e4697356b416e302b4/opt_matmul_auto_tensorcore.ipynb b/docs/_downloads/870680567a5bf1e4697356b416e302b4/opt_matmul_auto_tensorcore.ipynb
deleted file mode 100644
index 227b021..0000000
--- a/docs/_downloads/870680567a5bf1e4697356b416e302b4/opt_matmul_auto_tensorcore.ipynb
+++ /dev/null
@@ -1,111 +0,0 @@
-{
-  "cells": [
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "%matplotlib inline"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "\n\nHow to optimize matmul with Auto TensorCore CodeGen\n===================================================\n**Author**: `Minmin Sun <https://github.com/minminsun>`_,             `Lanbo Li <https://github.com/Orion34C>`_,             `Chenfan Jia <https://github.com/jcf94>`_,             `Jun Yang <https://github.com/yangjunpro>`_\n\nIn this tutorial, we will demonstrate how to write a high performance matmul\nschedule on Volta/Turing GPUs with TVM Auto TensorCore CodeGen.\nThi [...]
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "Preparation and Algorithm\n-------------------------\n2 kinds of input data types are supported: float16 and int8.\nFor float16, the accumulator is float32.\nFor int8, the accumulator is int32.\nFor data layouts, 'N' means None-transpose while 'T' means Transpose.\n\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "import logging\nimport sys\n\nimport numpy as np\nimport tvm\nfrom tvm import te\n\nfrom tvm import autotvm\nfrom tvm.contrib import nvcc\nimport tvm.testing\n\n\ndef matmul_nn(A, B, L, dtype=\"float16\", layout=\"NN\"):\n    k = te.reduce_axis((0, L), name=\"k\")\n    if dtype == \"float16\":\n        out_type = \"float\"\n    elif dtype == \"int8\":\n        out_type = \"int\"\n    elif dtype == \"int4\" or dtype == \"int1\":\n        out_type = \"int\"\n    if layout == \"NN\ [...]
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "Scheduling the Computation\n--------------------------\nThis schedule is no different than a non-tensorcore matmul schedule on GPU.\nPlease refer to `opt-gemm` tutorial for basics of optimizing matmul schedule.\nWhen the \"tensor_core\" pragma is set, the \"rewrite for tensorcore\" ir pass\nwill automatically transform the schedule for tensorcore codegen,\notherwise normal CUDA code, with lower performance but equal functionality, will be generated.\n\n<div class=\"alert alert-i [...]
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "@autotvm.template(\"tutorial/auto_tensorcore/test_gemm\")\ndef test_gemm(N, L, M, dtype, layout):\n    if layout == \"NN\":\n        shape_a = (N, L)\n        shape_b = (L, M)\n    elif layout == \"NT\":\n        shape_a = (L, N)\n        shape_b = (L, M)\n    elif layout == \"TN\":\n        shape_a = (N, L)\n        shape_b = (M, L)\n    elif layout == \"TT\":\n        shape_a = (L, N)\n        shape_b = (M, L)\n    else:\n        print(\"Unsupported layout:\", layout)\n        [...]
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "AutoTune and Test\n-----------------\nFinally we use a tuner to tune the schedule, generate code with best config\nand run the kernel to compare with numpy to check whether the results are correct.\n\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "# check whether the gpu has tensorcore\nif not tvm.gpu(0).exist or not tvm.runtime.enabled(\"cuda\"):\n    raise Exception(\"skip building this tutorial because cuda is not enabled..\")\n\ndev = tvm.gpu()\nif not nvcc.have_tensorcore(dev.compute_version):\n    raise Exception(\"the gpu has no tensorcore, skipping...\")\n\nM, N, L = 512, 32, 512\ndtype = \"float16\"\nlayout = \"NN\"\nif len(sys.argv) >= 4:\n    M, N, L = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3])\nif le [...]
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "Sample Output\n-------------\n.. code-block:: bash\n\n   Best config:\n   [('bx', 4), ('by', 32), ('step_k', 16), ('v', 8)],,None,40\n   Finish loading 162 records\n   produce compute {\n     // attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = 1\n     // attr [compute.local] storage_scope = \"wmma.accumulator\"\n     allocate compute.local[float32 * 256]\n     // attr [A.shared] storage_scope = \"shared\"\n     allocate A.shared[float16 * 8448]\n     // attr [B.shared]  [...]
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "Summary\n-------\nThis tutorial demonstrates how to use the AutoTensorCoreCodeGen of TVM\nto generate tensorcore kernels.\n\n"
-      ]
-    }
-  ],
-  "metadata": {
-    "kernelspec": {
-      "display_name": "Python 3",
-      "language": "python",
-      "name": "python3"
-    },
-    "language_info": {
-      "codemirror_mode": {
-        "name": "ipython",
-        "version": 3
-      },
-      "file_extension": ".py",
-      "mimetype": "text/x-python",
-      "name": "python",
-      "nbconvert_exporter": "python",
-      "pygments_lexer": "ipython3",
-      "version": "3.6.12"
-    }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
-}
\ No newline at end of file
diff --git a/docs/_downloads/87b9e8307245d848689e4cdc3e6fa9bf/deploy_sparse.ipynb b/docs/_downloads/87b9e8307245d848689e4cdc3e6fa9bf/deploy_sparse.ipynb
index 4f8ec52..bdc451b 100644
--- a/docs/_downloads/87b9e8307245d848689e4cdc3e6fa9bf/deploy_sparse.ipynb
+++ b/docs/_downloads/87b9e8307245d848689e4cdc3e6fa9bf/deploy_sparse.ipynb
@@ -87,7 +87,7 @@
       },
       "outputs": [],
       "source": [
-        "def import_graphdef(\n    name,\n    batch_size,\n    seq_len,\n    save_relay=True,\n    relay_file=\"model.json\",\n    relay_params=\"model.params\",\n):\n    abs_path = os.path.dirname(os.path.abspath(__file__))\n    shape_dict = {\"input_1\": (batch_size, seq_len)}\n    relay_file = (\"%s_%d_%d_%s\" % (name, batch_size, seq_len, relay_file)).replace(\"/\", \"_\")\n    relay_params = (\"%s_%d_%d_%s\" % (name, batch_size, seq_len, relay_params)).replace(\"/\", \"_\")\n    if  [...]
+        "def import_graphdef(\n    name,\n    batch_size,\n    seq_len,\n    save_relay=True,\n    relay_file=\"model.json\",\n    relay_params=\"model.params\",\n):\n    abs_path = os.path.dirname(os.path.abspath(__file__))\n    shape_dict = {\"input_1\": (batch_size, seq_len)}\n    relay_file = (\"%s_%d_%d_%s\" % (name, batch_size, seq_len, relay_file)).replace(\"/\", \"_\")\n    relay_params = (\"%s_%d_%d_%s\" % (name, batch_size, seq_len, relay_params)).replace(\"/\", \"_\")\n    if  [...]
       ]
     },
     {
diff --git a/docs/_downloads/b78890bb249aab574c50f16eb0be62a9/tune_network_arm.ipynb b/docs/_downloads/b78890bb249aab574c50f16eb0be62a9/tune_network_arm.ipynb
index 2f44b30..a448a02 100644
--- a/docs/_downloads/b78890bb249aab574c50f16eb0be62a9/tune_network_arm.ipynb
+++ b/docs/_downloads/b78890bb249aab574c50f16eb0be62a9/tune_network_arm.ipynb
@@ -15,7 +15,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\nAuto-scheduling a Neural Network for ARM CPU\n=============================================\n**Author**: `Thierry Moreau <https://github.com/tmoreau89>_`,             `Lianmin Zheng <https://github.com/merrymercy>_`,             `Chengfan Jia <https://github.com/jcf94/>`_\n\nAuto-tuning for specific devices and workloads is critical for getting the\nbest performance. This is a tutorial on how to tune a whole neural\nnetwork for ARM CPU with the auto-scheduler via RPC.\n\nTo au [...]
+        "\nAuto-scheduling a Neural Network for ARM CPU\n=============================================\n**Author**: `Thierry Moreau <https://github.com/tmoreau89>_`,             `Lianmin Zheng <https://github.com/merrymercy>_`,             `Chengfan Jia <https://github.com/jcf94/>`_\n\nAuto-tuning for specific devices and workloads is critical for getting the\nbest performance. This is a tutorial on how to tune a whole neural\nnetwork for ARM CPU with the auto-scheduler via RPC.\n\nTo au [...]
       ]
     },
     {
@@ -76,7 +76,7 @@
       },
       "outputs": [],
       "source": [
-        "#### DEVICE CONFIG ####\n\n# Replace \"aarch64-linux-gnu\" with the correct target of your board.\n# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.\n# FIXME(tmoreau89, merrymercy): We leave '-device=arm_cpu' out of the target string\n#                               because we're sharing x86 op strategy.\ntarget = tvm.target.Target(\"llvm -mtriple=aarch64-linux-gnu -mattr=+neon\")\n\n# Also replace this with the device key, rpc host  [...]
+        "#### DEVICE CONFIG ####\n\n# Replace \"aarch64-linux-gnu\" with the correct target of your board.\n# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.\n# FIXME(tmoreau89, merrymercy): We leave '-device=arm_cpu' out of the target string\n#                               because we're sharing x86 op strategy.\ntarget = tvm.target.Target(\"llvm -mtriple=aarch64-linux-gnu -mattr=+neon\")\n\n# Also replace this with the device key, rpc host  [...]
       ]
     },
     {
diff --git a/docs/_downloads/baf1373314e0e040008107ff2571b4cd/tune_relay_arm.py b/docs/_downloads/baf1373314e0e040008107ff2571b4cd/tune_relay_arm.py
index 9223eb3..68d263b 100644
--- a/docs/_downloads/baf1373314e0e040008107ff2571b4cd/tune_relay_arm.py
+++ b/docs/_downloads/baf1373314e0e040008107ff2571b4cd/tune_relay_arm.py
@@ -224,7 +224,7 @@ tuning_option = {
         builder=autotvm.LocalBuilder(build_func="ndk" if use_android else "default"),
         runner=autotvm.RPCRunner(
             device_key,
-            host="0.0.0.0",
+            host="127.0.0.1",
             port=9190,
             number=5,
             timeout=10,
@@ -343,7 +343,7 @@ def tune_and_evaluate(tuning_opt):
 
         # upload module to device
         print("Upload...")
-        remote = autotvm.measure.request_remote(device_key, "0.0.0.0", 9190, timeout=10000)
+        remote = autotvm.measure.request_remote(device_key, "127.0.0.1", 9190, timeout=10000)
         remote.upload(tmp.relpath(filename))
         rlib = remote.load_module(filename)
 
diff --git a/docs/_downloads/e41367a7f459e4f4dca82180009c1539/tune_relay_mobile_gpu.py b/docs/_downloads/e41367a7f459e4f4dca82180009c1539/tune_relay_mobile_gpu.py
index 2b10987..790c2ff 100644
--- a/docs/_downloads/e41367a7f459e4f4dca82180009c1539/tune_relay_mobile_gpu.py
+++ b/docs/_downloads/e41367a7f459e4f4dca82180009c1539/tune_relay_mobile_gpu.py
@@ -225,7 +225,7 @@ tuning_option = {
         builder=autotvm.LocalBuilder(build_func="ndk" if use_android else "default"),
         runner=autotvm.RPCRunner(
             device_key,
-            host="0.0.0.0",
+            host="127.0.0.1",
             port=9190,
             number=10,
             timeout=5,
@@ -340,7 +340,7 @@ def tune_and_evaluate(tuning_opt):
 
         # upload module to device
         print("Upload...")
-        remote = autotvm.measure.request_remote(device_key, "0.0.0.0", 9190, timeout=10000)
+        remote = autotvm.measure.request_remote(device_key, "127.0.0.1", 9190, timeout=10000)
         remote.upload(tmp.relpath(filename))
         rlib = remote.load_module(filename)
 
diff --git a/docs/_downloads/f8f7a2adf30f5033603d79cdbacd9235/tune_relay_arm.ipynb b/docs/_downloads/f8f7a2adf30f5033603d79cdbacd9235/tune_relay_arm.ipynb
index 16fac91..5020a05 100644
--- a/docs/_downloads/f8f7a2adf30f5033603d79cdbacd9235/tune_relay_arm.ipynb
+++ b/docs/_downloads/f8f7a2adf30f5033603d79cdbacd9235/tune_relay_arm.ipynb
@@ -83,7 +83,7 @@
       },
       "outputs": [],
       "source": [
-        "#### DEVICE CONFIG ####\n\n# Replace \"aarch64-linux-gnu\" with the correct target of your board.\n# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.\ntarget = tvm.target.Target(\"llvm -device=arm_cpu -mtriple=aarch64-linux-gnu\")\n\n# Also replace this with the device key in your tracker\ndevice_key = \"rk3399\"\n\n# Set this to True if you use android phone\nuse_android = False\n\n#### TUNING OPTION ####\nnetwork = \"resnet-18\"\nlo [...]
+        "#### DEVICE CONFIG ####\n\n# Replace \"aarch64-linux-gnu\" with the correct target of your board.\n# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.\ntarget = tvm.target.Target(\"llvm -device=arm_cpu -mtriple=aarch64-linux-gnu\")\n\n# Also replace this with the device key in your tracker\ndevice_key = \"rk3399\"\n\n# Set this to True if you use android phone\nuse_android = False\n\n#### TUNING OPTION ####\nnetwork = \"resnet-18\"\nlo [...]
       ]
     },
     {
@@ -126,7 +126,7 @@
       },
       "outputs": [],
       "source": [
-        "def tune_and_evaluate(tuning_opt):\n    # extract workloads from relay program\n    print(\"Extract tasks...\")\n    mod, params, input_shape, _ = get_network(network, batch_size=1)\n    tasks = autotvm.task.extract_from_program(\n        mod[\"main\"], target=target, params=params, ops=(relay.op.get(\"nn.conv2d\"),)\n    )\n\n    # run tuning tasks\n    print(\"Tuning...\")\n    tune_tasks(tasks, **tuning_opt)\n\n    # compile kernels with history best records\n    with autotvm [...]
+        "def tune_and_evaluate(tuning_opt):\n    # extract workloads from relay program\n    print(\"Extract tasks...\")\n    mod, params, input_shape, _ = get_network(network, batch_size=1)\n    tasks = autotvm.task.extract_from_program(\n        mod[\"main\"], target=target, params=params, ops=(relay.op.get(\"nn.conv2d\"),)\n    )\n\n    # run tuning tasks\n    print(\"Tuning...\")\n    tune_tasks(tasks, **tuning_opt)\n\n    # compile kernels with history best records\n    with autotvm [...]
       ]
     },
     {
diff --git a/docs/_downloads/fb88afbf9be39a834109b9b842f12fd0/tune_network_arm.py b/docs/_downloads/fb88afbf9be39a834109b9b842f12fd0/tune_network_arm.py
index 46d95c3..d6d8097 100644
--- a/docs/_downloads/fb88afbf9be39a834109b9b842f12fd0/tune_network_arm.py
+++ b/docs/_downloads/fb88afbf9be39a834109b9b842f12fd0/tune_network_arm.py
@@ -25,7 +25,7 @@ Auto-tuning for specific devices and workloads is critical for getting the
 best performance. This is a tutorial on how to tune a whole neural
 network for ARM CPU with the auto-scheduler via RPC.
 
-To auto-tune a neural network, we partition the network into small subgraphs and 
+To auto-tune a neural network, we partition the network into small subgraphs and
 tune them independently. Each subgraph is treated as one search task.
 A task scheduler slices the time and dynamically allocates time resources to
 these tasks. The task scheduler predicts the impact of each task on the end-to-end
@@ -234,7 +234,7 @@ target = tvm.target.Target("llvm -mtriple=aarch64-linux-gnu -mattr=+neon")
 
 # Also replace this with the device key, rpc host and rpc port in your tracker
 device_key = "rasp4b-64"
-rpc_host = "0.0.0.0"
+rpc_host = "127.0.0.1"
 rpc_port = 9191
 
 # Set this to True if you use ndk tools for cross compiling
diff --git a/docs/_images/sphx_glr_opt_matmul_auto_tensorcore_thumb.png b/docs/_images/sphx_glr_opt_matmul_auto_tensorcore_thumb.png
deleted file mode 100644
index 233f8e6..0000000
Binary files a/docs/_images/sphx_glr_opt_matmul_auto_tensorcore_thumb.png and /dev/null differ
diff --git a/docs/_sources/deploy/arm_compute_lib.rst.txt b/docs/_sources/deploy/arm_compute_lib.rst.txt
index 4e43682..1abc31b 100644
--- a/docs/_sources/deploy/arm_compute_lib.rst.txt
+++ b/docs/_sources/deploy/arm_compute_lib.rst.txt
@@ -178,7 +178,7 @@ An example configuration for `test_config.json`:
 
     {
       "connection_type": "local",
-      "host": "localhost",
+      "host": "127.0.0.1",
       "port": 9090,
       "target": "llvm -mtriple=aarch64-linux-gnu -mattr=+neon",
       "device_key": "",
diff --git a/docs/_sources/dev/index.rst.txt b/docs/_sources/dev/index.rst.txt
index c297d32..ed0f1a1 100644
--- a/docs/_sources/dev/index.rst.txt
+++ b/docs/_sources/dev/index.rst.txt
@@ -27,7 +27,7 @@ This page is organized as follows:
 - The `Logical Architecture Components`_ section describes the logical components.
   The sections after are specific guides focused on each logical component, organized
   by the component's name.
-- Feel free to also checkout the :ref:`dev-how-to` for useful development tips.
+- Feel free to also check out the :ref:`dev-how-to` for useful development tips.
 
 This guide provides a few complementary views of the architecture.
 First, we review a single end-to-end compilation flow and discuss the key data structures and the transformations.
@@ -42,7 +42,7 @@ In this guide, we will study an example compilation flow in the compiler. The fi
 
 - Import: The frontend component ingests a model into an IRModule, which contains a collection of functions that internally represent the model.
 - Transformation: The compiler transforms an IRModule to another functionally equivalent or approximately
-  equivalent(e.g. in the case of quantization) IRModule. Many of the transformatons are target (backend) independent.
+  equivalent(e.g. in the case of quantization) IRModule. Many of the transformations are target (backend) independent.
   We also allow target to affect the configuration of the transformation pipeline.
 - Target Translation: The compiler translates(codegen) the IRModule to an executable format specified by the target.
   The target translation result is encapsulated as a `runtime.Module` that can be exported, loaded, and executed on the target runtime environment.
@@ -103,7 +103,7 @@ Many low-level optimizations can be handled in the target phase by the LLVM, CUD
 Search-space and Learning-based Transformations
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-The transformation passes we described so far are deterministic and rule-based. One design goal of the TVM stack is to support high-performance code optimizations for different hardware platforms. To do so, we will need to investigate as many optimizations choices as possible, including but not limited to, multi-dimensional tensor access, loop tiling behavior, special accelerator memory hierarchy, and threading.
+The transformation passes we described so far are deterministic and rule-based. One design goal of the TVM stack is to support high-performance code optimizations for different hardware platforms. To do so, we will need to investigate as many optimization choices as possible, including but not limited to, multi-dimensional tensor access, loop tiling behavior, special accelerator memory hierarchy, and threading.
 
 It is hard to define a heuristic to make all of the choices. Instead, we will take a search and learning-based approach.
 We first define a collection of actions we can take to transform a program. Example actions include loop transformations, inlining,
diff --git a/docs/_sources/dev/pass_infra.rst.txt b/docs/_sources/dev/pass_infra.rst.txt
index 3680cb8..6bd4689 100644
--- a/docs/_sources/dev/pass_infra.rst.txt
+++ b/docs/_sources/dev/pass_infra.rst.txt
@@ -394,7 +394,7 @@ Python Frontend
 
 Only some simple APIs are needed for the frontend side. For example, we can
 provide users the following APIs to create and execute a pass (full
-implementation is provided in `python/tvm/relay/transform.py`_ and
+implementation is provided in `python/tvm/relay/transform/transform.py`_ and
 `python/tvm/ir/transform.py`_). The backend
 receives the information and decides which function it should use to create
 a Pass object.
@@ -460,7 +460,7 @@ users so that they can customize their own pass or pass pipeline.
 
 For all the passes that are implemented in the C++ backend, we provide
 corresponding Python APIs in `python/tvm/ir/transform.py`_ and
-`python/tvm/relay/transform.py`_, respectively. For instance,
+`python/tvm/relay/transform/transform.py`_, respectively. For instance,
 const folding has a Python API like the following:
 
 .. code:: python
@@ -538,7 +538,7 @@ optimization pipeline and debug Relay and tir passes, please refer to the
 
 .. _src/relay/pass/fold_constant.cc: https://github.com/apache/tvm/blob/main/src/relay/pass/fold_constant.cc
 
-.. _python/tvm/relay/transform.py: https://github.com/apache/tvm/blob/main/python/tvm/relay/transform.py
+.. _python/tvm/relay/transform/transform.py: https://github.com/apache/tvm/blob/main/python/tvm/relay/transform/transform.py
 
 .. _include/tvm/relay/transform.h: https://github.com/apache/tvm/blob/main/include/tvm/relay/transform.h
 
diff --git a/docs/_sources/dev/runtime.rst.txt b/docs/_sources/dev/runtime.rst.txt
index c044646..fc03ed8 100644
--- a/docs/_sources/dev/runtime.rst.txt
+++ b/docs/_sources/dev/runtime.rst.txt
@@ -138,11 +138,9 @@ This philosophy of embedded API is very like Lua, except that we don't have a ne
 
 One fun fact about PackedFunc is that we use it for both compiler and deployment stack.
 
-- All compiler pass functions of TVM are exposed to frontend as PackedFunc, see `here`_
+- All compiler pass functions of TVM are exposed to frontend as PackedFunc
 - The compiled module also returns the compiled function as PackedFunc
 
-.. _here: https://github.com/apache/tvm/tree/main/src/api
-
 To keep the runtime minimum, we isolated the IR Object support from the deployment runtime. The resulting runtime takes around 200K - 600K depending on how many runtime driver modules (e.g., CUDA) get included.
 
 The overhead of calling into PackedFunc vs. a normal function is small, as it is only saving a few values on the stack.
@@ -279,7 +277,7 @@ Each argument in PackedFunc contains a union value `TVMValue`_
 and a type code. This design allows the dynamically typed language to convert to the corresponding type directly, and statically typed language to
 do runtime type checking during conversion.
 
-.. _TVMValue: https://github.com/apache/tvm/blob/main/include/tvm/runtime/c_runtime_api.h#L122
+.. _TVMValue: https://github.com/apache/tvm/blob/main/include/tvm/runtime/c_runtime_api.h#L135
 
 The relevant files are
 
diff --git a/docs/_sources/install/from_source.rst.txt b/docs/_sources/install/from_source.rst.txt
index c1e455b..bc6cdb9 100644
--- a/docs/_sources/install/from_source.rst.txt
+++ b/docs/_sources/install/from_source.rst.txt
@@ -89,6 +89,11 @@ The configuration of TVM can be modified by `config.cmake`.
   - Change ``set(USE_CUDA OFF)`` to ``set(USE_CUDA ON)`` to enable CUDA backend. Do the same for other backends and libraries
     you want to build for (OpenCL, RCOM, METAL, VULKAN, ...).
   - To help with debugging, ensure the embedded graph executor and debugging functions are enabled with ``set(USE_GRAPH_EXECUTOR ON)`` and ``set(USE_PROFILER ON)``
+  - To debug with IRs, ``set(USE_RELAY_DEBUG ON)`` and set environment variable `TVM_LOG_DEBUG`.
+
+      .. code:: bash
+
+          export TVM_LOG_DEBUG=1
 
 - TVM requires LLVM for CPU codegen. We highly recommend you build with LLVM support on.
 
diff --git a/docs/_sources/tutorials/auto_scheduler/sg_execution_times.rst.txt b/docs/_sources/tutorials/auto_scheduler/sg_execution_times.rst.txt
index 4a5eefe..4347d0d 100644
--- a/docs/_sources/tutorials/auto_scheduler/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorials/auto_scheduler/sg_execution_times.rst.txt
@@ -5,11 +5,11 @@
 
 Computation times
 =================
-**02:49.036** total execution time for **tutorials_auto_scheduler** files:
-
-- **01:29.784**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``)
-- **00:37.456**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_network_x86.py` (``tune_network_x86.py``)
-- **00:21.163**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_network_cuda.py` (``tune_network_cuda.py``)
-- **00:15.855**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``)
-- **00:03.142**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_network_arm.py` (``tune_network_arm.py``)
-- **00:01.636**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_network_mali.py` (``tune_network_mali.py``)
+**03:43.360** total execution time for **tutorials_auto_scheduler** files:
+
+- **02:20.264**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``)
+- **00:36.433**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_network_x86.py` (``tune_network_x86.py``)
+- **00:21.478**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``)
+- **00:20.485**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_network_cuda.py` (``tune_network_cuda.py``)
+- **00:03.082**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_network_arm.py` (``tune_network_arm.py``)
+- **00:01.618**: :ref:`sphx_glr_tutorials_auto_scheduler_tune_network_mali.py` (``tune_network_mali.py``)
diff --git a/docs/_sources/tutorials/auto_scheduler/tune_conv2d_layer_cuda.rst.txt b/docs/_sources/tutorials/auto_scheduler/tune_conv2d_layer_cuda.rst.txt
index 72014aa..8b86024 100644
--- a/docs/_sources/tutorials/auto_scheduler/tune_conv2d_layer_cuda.rst.txt
+++ b/docs/_sources/tutorials/auto_scheduler/tune_conv2d_layer_cuda.rst.txt
@@ -191,6 +191,7 @@ file and apply it.
 
 
 
+
 We can lower the schedule to see the IR after auto-scheduling.
 The auto-scheduler correctly performs optimizations including multi-level tiling,
 cooperative fetching, unrolling and operator fusion.
@@ -215,16 +216,16 @@ cooperative fetching, unrolling and operator fusion.
     Lowered TIR:
     primfn(data_1: handle, kernel_1: handle, bias_1: handle, compute_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {bias: Buffer(bias_2: Pointer(float32), float32, [1, 512, 1, 1], []),
-                 compute: Buffer(compute_2: Pointer(float32), float32, [1, 512, 7, 7], []),
+      buffers = {compute: Buffer(compute_2: Pointer(float32), float32, [1, 512, 7, 7], []),
+                 bias: Buffer(bias_2: Pointer(float32), float32, [1, 512, 1, 1], []),
                  kernel: Buffer(kernel_2: Pointer(float32), float32, [512, 512, 3, 3], []),
                  data: Buffer(data_2: Pointer(float32), float32, [1, 512, 7, 7], [])}
       buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute} {
-      attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 28;
+      attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 56;
       attr [compute_3: Pointer(float32)] "storage_scope" = "local";
-      allocate(compute_3, float32, [14]);
+      allocate(compute_3, float32, [7]);
       attr [pad_temp.shared: Pointer(float32)] "storage_scope" = "shared";
-      allocate(pad_temp.shared, float32, [72]);
+      allocate(pad_temp.shared, float32, [144]);
       attr [kernel.shared: Pointer(float32)] "storage_scope" = "shared";
       allocate(kernel.shared, float32, [3072]);
       attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
@@ -235,468 +236,457 @@ cooperative fetching, unrolling and operator fusion.
         compute_3[4] = 0f32
         compute_3[5] = 0f32
         compute_3[6] = 0f32
-        compute_3[7] = 0f32
-        compute_3[8] = 0f32
-        compute_3[9] = 0f32
-        compute_3[10] = 0f32
-        compute_3[11] = 0f32
-        compute_3[12] = 0f32
-        compute_3[13] = 0f32
-        for (rc.outer.outer: int32, 0, 64) {
-          for (ry.outer.outer: int32, 0, 3) {
-            attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
-              if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
-                pad_temp.shared[(threadIdx.x_1*4)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1*4), 9))) && (floormod((threadIdx.x_1*4), 9) < 8)), (float32*)data_2[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1*4), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1*4), 9)) - 8)], 0f32, dtype=float32)
-              }
-              if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
-                pad_temp.shared[((threadIdx.x_1*4) + 1)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 1), 9))) && (floormod(((threadIdx.x_1*4) + 1), 9) < 8)), (float32*)data_2[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 1), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 1), 9)) - 8)], 0f32, dtype=float32)
-              }
-              if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
-                pad_temp.shared[((threadIdx.x_1*4) + 2)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 2), 9))) && (floormod(((threadIdx.x_1*4) + 2), 9) < 8)), (float32*)data_2[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 2), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 2), 9)) - 8)], 0f32, dtype=float32)
-              }
-              if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
-                pad_temp.shared[((threadIdx.x_1*4) + 3)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 3), 9))) && (floormod(((threadIdx.x_1*4) + 3), 9) < 8)), (float32*)data_2[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 3), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 3), 9)) - 8)], 0f32, dtype=float32)
-              }
+        for (rc.outer.outer: int32, 0, 32) {
+          for (rx.outer.outer: int32, 0, 3) {
+            attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+            pad_temp.shared[threadIdx.x_1] = @tir.if_then_else(((((1 <= floormod(threadIdx.x_1, 9)) && (floormod(threadIdx.x_1, 9) < 8)) && (1 <= (rx.outer.outer + floormod(blockIdx.x, 7)))) && ((rx.outer.outer + floormod(blockIdx.x, 7)) < 8)), (float32*)data_2[((((((rc.outer.outer*784) + (floordiv(threadIdx.x_1, 9)*49)) + (floormod(threadIdx.x_1, 9)*7)) + rx.outer.outer) + floormod(blockIdx.x, 7)) - 8)], 0f32, dtype=float32)
+            attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+            pad_temp.shared[(threadIdx.x_1 + 64)] = @tir.if_then_else(((((1 <= floormod((threadIdx.x_1 + 1), 9)) && (floormod((threadIdx.x_1 + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(blockIdx.x, 7)))) && ((rx.outer.outer + floormod(blockIdx.x, 7)) < 8)), (float32*)data_2[((((((rc.outer.outer*784) + (floordiv((threadIdx.x_1 + 64), 9)*49)) + (floormod((threadIdx.x_1 + 1), 9)*7)) + rx.outer.outer) + floormod(blockIdx.x, 7)) - 8)], 0f32, dtype=float32)
+            attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+            if @tir.likely((threadIdx.x_1 < 16), dtype=bool) {
+              pad_temp.shared[(threadIdx.x_1 + 128)] = @tir.if_then_else(((((1 <= floormod((threadIdx.x_1 + 2), 9)) && (floormod((threadIdx.x_1 + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(blockIdx.x, 7)))) && ((rx.outer.outer + floormod(blockIdx.x, 7)) < 8)), (float32*)data_2[((((((rc.outer.outer*784) + (floordiv((threadIdx.x_1 + 128), 9)*49)) + (floormod((threadIdx.x_1 + 2), 9)*7)) + rx.outer.outer) + floormod(blockIdx.x, 7)) - 8)], 0f32, dtype=float32)
             }
             attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[threadIdx.x_2] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + (ry.outer.outer*3)) + floormod(threadIdx.x_2, 3))]
+            kernel.shared[threadIdx.x_2] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 48)*4608)) + (rc.outer.outer*144)) + (floormod(threadIdx.x_2, 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 64)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 64), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 1), 3))]
+            kernel.shared[(threadIdx.x_2 + 64)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 64), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 16), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 128)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 128), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 2), 3))]
+            kernel.shared[(threadIdx.x_2 + 128)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 128), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 32), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 192)] = (float32*)kernel_2[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + (ry.outer.outer*3)) + floormod(threadIdx.x_2, 3)) + 36864)]
+            kernel.shared[(threadIdx.x_2 + 192)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 48)*4608)) + (rc.outer.outer*144)) + (floormod(threadIdx.x_2, 48)*3)) + rx.outer.outer) + 18432)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 256)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 256), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 1), 3))]
+            kernel.shared[(threadIdx.x_2 + 256)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 256), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 16), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 320)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 320), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 2), 3))]
+            kernel.shared[(threadIdx.x_2 + 320)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 320), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 32), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 384)] = (float32*)kernel_2[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + (ry.outer.outer*3)) + floormod(threadIdx.x_2, 3)) + 73728)]
+            kernel.shared[(threadIdx.x_2 + 384)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 48)*4608)) + (rc.outer.outer*144)) + (floormod(threadIdx.x_2, 48)*3)) + rx.outer.outer) + 36864)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 448)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 448), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 1), 3))]
+            kernel.shared[(threadIdx.x_2 + 448)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 448), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 16), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 512)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 512), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 2), 3))]
+            kernel.shared[(threadIdx.x_2 + 512)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 512), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 32), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 576)] = (float32*)kernel_2[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + (ry.outer.outer*3)) + floormod(threadIdx.x_2, 3)) + 110592)]
+            kernel.shared[(threadIdx.x_2 + 576)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 48)*4608)) + (rc.outer.outer*144)) + (floormod(threadIdx.x_2, 48)*3)) + rx.outer.outer) + 55296)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 640)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 640), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 1), 3))]
+            kernel.shared[(threadIdx.x_2 + 640)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 640), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 16), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 704)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 704), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 2), 3))]
+            kernel.shared[(threadIdx.x_2 + 704)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 704), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 32), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 768)] = (float32*)kernel_2[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + (ry.outer.outer*3)) + floormod(threadIdx.x_2, 3)) + 147456)]
+            kernel.shared[(threadIdx.x_2 + 768)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 48)*4608)) + (rc.outer.outer*144)) + (floormod(threadIdx.x_2, 48)*3)) + rx.outer.outer) + 73728)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 832)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 832), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 1), 3))]
+            kernel.shared[(threadIdx.x_2 + 832)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 832), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 16), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 896)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 896), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 2), 3))]
+            kernel.shared[(threadIdx.x_2 + 896)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 896), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 32), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 960)] = (float32*)kernel_2[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + (ry.outer.outer*3)) + floormod(threadIdx.x_2, 3)) + 184320)]
+            kernel.shared[(threadIdx.x_2 + 960)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 48)*4608)) + (rc.outer.outer*144)) + (floormod(threadIdx.x_2, 48)*3)) + rx.outer.outer) + 92160)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 1024)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1024), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 1), 3))]
+            kernel.shared[(threadIdx.x_2 + 1024)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 1024), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 16), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 1088)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1088), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 2), 3))]
+            kernel.shared[(threadIdx.x_2 + 1088)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 1088), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 32), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 1152)] = (float32*)kernel_2[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + (ry.outer.outer*3)) + floormod(threadIdx.x_2, 3)) + 221184)]
+            kernel.shared[(threadIdx.x_2 + 1152)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 48)*4608)) + (rc.outer.outer*144)) + (floormod(threadIdx.x_2, 48)*3)) + rx.outer.outer) + 110592)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 1216)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1216), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 1), 3))]
+            kernel.shared[(threadIdx.x_2 + 1216)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 1216), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 16), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 1280)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1280), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 2), 3))]
+            kernel.shared[(threadIdx.x_2 + 1280)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 1280), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 32), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 1344)] = (float32*)kernel_2[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + (ry.outer.outer*3)) + floormod(threadIdx.x_2, 3)) + 258048)]
+            kernel.shared[(threadIdx.x_2 + 1344)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 48)*4608)) + (rc.outer.outer*144)) + (floormod(threadIdx.x_2, 48)*3)) + rx.outer.outer) + 129024)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 1408)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1408), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 1), 3))]
+            kernel.shared[(threadIdx.x_2 + 1408)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 1408), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 16), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 1472)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1472), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 2), 3))]
+            kernel.shared[(threadIdx.x_2 + 1472)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 1472), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 32), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 1536)] = (float32*)kernel_2[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + (ry.outer.outer*3)) + floormod(threadIdx.x_2, 3)) + 294912)]
+            kernel.shared[(threadIdx.x_2 + 1536)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 48)*4608)) + (rc.outer.outer*144)) + (floormod(threadIdx.x_2, 48)*3)) + rx.outer.outer) + 147456)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 1600)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1600), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 1), 3))]
+            kernel.shared[(threadIdx.x_2 + 1600)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 1600), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 16), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 1664)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1664), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 2), 3))]
+            kernel.shared[(threadIdx.x_2 + 1664)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 1664), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 32), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 1728)] = (float32*)kernel_2[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + (ry.outer.outer*3)) + floormod(threadIdx.x_2, 3)) + 331776)]
+            kernel.shared[(threadIdx.x_2 + 1728)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 48)*4608)) + (rc.outer.outer*144)) + (floormod(threadIdx.x_2, 48)*3)) + rx.outer.outer) + 165888)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 1792)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1792), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 1), 3))]
+            kernel.shared[(threadIdx.x_2 + 1792)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 1792), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 16), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 1856)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1856), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 2), 3))]
+            kernel.shared[(threadIdx.x_2 + 1856)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 1856), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 32), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 1920)] = (float32*)kernel_2[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + (ry.outer.outer*3)) + floormod(threadIdx.x_2, 3)) + 368640)]
+            kernel.shared[(threadIdx.x_2 + 1920)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 48)*4608)) + (rc.outer.outer*144)) + (floormod(threadIdx.x_2, 48)*3)) + rx.outer.outer) + 184320)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 1984)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1984), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 1), 3))]
+            kernel.shared[(threadIdx.x_2 + 1984)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 1984), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 16), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 2048)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2048), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 2), 3))]
+            kernel.shared[(threadIdx.x_2 + 2048)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 2048), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 32), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 2112)] = (float32*)kernel_2[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + (ry.outer.outer*3)) + floormod(threadIdx.x_2, 3)) + 405504)]
+            kernel.shared[(threadIdx.x_2 + 2112)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 48)*4608)) + (rc.outer.outer*144)) + (floormod(threadIdx.x_2, 48)*3)) + rx.outer.outer) + 202752)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 2176)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2176), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 1), 3))]
+            kernel.shared[(threadIdx.x_2 + 2176)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 2176), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 16), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 2240)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2240), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 2), 3))]
+            kernel.shared[(threadIdx.x_2 + 2240)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 2240), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 32), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 2304)] = (float32*)kernel_2[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + (ry.outer.outer*3)) + floormod(threadIdx.x_2, 3)) + 442368)]
+            kernel.shared[(threadIdx.x_2 + 2304)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 48)*4608)) + (rc.outer.outer*144)) + (floormod(threadIdx.x_2, 48)*3)) + rx.outer.outer) + 221184)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 2368)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2368), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 1), 3))]
+            kernel.shared[(threadIdx.x_2 + 2368)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 2368), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 16), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 2432)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2432), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 2), 3))]
+            kernel.shared[(threadIdx.x_2 + 2432)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 2432), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 32), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 2496)] = (float32*)kernel_2[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + (ry.outer.outer*3)) + floormod(threadIdx.x_2, 3)) + 479232)]
+            kernel.shared[(threadIdx.x_2 + 2496)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 48)*4608)) + (rc.outer.outer*144)) + (floormod(threadIdx.x_2, 48)*3)) + rx.outer.outer) + 239616)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 2560)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2560), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 1), 3))]
+            kernel.shared[(threadIdx.x_2 + 2560)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 2560), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 16), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 2624)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2624), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 2), 3))]
+            kernel.shared[(threadIdx.x_2 + 2624)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 2624), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 32), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 2688)] = (float32*)kernel_2[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + (ry.outer.outer*3)) + floormod(threadIdx.x_2, 3)) + 516096)]
+            kernel.shared[(threadIdx.x_2 + 2688)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 48)*4608)) + (rc.outer.outer*144)) + (floormod(threadIdx.x_2, 48)*3)) + rx.outer.outer) + 258048)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 2752)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2752), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 1), 3))]
+            kernel.shared[(threadIdx.x_2 + 2752)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 2752), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 16), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 2816)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2816), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 2), 3))]
+            kernel.shared[(threadIdx.x_2 + 2816)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 2816), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 32), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 2880)] = (float32*)kernel_2[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + (ry.outer.outer*3)) + floormod(threadIdx.x_2, 3)) + 552960)]
+            kernel.shared[(threadIdx.x_2 + 2880)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*294912) + (floordiv(threadIdx.x_2, 48)*4608)) + (rc.outer.outer*144)) + (floormod(threadIdx.x_2, 48)*3)) + rx.outer.outer) + 276480)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 2944)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2944), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 1), 3))]
+            kernel.shared[(threadIdx.x_2 + 2944)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 2944), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 16), 48)*3)) + rx.outer.outer)]
             attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-            kernel.shared[(threadIdx.x_2 + 3008)] = (float32*)kernel_2[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 3008), 24)*4608)) + (rc.outer.outer*72)) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + (ry.outer.outer*3)) + floormod((threadIdx.x_2 + 2), 3))]
+            kernel.shared[(threadIdx.x_2 + 3008)] = (float32*)kernel_2[(((((floordiv(blockIdx.x, 7)*294912) + (floordiv((threadIdx.x_2 + 3008), 48)*4608)) + (rc.outer.outer*144)) + (floormod((threadIdx.x_2 + 32), 48)*3)) + rx.outer.outer)]
             compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[0]*(float32*)kernel.shared[(threadIdx.x*48)]))
-            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[9]*(float32*)kernel.shared[((threadIdx.x*48) + 3)]))
             compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[1]*(float32*)kernel.shared[(threadIdx.x*48)]))
-            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[10]*(float32*)kernel.shared[((threadIdx.x*48) + 3)]))
             compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[2]*(float32*)kernel.shared[(threadIdx.x*48)]))
-            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[11]*(float32*)kernel.shared[((threadIdx.x*48) + 3)]))
             compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[3]*(float32*)kernel.shared[(threadIdx.x*48)]))
-            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[12]*(float32*)kernel.shared[((threadIdx.x*48) + 3)]))
             compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[4]*(float32*)kernel.shared[(threadIdx.x*48)]))
-            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[13]*(float32*)kernel.shared[((threadIdx.x*48) + 3)]))
             compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[5]*(float32*)kernel.shared[(threadIdx.x*48)]))
-            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[14]*(float32*)kernel.shared[((threadIdx.x*48) + 3)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[6]*(float32*)kernel.shared[(threadIdx.x*48)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[9]*(float32*)kernel.shared[((threadIdx.x*48) + 3)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[10]*(float32*)kernel.shared[((threadIdx.x*48) + 3)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[11]*(float32*)kernel.shared[((threadIdx.x*48) + 3)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[12]*(float32*)kernel.shared[((threadIdx.x*48) + 3)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[13]*(float32*)kernel.shared[((threadIdx.x*48) + 3)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[14]*(float32*)kernel.shared[((threadIdx.x*48) + 3)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[15]*(float32*)kernel.shared[((threadIdx.x*48) + 3)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[0]*(float32*)kernel.shared[((threadIdx.x*48) + 24)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[9]*(float32*)kernel.shared[((threadIdx.x*48) + 27)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[1]*(float32*)kernel.shared[((threadIdx.x*48) + 24)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[10]*(float32*)kernel.shared[((threadIdx.x*48) + 27)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[2]*(float32*)kernel.shared[((threadIdx.x*48) + 24)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[11]*(float32*)kernel.shared[((threadIdx.x*48) + 27)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[3]*(float32*)kernel.shared[((threadIdx.x*48) + 24)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[12]*(float32*)kernel.shared[((threadIdx.x*48) + 27)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[4]*(float32*)kernel.shared[((threadIdx.x*48) + 24)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[13]*(float32*)kernel.shared[((threadIdx.x*48) + 27)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[5]*(float32*)kernel.shared[((threadIdx.x*48) + 24)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[14]*(float32*)kernel.shared[((threadIdx.x*48) + 27)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[6]*(float32*)kernel.shared[((threadIdx.x*48) + 24)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[15]*(float32*)kernel.shared[((threadIdx.x*48) + 27)]))
             compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[1]*(float32*)kernel.shared[((threadIdx.x*48) + 1)]))
-            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[10]*(float32*)kernel.shared[((threadIdx.x*48) + 4)]))
             compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[2]*(float32*)kernel.shared[((threadIdx.x*48) + 1)]))
-            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[11]*(float32*)kernel.shared[((threadIdx.x*48) + 4)]))
             compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[3]*(float32*)kernel.shared[((threadIdx.x*48) + 1)]))
-            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[12]*(float32*)kernel.shared[((threadIdx.x*48) + 4)]))
             compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[4]*(float32*)kernel.shared[((threadIdx.x*48) + 1)]))
-            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[13]*(float32*)kernel.shared[((threadIdx.x*48) + 4)]))
             compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[5]*(float32*)kernel.shared[((threadIdx.x*48) + 1)]))
-            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[14]*(float32*)kernel.shared[((threadIdx.x*48) + 4)]))
             compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[6]*(float32*)kernel.shared[((threadIdx.x*48) + 1)]))
-            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[15]*(float32*)kernel.shared[((threadIdx.x*48) + 4)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[7]*(float32*)kernel.shared[((threadIdx.x*48) + 1)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[10]*(float32*)kernel.shared[((threadIdx.x*48) + 4)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[11]*(float32*)kernel.shared[((threadIdx.x*48) + 4)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[12]*(float32*)kernel.shared[((threadIdx.x*48) + 4)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[13]*(float32*)kernel.shared[((threadIdx.x*48) + 4)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[14]*(float32*)kernel.shared[((threadIdx.x*48) + 4)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[15]*(float32*)kernel.shared[((threadIdx.x*48) + 4)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[16]*(float32*)kernel.shared[((threadIdx.x*48) + 4)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[1]*(float32*)kernel.shared[((threadIdx.x*48) + 25)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[10]*(float32*)kernel.shared[((threadIdx.x*48) + 28)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[2]*(float32*)kernel.shared[((threadIdx.x*48) + 25)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[11]*(float32*)kernel.shared[((threadIdx.x*48) + 28)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[3]*(float32*)kernel.shared[((threadIdx.x*48) + 25)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[12]*(float32*)kernel.shared[((threadIdx.x*48) + 28)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[4]*(float32*)kernel.shared[((threadIdx.x*48) + 25)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[13]*(float32*)kernel.shared[((threadIdx.x*48) + 28)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[5]*(float32*)kernel.shared[((threadIdx.x*48) + 25)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[14]*(float32*)kernel.shared[((threadIdx.x*48) + 28)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[6]*(float32*)kernel.shared[((threadIdx.x*48) + 25)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[15]*(float32*)kernel.shared[((threadIdx.x*48) + 28)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[7]*(float32*)kernel.shared[((threadIdx.x*48) + 25)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[16]*(float32*)kernel.shared[((threadIdx.x*48) + 28)]))
             compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[2]*(float32*)kernel.shared[((threadIdx.x*48) + 2)]))
-            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[11]*(float32*)kernel.shared[((threadIdx.x*48) + 5)]))
             compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[3]*(float32*)kernel.shared[((threadIdx.x*48) + 2)]))
-            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[12]*(float32*)kernel.shared[((threadIdx.x*48) + 5)]))
             compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[4]*(float32*)kernel.shared[((threadIdx.x*48) + 2)]))
-            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[13]*(float32*)kernel.shared[((threadIdx.x*48) + 5)]))
             compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[5]*(float32*)kernel.shared[((threadIdx.x*48) + 2)]))
-            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[14]*(float32*)kernel.shared[((threadIdx.x*48) + 5)]))
             compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[6]*(float32*)kernel.shared[((threadIdx.x*48) + 2)]))
-            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[15]*(float32*)kernel.shared[((threadIdx.x*48) + 5)]))
             compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[7]*(float32*)kernel.shared[((threadIdx.x*48) + 2)]))
-            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[16]*(float32*)kernel.shared[((threadIdx.x*48) + 5)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[8]*(float32*)kernel.shared[((threadIdx.x*48) + 2)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[11]*(float32*)kernel.shared[((threadIdx.x*48) + 5)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[12]*(float32*)kernel.shared[((threadIdx.x*48) + 5)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[13]*(float32*)kernel.shared[((threadIdx.x*48) + 5)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[14]*(float32*)kernel.shared[((threadIdx.x*48) + 5)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[15]*(float32*)kernel.shared[((threadIdx.x*48) + 5)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[16]*(float32*)kernel.shared[((threadIdx.x*48) + 5)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[17]*(float32*)kernel.shared[((threadIdx.x*48) + 5)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[2]*(float32*)kernel.shared[((threadIdx.x*48) + 26)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[11]*(float32*)kernel.shared[((threadIdx.x*48) + 29)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[3]*(float32*)kernel.shared[((threadIdx.x*48) + 26)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[12]*(float32*)kernel.shared[((threadIdx.x*48) + 29)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[4]*(float32*)kernel.shared[((threadIdx.x*48) + 26)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[13]*(float32*)kernel.shared[((threadIdx.x*48) + 29)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[5]*(float32*)kernel.shared[((threadIdx.x*48) + 26)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[14]*(float32*)kernel.shared[((threadIdx.x*48) + 29)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[6]*(float32*)kernel.shared[((threadIdx.x*48) + 26)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[15]*(float32*)kernel.shared[((threadIdx.x*48) + 29)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[7]*(float32*)kernel.shared[((threadIdx.x*48) + 26)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[16]*(float32*)kernel.shared[((threadIdx.x*48) + 29)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[8]*(float32*)kernel.shared[((threadIdx.x*48) + 26)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[17]*(float32*)kernel.shared[((threadIdx.x*48) + 29)]))
             compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[18]*(float32*)kernel.shared[((threadIdx.x*48) + 6)]))
-            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[27]*(float32*)kernel.shared[((threadIdx.x*48) + 9)]))
             compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[19]*(float32*)kernel.shared[((threadIdx.x*48) + 6)]))
-            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[28]*(float32*)kernel.shared[((threadIdx.x*48) + 9)]))
             compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[20]*(float32*)kernel.shared[((threadIdx.x*48) + 6)]))
-            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[29]*(float32*)kernel.shared[((threadIdx.x*48) + 9)]))
             compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[21]*(float32*)kernel.shared[((threadIdx.x*48) + 6)]))
-            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[30]*(float32*)kernel.shared[((threadIdx.x*48) + 9)]))
             compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[22]*(float32*)kernel.shared[((threadIdx.x*48) + 6)]))
-            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[31]*(float32*)kernel.shared[((threadIdx.x*48) + 9)]))
             compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[23]*(float32*)kernel.shared[((threadIdx.x*48) + 6)]))
-            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[32]*(float32*)kernel.shared[((threadIdx.x*48) + 9)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[24]*(float32*)kernel.shared[((threadIdx.x*48) + 6)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[27]*(float32*)kernel.shared[((threadIdx.x*48) + 9)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[28]*(float32*)kernel.shared[((threadIdx.x*48) + 9)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[29]*(float32*)kernel.shared[((threadIdx.x*48) + 9)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[30]*(float32*)kernel.shared[((threadIdx.x*48) + 9)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[31]*(float32*)kernel.shared[((threadIdx.x*48) + 9)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[32]*(float32*)kernel.shared[((threadIdx.x*48) + 9)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[33]*(float32*)kernel.shared[((threadIdx.x*48) + 9)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[18]*(float32*)kernel.shared[((threadIdx.x*48) + 30)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[27]*(float32*)kernel.shared[((threadIdx.x*48) + 33)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[19]*(float32*)kernel.shared[((threadIdx.x*48) + 30)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[28]*(float32*)kernel.shared[((threadIdx.x*48) + 33)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[20]*(float32*)kernel.shared[((threadIdx.x*48) + 30)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[29]*(float32*)kernel.shared[((threadIdx.x*48) + 33)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[21]*(float32*)kernel.shared[((threadIdx.x*48) + 30)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[30]*(float32*)kernel.shared[((threadIdx.x*48) + 33)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[22]*(float32*)kernel.shared[((threadIdx.x*48) + 30)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[31]*(float32*)kernel.shared[((threadIdx.x*48) + 33)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[23]*(float32*)kernel.shared[((threadIdx.x*48) + 30)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[32]*(float32*)kernel.shared[((threadIdx.x*48) + 33)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[24]*(float32*)kernel.shared[((threadIdx.x*48) + 30)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[33]*(float32*)kernel.shared[((threadIdx.x*48) + 33)]))
             compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[19]*(float32*)kernel.shared[((threadIdx.x*48) + 7)]))
-            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[28]*(float32*)kernel.shared[((threadIdx.x*48) + 10)]))
             compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[20]*(float32*)kernel.shared[((threadIdx.x*48) + 7)]))
-            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[29]*(float32*)kernel.shared[((threadIdx.x*48) + 10)]))
             compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[21]*(float32*)kernel.shared[((threadIdx.x*48) + 7)]))
-            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[30]*(float32*)kernel.shared[((threadIdx.x*48) + 10)]))
             compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[22]*(float32*)kernel.shared[((threadIdx.x*48) + 7)]))
-            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[31]*(float32*)kernel.shared[((threadIdx.x*48) + 10)]))
             compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[23]*(float32*)kernel.shared[((threadIdx.x*48) + 7)]))
-            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[32]*(float32*)kernel.shared[((threadIdx.x*48) + 10)]))
             compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[24]*(float32*)kernel.shared[((threadIdx.x*48) + 7)]))
-            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[33]*(float32*)kernel.shared[((threadIdx.x*48) + 10)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[25]*(float32*)kernel.shared[((threadIdx.x*48) + 7)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[28]*(float32*)kernel.shared[((threadIdx.x*48) + 10)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[29]*(float32*)kernel.shared[((threadIdx.x*48) + 10)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[30]*(float32*)kernel.shared[((threadIdx.x*48) + 10)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[31]*(float32*)kernel.shared[((threadIdx.x*48) + 10)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[32]*(float32*)kernel.shared[((threadIdx.x*48) + 10)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[33]*(float32*)kernel.shared[((threadIdx.x*48) + 10)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[34]*(float32*)kernel.shared[((threadIdx.x*48) + 10)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[19]*(float32*)kernel.shared[((threadIdx.x*48) + 31)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[28]*(float32*)kernel.shared[((threadIdx.x*48) + 34)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[20]*(float32*)kernel.shared[((threadIdx.x*48) + 31)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[29]*(float32*)kernel.shared[((threadIdx.x*48) + 34)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[21]*(float32*)kernel.shared[((threadIdx.x*48) + 31)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[30]*(float32*)kernel.shared[((threadIdx.x*48) + 34)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[22]*(float32*)kernel.shared[((threadIdx.x*48) + 31)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[31]*(float32*)kernel.shared[((threadIdx.x*48) + 34)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[23]*(float32*)kernel.shared[((threadIdx.x*48) + 31)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[32]*(float32*)kernel.shared[((threadIdx.x*48) + 34)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[24]*(float32*)kernel.shared[((threadIdx.x*48) + 31)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[33]*(float32*)kernel.shared[((threadIdx.x*48) + 34)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[25]*(float32*)kernel.shared[((threadIdx.x*48) + 31)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[34]*(float32*)kernel.shared[((threadIdx.x*48) + 34)]))
             compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[20]*(float32*)kernel.shared[((threadIdx.x*48) + 8)]))
-            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[29]*(float32*)kernel.shared[((threadIdx.x*48) + 11)]))
             compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[21]*(float32*)kernel.shared[((threadIdx.x*48) + 8)]))
-            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[30]*(float32*)kernel.shared[((threadIdx.x*48) + 11)]))
             compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[22]*(float32*)kernel.shared[((threadIdx.x*48) + 8)]))
-            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[31]*(float32*)kernel.shared[((threadIdx.x*48) + 11)]))
             compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[23]*(float32*)kernel.shared[((threadIdx.x*48) + 8)]))
-            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[32]*(float32*)kernel.shared[((threadIdx.x*48) + 11)]))
             compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[24]*(float32*)kernel.shared[((threadIdx.x*48) + 8)]))
-            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[33]*(float32*)kernel.shared[((threadIdx.x*48) + 11)]))
             compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[25]*(float32*)kernel.shared[((threadIdx.x*48) + 8)]))
-            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[34]*(float32*)kernel.shared[((threadIdx.x*48) + 11)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[26]*(float32*)kernel.shared[((threadIdx.x*48) + 8)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[29]*(float32*)kernel.shared[((threadIdx.x*48) + 11)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[30]*(float32*)kernel.shared[((threadIdx.x*48) + 11)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[31]*(float32*)kernel.shared[((threadIdx.x*48) + 11)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[32]*(float32*)kernel.shared[((threadIdx.x*48) + 11)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[33]*(float32*)kernel.shared[((threadIdx.x*48) + 11)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[34]*(float32*)kernel.shared[((threadIdx.x*48) + 11)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[35]*(float32*)kernel.shared[((threadIdx.x*48) + 11)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[20]*(float32*)kernel.shared[((threadIdx.x*48) + 32)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[29]*(float32*)kernel.shared[((threadIdx.x*48) + 35)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[21]*(float32*)kernel.shared[((threadIdx.x*48) + 32)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[30]*(float32*)kernel.shared[((threadIdx.x*48) + 35)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[22]*(float32*)kernel.shared[((threadIdx.x*48) + 32)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[31]*(float32*)kernel.shared[((threadIdx.x*48) + 35)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[23]*(float32*)kernel.shared[((threadIdx.x*48) + 32)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[32]*(float32*)kernel.shared[((threadIdx.x*48) + 35)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[24]*(float32*)kernel.shared[((threadIdx.x*48) + 32)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[33]*(float32*)kernel.shared[((threadIdx.x*48) + 35)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[25]*(float32*)kernel.shared[((threadIdx.x*48) + 32)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[34]*(float32*)kernel.shared[((threadIdx.x*48) + 35)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[26]*(float32*)kernel.shared[((threadIdx.x*48) + 32)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[35]*(float32*)kernel.shared[((threadIdx.x*48) + 35)]))
             compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[36]*(float32*)kernel.shared[((threadIdx.x*48) + 12)]))
-            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[45]*(float32*)kernel.shared[((threadIdx.x*48) + 15)]))
             compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[37]*(float32*)kernel.shared[((threadIdx.x*48) + 12)]))
-            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[46]*(float32*)kernel.shared[((threadIdx.x*48) + 15)]))
             compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[38]*(float32*)kernel.shared[((threadIdx.x*48) + 12)]))
-            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[47]*(float32*)kernel.shared[((threadIdx.x*48) + 15)]))
             compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[39]*(float32*)kernel.shared[((threadIdx.x*48) + 12)]))
-            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[48]*(float32*)kernel.shared[((threadIdx.x*48) + 15)]))
             compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[40]*(float32*)kernel.shared[((threadIdx.x*48) + 12)]))
-            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[49]*(float32*)kernel.shared[((threadIdx.x*48) + 15)]))
             compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[41]*(float32*)kernel.shared[((threadIdx.x*48) + 12)]))
-            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[50]*(float32*)kernel.shared[((threadIdx.x*48) + 15)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[42]*(float32*)kernel.shared[((threadIdx.x*48) + 12)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[45]*(float32*)kernel.shared[((threadIdx.x*48) + 15)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[46]*(float32*)kernel.shared[((threadIdx.x*48) + 15)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[47]*(float32*)kernel.shared[((threadIdx.x*48) + 15)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[48]*(float32*)kernel.shared[((threadIdx.x*48) + 15)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[49]*(float32*)kernel.shared[((threadIdx.x*48) + 15)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[50]*(float32*)kernel.shared[((threadIdx.x*48) + 15)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[51]*(float32*)kernel.shared[((threadIdx.x*48) + 15)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[36]*(float32*)kernel.shared[((threadIdx.x*48) + 36)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[45]*(float32*)kernel.shared[((threadIdx.x*48) + 39)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[37]*(float32*)kernel.shared[((threadIdx.x*48) + 36)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[46]*(float32*)kernel.shared[((threadIdx.x*48) + 39)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[38]*(float32*)kernel.shared[((threadIdx.x*48) + 36)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[47]*(float32*)kernel.shared[((threadIdx.x*48) + 39)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[39]*(float32*)kernel.shared[((threadIdx.x*48) + 36)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[48]*(float32*)kernel.shared[((threadIdx.x*48) + 39)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[40]*(float32*)kernel.shared[((threadIdx.x*48) + 36)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[49]*(float32*)kernel.shared[((threadIdx.x*48) + 39)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[41]*(float32*)kernel.shared[((threadIdx.x*48) + 36)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[50]*(float32*)kernel.shared[((threadIdx.x*48) + 39)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[42]*(float32*)kernel.shared[((threadIdx.x*48) + 36)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[51]*(float32*)kernel.shared[((threadIdx.x*48) + 39)]))
             compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[37]*(float32*)kernel.shared[((threadIdx.x*48) + 13)]))
-            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[46]*(float32*)kernel.shared[((threadIdx.x*48) + 16)]))
             compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[38]*(float32*)kernel.shared[((threadIdx.x*48) + 13)]))
-            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[47]*(float32*)kernel.shared[((threadIdx.x*48) + 16)]))
             compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[39]*(float32*)kernel.shared[((threadIdx.x*48) + 13)]))
-            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[48]*(float32*)kernel.shared[((threadIdx.x*48) + 16)]))
             compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[40]*(float32*)kernel.shared[((threadIdx.x*48) + 13)]))
-            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[49]*(float32*)kernel.shared[((threadIdx.x*48) + 16)]))
             compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[41]*(float32*)kernel.shared[((threadIdx.x*48) + 13)]))
-            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[50]*(float32*)kernel.shared[((threadIdx.x*48) + 16)]))
             compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[42]*(float32*)kernel.shared[((threadIdx.x*48) + 13)]))
-            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[51]*(float32*)kernel.shared[((threadIdx.x*48) + 16)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[43]*(float32*)kernel.shared[((threadIdx.x*48) + 13)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[46]*(float32*)kernel.shared[((threadIdx.x*48) + 16)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[47]*(float32*)kernel.shared[((threadIdx.x*48) + 16)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[48]*(float32*)kernel.shared[((threadIdx.x*48) + 16)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[49]*(float32*)kernel.shared[((threadIdx.x*48) + 16)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[50]*(float32*)kernel.shared[((threadIdx.x*48) + 16)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[51]*(float32*)kernel.shared[((threadIdx.x*48) + 16)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[52]*(float32*)kernel.shared[((threadIdx.x*48) + 16)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[37]*(float32*)kernel.shared[((threadIdx.x*48) + 37)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[46]*(float32*)kernel.shared[((threadIdx.x*48) + 40)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[38]*(float32*)kernel.shared[((threadIdx.x*48) + 37)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[47]*(float32*)kernel.shared[((threadIdx.x*48) + 40)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[39]*(float32*)kernel.shared[((threadIdx.x*48) + 37)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[48]*(float32*)kernel.shared[((threadIdx.x*48) + 40)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[40]*(float32*)kernel.shared[((threadIdx.x*48) + 37)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[49]*(float32*)kernel.shared[((threadIdx.x*48) + 40)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[41]*(float32*)kernel.shared[((threadIdx.x*48) + 37)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[50]*(float32*)kernel.shared[((threadIdx.x*48) + 40)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[42]*(float32*)kernel.shared[((threadIdx.x*48) + 37)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[51]*(float32*)kernel.shared[((threadIdx.x*48) + 40)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[43]*(float32*)kernel.shared[((threadIdx.x*48) + 37)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[52]*(float32*)kernel.shared[((threadIdx.x*48) + 40)]))
             compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[38]*(float32*)kernel.shared[((threadIdx.x*48) + 14)]))
-            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[47]*(float32*)kernel.shared[((threadIdx.x*48) + 17)]))
             compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[39]*(float32*)kernel.shared[((threadIdx.x*48) + 14)]))
-            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[48]*(float32*)kernel.shared[((threadIdx.x*48) + 17)]))
             compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[40]*(float32*)kernel.shared[((threadIdx.x*48) + 14)]))
-            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[49]*(float32*)kernel.shared[((threadIdx.x*48) + 17)]))
             compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[41]*(float32*)kernel.shared[((threadIdx.x*48) + 14)]))
-            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[50]*(float32*)kernel.shared[((threadIdx.x*48) + 17)]))
             compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[42]*(float32*)kernel.shared[((threadIdx.x*48) + 14)]))
-            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[51]*(float32*)kernel.shared[((threadIdx.x*48) + 17)]))
             compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[43]*(float32*)kernel.shared[((threadIdx.x*48) + 14)]))
-            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[52]*(float32*)kernel.shared[((threadIdx.x*48) + 17)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[44]*(float32*)kernel.shared[((threadIdx.x*48) + 14)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[47]*(float32*)kernel.shared[((threadIdx.x*48) + 17)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[48]*(float32*)kernel.shared[((threadIdx.x*48) + 17)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[49]*(float32*)kernel.shared[((threadIdx.x*48) + 17)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[50]*(float32*)kernel.shared[((threadIdx.x*48) + 17)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[51]*(float32*)kernel.shared[((threadIdx.x*48) + 17)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[52]*(float32*)kernel.shared[((threadIdx.x*48) + 17)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[53]*(float32*)kernel.shared[((threadIdx.x*48) + 17)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[38]*(float32*)kernel.shared[((threadIdx.x*48) + 38)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[47]*(float32*)kernel.shared[((threadIdx.x*48) + 41)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[39]*(float32*)kernel.shared[((threadIdx.x*48) + 38)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[48]*(float32*)kernel.shared[((threadIdx.x*48) + 41)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[40]*(float32*)kernel.shared[((threadIdx.x*48) + 38)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[49]*(float32*)kernel.shared[((threadIdx.x*48) + 41)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[41]*(float32*)kernel.shared[((threadIdx.x*48) + 38)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[50]*(float32*)kernel.shared[((threadIdx.x*48) + 41)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[42]*(float32*)kernel.shared[((threadIdx.x*48) + 38)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[51]*(float32*)kernel.shared[((threadIdx.x*48) + 41)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[43]*(float32*)kernel.shared[((threadIdx.x*48) + 38)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[52]*(float32*)kernel.shared[((threadIdx.x*48) + 41)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[44]*(float32*)kernel.shared[((threadIdx.x*48) + 38)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[53]*(float32*)kernel.shared[((threadIdx.x*48) + 41)]))
             compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[54]*(float32*)kernel.shared[((threadIdx.x*48) + 18)]))
-            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[63]*(float32*)kernel.shared[((threadIdx.x*48) + 21)]))
             compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[55]*(float32*)kernel.shared[((threadIdx.x*48) + 18)]))
-            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[64]*(float32*)kernel.shared[((threadIdx.x*48) + 21)]))
             compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[56]*(float32*)kernel.shared[((threadIdx.x*48) + 18)]))
-            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[65]*(float32*)kernel.shared[((threadIdx.x*48) + 21)]))
             compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[57]*(float32*)kernel.shared[((threadIdx.x*48) + 18)]))
-            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[66]*(float32*)kernel.shared[((threadIdx.x*48) + 21)]))
             compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[58]*(float32*)kernel.shared[((threadIdx.x*48) + 18)]))
-            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[67]*(float32*)kernel.shared[((threadIdx.x*48) + 21)]))
             compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[59]*(float32*)kernel.shared[((threadIdx.x*48) + 18)]))
-            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[68]*(float32*)kernel.shared[((threadIdx.x*48) + 21)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[60]*(float32*)kernel.shared[((threadIdx.x*48) + 18)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[63]*(float32*)kernel.shared[((threadIdx.x*48) + 21)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[64]*(float32*)kernel.shared[((threadIdx.x*48) + 21)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[65]*(float32*)kernel.shared[((threadIdx.x*48) + 21)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[66]*(float32*)kernel.shared[((threadIdx.x*48) + 21)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[67]*(float32*)kernel.shared[((threadIdx.x*48) + 21)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[68]*(float32*)kernel.shared[((threadIdx.x*48) + 21)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[69]*(float32*)kernel.shared[((threadIdx.x*48) + 21)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[54]*(float32*)kernel.shared[((threadIdx.x*48) + 42)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[63]*(float32*)kernel.shared[((threadIdx.x*48) + 45)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[55]*(float32*)kernel.shared[((threadIdx.x*48) + 42)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[64]*(float32*)kernel.shared[((threadIdx.x*48) + 45)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[56]*(float32*)kernel.shared[((threadIdx.x*48) + 42)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[65]*(float32*)kernel.shared[((threadIdx.x*48) + 45)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[57]*(float32*)kernel.shared[((threadIdx.x*48) + 42)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[66]*(float32*)kernel.shared[((threadIdx.x*48) + 45)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[58]*(float32*)kernel.shared[((threadIdx.x*48) + 42)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[67]*(float32*)kernel.shared[((threadIdx.x*48) + 45)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[59]*(float32*)kernel.shared[((threadIdx.x*48) + 42)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[68]*(float32*)kernel.shared[((threadIdx.x*48) + 45)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[60]*(float32*)kernel.shared[((threadIdx.x*48) + 42)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[69]*(float32*)kernel.shared[((threadIdx.x*48) + 45)]))
             compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[55]*(float32*)kernel.shared[((threadIdx.x*48) + 19)]))
-            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[64]*(float32*)kernel.shared[((threadIdx.x*48) + 22)]))
             compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[56]*(float32*)kernel.shared[((threadIdx.x*48) + 19)]))
-            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[65]*(float32*)kernel.shared[((threadIdx.x*48) + 22)]))
             compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[57]*(float32*)kernel.shared[((threadIdx.x*48) + 19)]))
-            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[66]*(float32*)kernel.shared[((threadIdx.x*48) + 22)]))
             compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[58]*(float32*)kernel.shared[((threadIdx.x*48) + 19)]))
-            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[67]*(float32*)kernel.shared[((threadIdx.x*48) + 22)]))
             compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[59]*(float32*)kernel.shared[((threadIdx.x*48) + 19)]))
-            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[68]*(float32*)kernel.shared[((threadIdx.x*48) + 22)]))
             compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[60]*(float32*)kernel.shared[((threadIdx.x*48) + 19)]))
-            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[69]*(float32*)kernel.shared[((threadIdx.x*48) + 22)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[61]*(float32*)kernel.shared[((threadIdx.x*48) + 19)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[64]*(float32*)kernel.shared[((threadIdx.x*48) + 22)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[65]*(float32*)kernel.shared[((threadIdx.x*48) + 22)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[66]*(float32*)kernel.shared[((threadIdx.x*48) + 22)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[67]*(float32*)kernel.shared[((threadIdx.x*48) + 22)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[68]*(float32*)kernel.shared[((threadIdx.x*48) + 22)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[69]*(float32*)kernel.shared[((threadIdx.x*48) + 22)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[70]*(float32*)kernel.shared[((threadIdx.x*48) + 22)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[55]*(float32*)kernel.shared[((threadIdx.x*48) + 43)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[64]*(float32*)kernel.shared[((threadIdx.x*48) + 46)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[56]*(float32*)kernel.shared[((threadIdx.x*48) + 43)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[65]*(float32*)kernel.shared[((threadIdx.x*48) + 46)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[57]*(float32*)kernel.shared[((threadIdx.x*48) + 43)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[66]*(float32*)kernel.shared[((threadIdx.x*48) + 46)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[58]*(float32*)kernel.shared[((threadIdx.x*48) + 43)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[67]*(float32*)kernel.shared[((threadIdx.x*48) + 46)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[59]*(float32*)kernel.shared[((threadIdx.x*48) + 43)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[68]*(float32*)kernel.shared[((threadIdx.x*48) + 46)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[60]*(float32*)kernel.shared[((threadIdx.x*48) + 43)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[69]*(float32*)kernel.shared[((threadIdx.x*48) + 46)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[61]*(float32*)kernel.shared[((threadIdx.x*48) + 43)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[70]*(float32*)kernel.shared[((threadIdx.x*48) + 46)]))
             compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[56]*(float32*)kernel.shared[((threadIdx.x*48) + 20)]))
-            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[65]*(float32*)kernel.shared[((threadIdx.x*48) + 23)]))
             compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[57]*(float32*)kernel.shared[((threadIdx.x*48) + 20)]))
-            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[66]*(float32*)kernel.shared[((threadIdx.x*48) + 23)]))
             compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[58]*(float32*)kernel.shared[((threadIdx.x*48) + 20)]))
-            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[67]*(float32*)kernel.shared[((threadIdx.x*48) + 23)]))
             compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[59]*(float32*)kernel.shared[((threadIdx.x*48) + 20)]))
-            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[68]*(float32*)kernel.shared[((threadIdx.x*48) + 23)]))
             compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[60]*(float32*)kernel.shared[((threadIdx.x*48) + 20)]))
-            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[69]*(float32*)kernel.shared[((threadIdx.x*48) + 23)]))
             compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[61]*(float32*)kernel.shared[((threadIdx.x*48) + 20)]))
-            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[70]*(float32*)kernel.shared[((threadIdx.x*48) + 23)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[62]*(float32*)kernel.shared[((threadIdx.x*48) + 20)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[65]*(float32*)kernel.shared[((threadIdx.x*48) + 23)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[66]*(float32*)kernel.shared[((threadIdx.x*48) + 23)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[67]*(float32*)kernel.shared[((threadIdx.x*48) + 23)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[68]*(float32*)kernel.shared[((threadIdx.x*48) + 23)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[69]*(float32*)kernel.shared[((threadIdx.x*48) + 23)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[70]*(float32*)kernel.shared[((threadIdx.x*48) + 23)]))
             compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[71]*(float32*)kernel.shared[((threadIdx.x*48) + 23)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[56]*(float32*)kernel.shared[((threadIdx.x*48) + 44)]))
-            compute_3[7] = ((float32*)compute_3[7] + ((float32*)pad_temp.shared[65]*(float32*)kernel.shared[((threadIdx.x*48) + 47)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[57]*(float32*)kernel.shared[((threadIdx.x*48) + 44)]))
-            compute_3[8] = ((float32*)compute_3[8] + ((float32*)pad_temp.shared[66]*(float32*)kernel.shared[((threadIdx.x*48) + 47)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[58]*(float32*)kernel.shared[((threadIdx.x*48) + 44)]))
-            compute_3[9] = ((float32*)compute_3[9] + ((float32*)pad_temp.shared[67]*(float32*)kernel.shared[((threadIdx.x*48) + 47)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[59]*(float32*)kernel.shared[((threadIdx.x*48) + 44)]))
-            compute_3[10] = ((float32*)compute_3[10] + ((float32*)pad_temp.shared[68]*(float32*)kernel.shared[((threadIdx.x*48) + 47)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[60]*(float32*)kernel.shared[((threadIdx.x*48) + 44)]))
-            compute_3[11] = ((float32*)compute_3[11] + ((float32*)pad_temp.shared[69]*(float32*)kernel.shared[((threadIdx.x*48) + 47)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[61]*(float32*)kernel.shared[((threadIdx.x*48) + 44)]))
-            compute_3[12] = ((float32*)compute_3[12] + ((float32*)pad_temp.shared[70]*(float32*)kernel.shared[((threadIdx.x*48) + 47)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[62]*(float32*)kernel.shared[((threadIdx.x*48) + 44)]))
-            compute_3[13] = ((float32*)compute_3[13] + ((float32*)pad_temp.shared[71]*(float32*)kernel.shared[((threadIdx.x*48) + 47)]))
-          }
-        }
-        for (i1.inner: int32, 0, 2) {
-          for (i3.inner: int32, 0, 7) {
-            compute_2[(((((floordiv(blockIdx.x, 7)*6272) + (threadIdx.x*98)) + (i1.inner*49)) + (floormod(blockIdx.x, 7)*7)) + i3.inner)] = max(((float32*)compute_3[((i1.inner*7) + i3.inner)] + (float32*)bias_2[(((floordiv(blockIdx.x, 7)*128) + (threadIdx.x*2)) + i1.inner)]), 0f32)
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[72]*(float32*)kernel.shared[((threadIdx.x*48) + 24)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[73]*(float32*)kernel.shared[((threadIdx.x*48) + 24)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[74]*(float32*)kernel.shared[((threadIdx.x*48) + 24)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[75]*(float32*)kernel.shared[((threadIdx.x*48) + 24)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[76]*(float32*)kernel.shared[((threadIdx.x*48) + 24)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[77]*(float32*)kernel.shared[((threadIdx.x*48) + 24)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[78]*(float32*)kernel.shared[((threadIdx.x*48) + 24)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[81]*(float32*)kernel.shared[((threadIdx.x*48) + 27)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[82]*(float32*)kernel.shared[((threadIdx.x*48) + 27)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[83]*(float32*)kernel.shared[((threadIdx.x*48) + 27)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[84]*(float32*)kernel.shared[((threadIdx.x*48) + 27)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[85]*(float32*)kernel.shared[((threadIdx.x*48) + 27)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[86]*(float32*)kernel.shared[((threadIdx.x*48) + 27)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[87]*(float32*)kernel.shared[((threadIdx.x*48) + 27)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[73]*(float32*)kernel.shared[((threadIdx.x*48) + 25)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[74]*(float32*)kernel.shared[((threadIdx.x*48) + 25)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[75]*(float32*)kernel.shared[((threadIdx.x*48) + 25)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[76]*(float32*)kernel.shared[((threadIdx.x*48) + 25)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[77]*(float32*)kernel.shared[((threadIdx.x*48) + 25)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[78]*(float32*)kernel.shared[((threadIdx.x*48) + 25)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[79]*(float32*)kernel.shared[((threadIdx.x*48) + 25)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[82]*(float32*)kernel.shared[((threadIdx.x*48) + 28)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[83]*(float32*)kernel.shared[((threadIdx.x*48) + 28)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[84]*(float32*)kernel.shared[((threadIdx.x*48) + 28)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[85]*(float32*)kernel.shared[((threadIdx.x*48) + 28)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[86]*(float32*)kernel.shared[((threadIdx.x*48) + 28)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[87]*(float32*)kernel.shared[((threadIdx.x*48) + 28)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[88]*(float32*)kernel.shared[((threadIdx.x*48) + 28)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[74]*(float32*)kernel.shared[((threadIdx.x*48) + 26)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[75]*(float32*)kernel.shared[((threadIdx.x*48) + 26)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[76]*(float32*)kernel.shared[((threadIdx.x*48) + 26)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[77]*(float32*)kernel.shared[((threadIdx.x*48) + 26)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[78]*(float32*)kernel.shared[((threadIdx.x*48) + 26)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[79]*(float32*)kernel.shared[((threadIdx.x*48) + 26)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[80]*(float32*)kernel.shared[((threadIdx.x*48) + 26)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[83]*(float32*)kernel.shared[((threadIdx.x*48) + 29)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[84]*(float32*)kernel.shared[((threadIdx.x*48) + 29)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[85]*(float32*)kernel.shared[((threadIdx.x*48) + 29)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[86]*(float32*)kernel.shared[((threadIdx.x*48) + 29)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[87]*(float32*)kernel.shared[((threadIdx.x*48) + 29)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[88]*(float32*)kernel.shared[((threadIdx.x*48) + 29)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[89]*(float32*)kernel.shared[((threadIdx.x*48) + 29)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[90]*(float32*)kernel.shared[((threadIdx.x*48) + 30)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[91]*(float32*)kernel.shared[((threadIdx.x*48) + 30)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[92]*(float32*)kernel.shared[((threadIdx.x*48) + 30)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[93]*(float32*)kernel.shared[((threadIdx.x*48) + 30)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[94]*(float32*)kernel.shared[((threadIdx.x*48) + 30)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[95]*(float32*)kernel.shared[((threadIdx.x*48) + 30)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[96]*(float32*)kernel.shared[((threadIdx.x*48) + 30)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[99]*(float32*)kernel.shared[((threadIdx.x*48) + 33)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[100]*(float32*)kernel.shared[((threadIdx.x*48) + 33)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[101]*(float32*)kernel.shared[((threadIdx.x*48) + 33)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[102]*(float32*)kernel.shared[((threadIdx.x*48) + 33)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[103]*(float32*)kernel.shared[((threadIdx.x*48) + 33)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[104]*(float32*)kernel.shared[((threadIdx.x*48) + 33)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[105]*(float32*)kernel.shared[((threadIdx.x*48) + 33)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[91]*(float32*)kernel.shared[((threadIdx.x*48) + 31)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[92]*(float32*)kernel.shared[((threadIdx.x*48) + 31)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[93]*(float32*)kernel.shared[((threadIdx.x*48) + 31)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[94]*(float32*)kernel.shared[((threadIdx.x*48) + 31)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[95]*(float32*)kernel.shared[((threadIdx.x*48) + 31)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[96]*(float32*)kernel.shared[((threadIdx.x*48) + 31)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[97]*(float32*)kernel.shared[((threadIdx.x*48) + 31)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[100]*(float32*)kernel.shared[((threadIdx.x*48) + 34)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[101]*(float32*)kernel.shared[((threadIdx.x*48) + 34)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[102]*(float32*)kernel.shared[((threadIdx.x*48) + 34)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[103]*(float32*)kernel.shared[((threadIdx.x*48) + 34)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[104]*(float32*)kernel.shared[((threadIdx.x*48) + 34)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[105]*(float32*)kernel.shared[((threadIdx.x*48) + 34)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[106]*(float32*)kernel.shared[((threadIdx.x*48) + 34)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[92]*(float32*)kernel.shared[((threadIdx.x*48) + 32)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[93]*(float32*)kernel.shared[((threadIdx.x*48) + 32)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[94]*(float32*)kernel.shared[((threadIdx.x*48) + 32)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[95]*(float32*)kernel.shared[((threadIdx.x*48) + 32)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[96]*(float32*)kernel.shared[((threadIdx.x*48) + 32)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[97]*(float32*)kernel.shared[((threadIdx.x*48) + 32)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[98]*(float32*)kernel.shared[((threadIdx.x*48) + 32)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[101]*(float32*)kernel.shared[((threadIdx.x*48) + 35)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[102]*(float32*)kernel.shared[((threadIdx.x*48) + 35)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[103]*(float32*)kernel.shared[((threadIdx.x*48) + 35)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[104]*(float32*)kernel.shared[((threadIdx.x*48) + 35)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[105]*(float32*)kernel.shared[((threadIdx.x*48) + 35)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[106]*(float32*)kernel.shared[((threadIdx.x*48) + 35)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[107]*(float32*)kernel.shared[((threadIdx.x*48) + 35)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[108]*(float32*)kernel.shared[((threadIdx.x*48) + 36)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[109]*(float32*)kernel.shared[((threadIdx.x*48) + 36)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[110]*(float32*)kernel.shared[((threadIdx.x*48) + 36)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[111]*(float32*)kernel.shared[((threadIdx.x*48) + 36)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[112]*(float32*)kernel.shared[((threadIdx.x*48) + 36)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[113]*(float32*)kernel.shared[((threadIdx.x*48) + 36)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[114]*(float32*)kernel.shared[((threadIdx.x*48) + 36)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[117]*(float32*)kernel.shared[((threadIdx.x*48) + 39)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[118]*(float32*)kernel.shared[((threadIdx.x*48) + 39)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[119]*(float32*)kernel.shared[((threadIdx.x*48) + 39)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[120]*(float32*)kernel.shared[((threadIdx.x*48) + 39)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[121]*(float32*)kernel.shared[((threadIdx.x*48) + 39)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[122]*(float32*)kernel.shared[((threadIdx.x*48) + 39)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[123]*(float32*)kernel.shared[((threadIdx.x*48) + 39)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[109]*(float32*)kernel.shared[((threadIdx.x*48) + 37)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[110]*(float32*)kernel.shared[((threadIdx.x*48) + 37)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[111]*(float32*)kernel.shared[((threadIdx.x*48) + 37)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[112]*(float32*)kernel.shared[((threadIdx.x*48) + 37)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[113]*(float32*)kernel.shared[((threadIdx.x*48) + 37)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[114]*(float32*)kernel.shared[((threadIdx.x*48) + 37)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[115]*(float32*)kernel.shared[((threadIdx.x*48) + 37)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[118]*(float32*)kernel.shared[((threadIdx.x*48) + 40)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[119]*(float32*)kernel.shared[((threadIdx.x*48) + 40)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[120]*(float32*)kernel.shared[((threadIdx.x*48) + 40)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[121]*(float32*)kernel.shared[((threadIdx.x*48) + 40)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[122]*(float32*)kernel.shared[((threadIdx.x*48) + 40)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[123]*(float32*)kernel.shared[((threadIdx.x*48) + 40)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[124]*(float32*)kernel.shared[((threadIdx.x*48) + 40)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[110]*(float32*)kernel.shared[((threadIdx.x*48) + 38)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[111]*(float32*)kernel.shared[((threadIdx.x*48) + 38)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[112]*(float32*)kernel.shared[((threadIdx.x*48) + 38)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[113]*(float32*)kernel.shared[((threadIdx.x*48) + 38)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[114]*(float32*)kernel.shared[((threadIdx.x*48) + 38)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[115]*(float32*)kernel.shared[((threadIdx.x*48) + 38)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[116]*(float32*)kernel.shared[((threadIdx.x*48) + 38)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[119]*(float32*)kernel.shared[((threadIdx.x*48) + 41)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[120]*(float32*)kernel.shared[((threadIdx.x*48) + 41)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[121]*(float32*)kernel.shared[((threadIdx.x*48) + 41)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[122]*(float32*)kernel.shared[((threadIdx.x*48) + 41)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[123]*(float32*)kernel.shared[((threadIdx.x*48) + 41)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[124]*(float32*)kernel.shared[((threadIdx.x*48) + 41)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[125]*(float32*)kernel.shared[((threadIdx.x*48) + 41)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[126]*(float32*)kernel.shared[((threadIdx.x*48) + 42)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[127]*(float32*)kernel.shared[((threadIdx.x*48) + 42)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[128]*(float32*)kernel.shared[((threadIdx.x*48) + 42)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[129]*(float32*)kernel.shared[((threadIdx.x*48) + 42)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[130]*(float32*)kernel.shared[((threadIdx.x*48) + 42)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[131]*(float32*)kernel.shared[((threadIdx.x*48) + 42)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[132]*(float32*)kernel.shared[((threadIdx.x*48) + 42)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[135]*(float32*)kernel.shared[((threadIdx.x*48) + 45)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[136]*(float32*)kernel.shared[((threadIdx.x*48) + 45)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[137]*(float32*)kernel.shared[((threadIdx.x*48) + 45)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[138]*(float32*)kernel.shared[((threadIdx.x*48) + 45)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[139]*(float32*)kernel.shared[((threadIdx.x*48) + 45)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[140]*(float32*)kernel.shared[((threadIdx.x*48) + 45)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[141]*(float32*)kernel.shared[((threadIdx.x*48) + 45)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[127]*(float32*)kernel.shared[((threadIdx.x*48) + 43)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[128]*(float32*)kernel.shared[((threadIdx.x*48) + 43)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[129]*(float32*)kernel.shared[((threadIdx.x*48) + 43)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[130]*(float32*)kernel.shared[((threadIdx.x*48) + 43)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[131]*(float32*)kernel.shared[((threadIdx.x*48) + 43)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[132]*(float32*)kernel.shared[((threadIdx.x*48) + 43)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[133]*(float32*)kernel.shared[((threadIdx.x*48) + 43)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[136]*(float32*)kernel.shared[((threadIdx.x*48) + 46)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[137]*(float32*)kernel.shared[((threadIdx.x*48) + 46)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[138]*(float32*)kernel.shared[((threadIdx.x*48) + 46)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[139]*(float32*)kernel.shared[((threadIdx.x*48) + 46)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[140]*(float32*)kernel.shared[((threadIdx.x*48) + 46)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[141]*(float32*)kernel.shared[((threadIdx.x*48) + 46)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[142]*(float32*)kernel.shared[((threadIdx.x*48) + 46)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[128]*(float32*)kernel.shared[((threadIdx.x*48) + 44)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[129]*(float32*)kernel.shared[((threadIdx.x*48) + 44)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[130]*(float32*)kernel.shared[((threadIdx.x*48) + 44)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[131]*(float32*)kernel.shared[((threadIdx.x*48) + 44)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[132]*(float32*)kernel.shared[((threadIdx.x*48) + 44)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[133]*(float32*)kernel.shared[((threadIdx.x*48) + 44)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[134]*(float32*)kernel.shared[((threadIdx.x*48) + 44)]))
+            compute_3[0] = ((float32*)compute_3[0] + ((float32*)pad_temp.shared[137]*(float32*)kernel.shared[((threadIdx.x*48) + 47)]))
+            compute_3[1] = ((float32*)compute_3[1] + ((float32*)pad_temp.shared[138]*(float32*)kernel.shared[((threadIdx.x*48) + 47)]))
+            compute_3[2] = ((float32*)compute_3[2] + ((float32*)pad_temp.shared[139]*(float32*)kernel.shared[((threadIdx.x*48) + 47)]))
+            compute_3[3] = ((float32*)compute_3[3] + ((float32*)pad_temp.shared[140]*(float32*)kernel.shared[((threadIdx.x*48) + 47)]))
+            compute_3[4] = ((float32*)compute_3[4] + ((float32*)pad_temp.shared[141]*(float32*)kernel.shared[((threadIdx.x*48) + 47)]))
+            compute_3[5] = ((float32*)compute_3[5] + ((float32*)pad_temp.shared[142]*(float32*)kernel.shared[((threadIdx.x*48) + 47)]))
+            compute_3[6] = ((float32*)compute_3[6] + ((float32*)pad_temp.shared[143]*(float32*)kernel.shared[((threadIdx.x*48) + 47)]))
           }
         }
+        compute_2[(((floordiv(blockIdx.x, 7)*3136) + (threadIdx.x*49)) + floormod(blockIdx.x, 7))] = max(((float32*)compute_3[0] + (float32*)bias_2[((floordiv(blockIdx.x, 7)*64) + threadIdx.x)]), 0f32)
+        compute_2[((((floordiv(blockIdx.x, 7)*3136) + (threadIdx.x*49)) + floormod(blockIdx.x, 7)) + 7)] = max(((float32*)compute_3[1] + (float32*)bias_2[((floordiv(blockIdx.x, 7)*64) + threadIdx.x)]), 0f32)
+        compute_2[((((floordiv(blockIdx.x, 7)*3136) + (threadIdx.x*49)) + floormod(blockIdx.x, 7)) + 14)] = max(((float32*)compute_3[2] + (float32*)bias_2[((floordiv(blockIdx.x, 7)*64) + threadIdx.x)]), 0f32)
+        compute_2[((((floordiv(blockIdx.x, 7)*3136) + (threadIdx.x*49)) + floormod(blockIdx.x, 7)) + 21)] = max(((float32*)compute_3[3] + (float32*)bias_2[((floordiv(blockIdx.x, 7)*64) + threadIdx.x)]), 0f32)
+        compute_2[((((floordiv(blockIdx.x, 7)*3136) + (threadIdx.x*49)) + floormod(blockIdx.x, 7)) + 28)] = max(((float32*)compute_3[4] + (float32*)bias_2[((floordiv(blockIdx.x, 7)*64) + threadIdx.x)]), 0f32)
+        compute_2[((((floordiv(blockIdx.x, 7)*3136) + (threadIdx.x*49)) + floormod(blockIdx.x, 7)) + 35)] = max(((float32*)compute_3[5] + (float32*)bias_2[((floordiv(blockIdx.x, 7)*64) + threadIdx.x)]), 0f32)
+        compute_2[((((floordiv(blockIdx.x, 7)*3136) + (threadIdx.x*49)) + floormod(blockIdx.x, 7)) + 42)] = max(((float32*)compute_3[6] + (float32*)bias_2[((floordiv(blockIdx.x, 7)*64) + threadIdx.x)]), 0f32)
       }
     }
 
@@ -748,7 +738,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 0.407 ms
+    Execution time of this operator: 0.322 ms
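
The measured time above comes from building the tuned schedule and timing it on the GPU. As a rough, hedged sketch of how such a measurement is usually taken (this is not part of the generated diff; ``task``, ``log_file`` and the conv2d shapes are assumptions based on this tutorial's workload):

.. code-block:: python

    import numpy as np
    import tvm

    # Apply the best schedule found in the tuning log and build it for CUDA.
    sch, args = task.apply_best(log_file)
    func = tvm.build(sch, args, target="cuda")

    # Random inputs matching the assumed workload: N=1, CI=CO=512, H=W=7, 3x3 kernel.
    dev = tvm.cuda()
    data = tvm.nd.array(np.random.uniform(size=(1, 512, 7, 7)).astype("float32"), dev)
    weight = tvm.nd.array(np.random.uniform(size=(512, 512, 3, 3)).astype("float32"), dev)
    bias = tvm.nd.array(np.random.uniform(size=(1, 512, 1, 1)).astype("float32"), dev)
    out = tvm.nd.empty((1, 512, 7, 7), device=dev)

    # Time the kernel; the median of repeated runs smooths out launch noise.
    evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
    print("Execution time of this operator: %.3f ms"
          % (np.median(evaluator(data, weight, bias, out).results) * 1000))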
 
 
 
@@ -793,34 +783,34 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     compute_nn_o_o_o_i, compute_nn_o_o_i = s[compute].split(compute_nn_o_o_i, factor=1)
     compute_nn_o_o_o_o, compute_nn_o_o_o_i = s[compute].split(compute_nn_o_o_o_i, factor=1)
     compute_ff_o_i, compute_ff_i = s[compute].split(compute_ff, factor=1)
-    compute_ff_o_o_i, compute_ff_o_i = s[compute].split(compute_ff_o_i, factor=2)
+    compute_ff_o_o_i, compute_ff_o_i = s[compute].split(compute_ff_o_i, factor=1)
     compute_ff_o_o_o_i, compute_ff_o_o_i = s[compute].split(compute_ff_o_o_i, factor=64)
     compute_ff_o_o_o_o, compute_ff_o_o_o_i = s[compute].split(compute_ff_o_o_o_i, factor=1)
     compute_yy_o_i, compute_yy_i = s[compute].split(compute_yy, factor=1)
     compute_yy_o_o_i, compute_yy_o_i = s[compute].split(compute_yy_o_i, factor=1)
     compute_yy_o_o_o_i, compute_yy_o_o_i = s[compute].split(compute_yy_o_o_i, factor=1)
-    compute_yy_o_o_o_o, compute_yy_o_o_o_i = s[compute].split(compute_yy_o_o_o_i, factor=1)
+    compute_yy_o_o_o_o, compute_yy_o_o_o_i = s[compute].split(compute_yy_o_o_o_i, factor=7)
     compute_xx_o_i, compute_xx_i = s[compute].split(compute_xx, factor=1)
-    compute_xx_o_o_i, compute_xx_o_i = s[compute].split(compute_xx_o_i, factor=7)
+    compute_xx_o_o_i, compute_xx_o_i = s[compute].split(compute_xx_o_i, factor=1)
     compute_xx_o_o_o_i, compute_xx_o_o_i = s[compute].split(compute_xx_o_o_i, factor=1)
     compute_xx_o_o_o_o, compute_xx_o_o_o_i = s[compute].split(compute_xx_o_o_o_i, factor=1)
     compute_rc_o_i, compute_rc_i = s[compute].split(compute_rc, factor=2)
-    compute_rc_o_o, compute_rc_o_i = s[compute].split(compute_rc_o_i, factor=4)
+    compute_rc_o_o, compute_rc_o_i = s[compute].split(compute_rc_o_i, factor=8)
     compute_ry_o_i, compute_ry_i = s[compute].split(compute_ry, factor=1)
-    compute_ry_o_o, compute_ry_o_i = s[compute].split(compute_ry_o_i, factor=1)
+    compute_ry_o_o, compute_ry_o_i = s[compute].split(compute_ry_o_i, factor=3)
     compute_rx_o_i, compute_rx_i = s[compute].split(compute_rx, factor=1)
-    compute_rx_o_o, compute_rx_o_i = s[compute].split(compute_rx_o_i, factor=3)
+    compute_rx_o_o, compute_rx_o_i = s[compute].split(compute_rx_o_i, factor=1)
     s[compute].reorder(compute_nn_o_o_o_o, compute_ff_o_o_o_o, compute_yy_o_o_o_o, compute_xx_o_o_o_o, compute_nn_o_o_o_i, compute_ff_o_o_o_i, compute_yy_o_o_o_i, compute_xx_o_o_o_i, compute_nn_o_o_i, compute_ff_o_o_i, compute_yy_o_o_i, compute_xx_o_o_i, compute_rc_o_o, compute_ry_o_o, compute_rx_o_o, compute_rc_o_i, compute_ry_o_i, compute_rx_o_i, compute_nn_o_i, compute_ff_o_i, compute_yy_o_i, compute_xx_o_i, compute_rc_i, compute_ry_i, compute_rx_i, compute_nn_i, compute_ff_i, compute [...]
     compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
     compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
     compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
-    compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
+    compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=1)
     compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=64)
     compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
     compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
     compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
-    compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
-    compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
+    compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=7)
+    compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
     compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
     compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
     s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
@@ -844,7 +834,7 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
     s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
     pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
-    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
+    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
     s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
     pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
     s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
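
As a hedged aside (not part of the diff itself): listings like the equivalent Python schedule above and the CUDA source that follows are typically dumped from the tuning log with ``SearchTask.print_best``; the exact API and the ``task``/``log_file`` names are assumptions and may differ across TVM versions.

.. code-block:: python

    # Assumes `task` is the auto_scheduler.SearchTask and `log_file` its tuning log.
    print("Equivalent python schedule:")
    print(task.print_best(log_file, print_mode="schedule"))

    print("CUDA source code:")
    print(task.print_best(log_file, print_mode="cuda"))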
@@ -867,8 +857,8 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
       #define uint64_t unsigned long long
     #endif
     extern "C" __global__ void default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
-      float compute1[14];
-      __shared__ float pad_temp_shared[72];
+      float compute1[7];
+      __shared__ float pad_temp_shared[144];
       __shared__ float kernel_shared[3072];
       compute1[(0)] = 0.000000e+00f;
       compute1[(1)] = 0.000000e+00f;
@@ -877,420 +867,408 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
       compute1[(4)] = 0.000000e+00f;
       compute1[(5)] = 0.000000e+00f;
       compute1[(6)] = 0.000000e+00f;
-      compute1[(7)] = 0.000000e+00f;
-      compute1[(8)] = 0.000000e+00f;
-      compute1[(9)] = 0.000000e+00f;
-      compute1[(10)] = 0.000000e+00f;
-      compute1[(11)] = 0.000000e+00f;
-      compute1[(12)] = 0.000000e+00f;
-      compute1[(13)] = 0.000000e+00f;
-      for (int rc_outer_outer = 0; rc_outer_outer < 64; ++rc_outer_outer) {
-        for (int ry_outer_outer = 0; ry_outer_outer < 3; ++ry_outer_outer) {
+      for (int rc_outer_outer = 0; rc_outer_outer < 32; ++rc_outer_outer) {
+        for (int rx_outer_outer = 0; rx_outer_outer < 3; ++rx_outer_outer) {
           __syncthreads();
-          if (((int)threadIdx.x) < 18) {
-            pad_temp_shared[((((int)threadIdx.x) * 4))] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) * 4) % 9))) && (((((int)threadIdx.x) * 4) % 9) < 8)) ? data[(((((((rc_outer_outer * 392) + (((((int)threadIdx.x) * 4) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) * 4) % 9)) - 8))] : 0.000000e+00f);
-          }
-          if (((int)threadIdx.x) < 18) {
-            pad_temp_shared[(((((int)threadIdx.x) * 4) + 1))] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 1) % 9))) && ((((((int)threadIdx.x) * 4) + 1) % 9) < 8)) ? data[(((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 1) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 1) % 9)) - 8))] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x))] = (((((1 <= (((int)threadIdx.x) % 9)) && ((((int)threadIdx.x) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)blockIdx.x) % 7)))) && ((rx_outer_outer + (((int)blockIdx.x) % 7)) < 8)) ? data[(((((((rc_outer_outer * 784) + ((((int)threadIdx.x) / 9) * 49)) + ((((int)threadIdx.x) % 9) * 7)) + rx_outer_outer) + (((int)blockIdx.x) % 7)) - 8))] : 0.000000e+00f);
+          pad_temp_shared[((((int)threadIdx.x) + 64))] = (((((1 <= ((((int)threadIdx.x) + 1) % 9)) && (((((int)threadIdx.x) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)blockIdx.x) % 7)))) && ((rx_outer_outer + (((int)blockIdx.x) % 7)) < 8)) ? data[(((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 64) / 9) * 49)) + (((((int)threadIdx.x) + 1) % 9) * 7)) + rx_outer_outer) + (((int)blockIdx.x) % 7)) - 8))] : 0.000000e+00f);
+          if (((int)threadIdx.x) < 16) {
+            pad_temp_shared[((((int)threadIdx.x) + 128))] = (((((1 <= ((((int)threadIdx.x) + 2) % 9)) && (((((int)threadIdx.x) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)blockIdx.x) % 7)))) && ((rx_outer_outer + (((int)blockIdx.x) % 7)) < 8)) ? data[(((((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 128) / 9) * 49)) + (((((int)threadIdx.x) + 2) % 9) * 7)) + rx_outer_outer) + (((int)blockIdx.x) % 7)) - 8))] : 0.000000e+00f);
           }
-          if (((int)threadIdx.x) < 18) {
-            pad_temp_shared[(((((int)threadIdx.x) * 4) + 2))] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 2) % 9))) && ((((((int)threadIdx.x) * 4) + 2) % 9) < 8)) ? data[(((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 2) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 2) % 9)) - 8))] : 0.000000e+00f);
-          }
-          if (((int)threadIdx.x) < 18) {
-            pad_temp_shared[(((((int)threadIdx.x) * 4) + 3))] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 3) % 9))) && ((((((int)threadIdx.x) * 4) + 3) % 9) < 8)) ? data[(((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 3) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 3) % 9)) - 8))] : 0.000000e+00f);
-          }
-          kernel_shared[(((int)threadIdx.x))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 64))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 64) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 128))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 128) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 192))] = kernel[(((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 36864))];
-          kernel_shared[((((int)threadIdx.x) + 256))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 256) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 320))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 320) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 384))] = kernel[(((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 73728))];
-          kernel_shared[((((int)threadIdx.x) + 448))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 448) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 512))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 512) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 576))] = kernel[(((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 110592))];
-          kernel_shared[((((int)threadIdx.x) + 640))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 640) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 704))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 704) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 147456))];
-          kernel_shared[((((int)threadIdx.x) + 832))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 832) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 896))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 896) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 960))] = kernel[(((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 184320))];
-          kernel_shared[((((int)threadIdx.x) + 1024))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1024) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 1088))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1088) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 1152))] = kernel[(((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 221184))];
-          kernel_shared[((((int)threadIdx.x) + 1216))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1216) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 1280))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1280) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 1344))] = kernel[(((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 258048))];
-          kernel_shared[((((int)threadIdx.x) + 1408))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1408) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 1472))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1472) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 294912))];
-          kernel_shared[((((int)threadIdx.x) + 1600))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1600) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 1664))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1664) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 1728))] = kernel[(((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 331776))];
-          kernel_shared[((((int)threadIdx.x) + 1792))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1792) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 1856))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1856) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 1920))] = kernel[(((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 368640))];
-          kernel_shared[((((int)threadIdx.x) + 1984))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1984) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 2048))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2048) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 2112))] = kernel[(((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 405504))];
-          kernel_shared[((((int)threadIdx.x) + 2176))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2176) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 2240))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2240) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 2304))] = kernel[(((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 442368))];
-          kernel_shared[((((int)threadIdx.x) + 2368))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2368) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 2432))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2432) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 2496))] = kernel[(((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 479232))];
-          kernel_shared[((((int)threadIdx.x) + 2560))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2560) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 2624))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2624) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 2688))] = kernel[(((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 516096))];
-          kernel_shared[((((int)threadIdx.x) + 2752))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2752) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 2816))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2816) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 2880))] = kernel[(((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 552960))];
-          kernel_shared[((((int)threadIdx.x) + 2944))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2944) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3)))];
-          kernel_shared[((((int)threadIdx.x) + 3008))] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3008) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3)))];
+          kernel_shared[(((int)threadIdx.x))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 64))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 64) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 128))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 128) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 192))] = kernel[((((((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer) + 18432))];
+          kernel_shared[((((int)threadIdx.x) + 256))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 256) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 320))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 320) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 384))] = kernel[((((((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer) + 36864))];
+          kernel_shared[((((int)threadIdx.x) + 448))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 448) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 512))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 512) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 576))] = kernel[((((((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer) + 55296))];
+          kernel_shared[((((int)threadIdx.x) + 640))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 640) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 704))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 704) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 768))] = kernel[((((((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer) + 73728))];
+          kernel_shared[((((int)threadIdx.x) + 832))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 832) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 896))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 896) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 960))] = kernel[((((((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer) + 92160))];
+          kernel_shared[((((int)threadIdx.x) + 1024))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 1024) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 1088))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 1088) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 1152))] = kernel[((((((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer) + 110592))];
+          kernel_shared[((((int)threadIdx.x) + 1216))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 1216) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 1280))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 1280) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 1344))] = kernel[((((((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer) + 129024))];
+          kernel_shared[((((int)threadIdx.x) + 1408))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 1408) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 1472))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 1472) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 1536))] = kernel[((((((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer) + 147456))];
+          kernel_shared[((((int)threadIdx.x) + 1600))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 1600) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 1664))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 1664) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 1728))] = kernel[((((((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer) + 165888))];
+          kernel_shared[((((int)threadIdx.x) + 1792))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 1792) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 1856))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 1856) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 1920))] = kernel[((((((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer) + 184320))];
+          kernel_shared[((((int)threadIdx.x) + 1984))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 1984) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 2048))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 2048) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 2112))] = kernel[((((((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer) + 202752))];
+          kernel_shared[((((int)threadIdx.x) + 2176))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 2176) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 2240))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 2240) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 2304))] = kernel[((((((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer) + 221184))];
+          kernel_shared[((((int)threadIdx.x) + 2368))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 2368) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 2432))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 2432) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 2496))] = kernel[((((((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer) + 239616))];
+          kernel_shared[((((int)threadIdx.x) + 2560))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 2560) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 2624))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 2624) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 2688))] = kernel[((((((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer) + 258048))];
+          kernel_shared[((((int)threadIdx.x) + 2752))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 2752) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 2816))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 2816) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 2880))] = kernel[((((((((((int)blockIdx.x) / 7) * 294912) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer) + 276480))];
+          kernel_shared[((((int)threadIdx.x) + 2944))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 2944) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 16) % 48) * 3)) + rx_outer_outer))];
+          kernel_shared[((((int)threadIdx.x) + 3008))] = kernel[(((((((((int)blockIdx.x) / 7) * 294912) + (((((int)threadIdx.x) + 3008) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 32) % 48) * 3)) + rx_outer_outer))];
           __syncthreads();
           compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(0)] * kernel_shared[((((int)threadIdx.x) * 48))]));
-          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(9)] * kernel_shared[(((((int)threadIdx.x) * 48) + 3))]));
           compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(1)] * kernel_shared[((((int)threadIdx.x) * 48))]));
-          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(10)] * kernel_shared[(((((int)threadIdx.x) * 48) + 3))]));
           compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(2)] * kernel_shared[((((int)threadIdx.x) * 48))]));
-          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(11)] * kernel_shared[(((((int)threadIdx.x) * 48) + 3))]));
           compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(3)] * kernel_shared[((((int)threadIdx.x) * 48))]));
-          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(12)] * kernel_shared[(((((int)threadIdx.x) * 48) + 3))]));
           compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(4)] * kernel_shared[((((int)threadIdx.x) * 48))]));
-          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(13)] * kernel_shared[(((((int)threadIdx.x) * 48) + 3))]));
           compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(5)] * kernel_shared[((((int)threadIdx.x) * 48))]));
-          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(14)] * kernel_shared[(((((int)threadIdx.x) * 48) + 3))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(6)] * kernel_shared[((((int)threadIdx.x) * 48))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(9)] * kernel_shared[(((((int)threadIdx.x) * 48) + 3))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(10)] * kernel_shared[(((((int)threadIdx.x) * 48) + 3))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(11)] * kernel_shared[(((((int)threadIdx.x) * 48) + 3))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(12)] * kernel_shared[(((((int)threadIdx.x) * 48) + 3))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(13)] * kernel_shared[(((((int)threadIdx.x) * 48) + 3))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(14)] * kernel_shared[(((((int)threadIdx.x) * 48) + 3))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(15)] * kernel_shared[(((((int)threadIdx.x) * 48) + 3))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(0)] * kernel_shared[(((((int)threadIdx.x) * 48) + 24))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(9)] * kernel_shared[(((((int)threadIdx.x) * 48) + 27))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(1)] * kernel_shared[(((((int)threadIdx.x) * 48) + 24))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(10)] * kernel_shared[(((((int)threadIdx.x) * 48) + 27))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(2)] * kernel_shared[(((((int)threadIdx.x) * 48) + 24))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(11)] * kernel_shared[(((((int)threadIdx.x) * 48) + 27))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(3)] * kernel_shared[(((((int)threadIdx.x) * 48) + 24))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(12)] * kernel_shared[(((((int)threadIdx.x) * 48) + 27))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(4)] * kernel_shared[(((((int)threadIdx.x) * 48) + 24))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(13)] * kernel_shared[(((((int)threadIdx.x) * 48) + 27))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(5)] * kernel_shared[(((((int)threadIdx.x) * 48) + 24))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(14)] * kernel_shared[(((((int)threadIdx.x) * 48) + 27))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(6)] * kernel_shared[(((((int)threadIdx.x) * 48) + 24))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(15)] * kernel_shared[(((((int)threadIdx.x) * 48) + 27))]));
           compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(1)] * kernel_shared[(((((int)threadIdx.x) * 48) + 1))]));
-          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(10)] * kernel_shared[(((((int)threadIdx.x) * 48) + 4))]));
           compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(2)] * kernel_shared[(((((int)threadIdx.x) * 48) + 1))]));
-          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(11)] * kernel_shared[(((((int)threadIdx.x) * 48) + 4))]));
           compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(3)] * kernel_shared[(((((int)threadIdx.x) * 48) + 1))]));
-          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(12)] * kernel_shared[(((((int)threadIdx.x) * 48) + 4))]));
           compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(4)] * kernel_shared[(((((int)threadIdx.x) * 48) + 1))]));
-          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(13)] * kernel_shared[(((((int)threadIdx.x) * 48) + 4))]));
           compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(5)] * kernel_shared[(((((int)threadIdx.x) * 48) + 1))]));
-          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(14)] * kernel_shared[(((((int)threadIdx.x) * 48) + 4))]));
           compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(6)] * kernel_shared[(((((int)threadIdx.x) * 48) + 1))]));
-          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(15)] * kernel_shared[(((((int)threadIdx.x) * 48) + 4))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(7)] * kernel_shared[(((((int)threadIdx.x) * 48) + 1))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(10)] * kernel_shared[(((((int)threadIdx.x) * 48) + 4))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(11)] * kernel_shared[(((((int)threadIdx.x) * 48) + 4))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(12)] * kernel_shared[(((((int)threadIdx.x) * 48) + 4))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(13)] * kernel_shared[(((((int)threadIdx.x) * 48) + 4))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(14)] * kernel_shared[(((((int)threadIdx.x) * 48) + 4))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(15)] * kernel_shared[(((((int)threadIdx.x) * 48) + 4))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(16)] * kernel_shared[(((((int)threadIdx.x) * 48) + 4))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(1)] * kernel_shared[(((((int)threadIdx.x) * 48) + 25))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(10)] * kernel_shared[(((((int)threadIdx.x) * 48) + 28))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(2)] * kernel_shared[(((((int)threadIdx.x) * 48) + 25))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(11)] * kernel_shared[(((((int)threadIdx.x) * 48) + 28))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(3)] * kernel_shared[(((((int)threadIdx.x) * 48) + 25))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(12)] * kernel_shared[(((((int)threadIdx.x) * 48) + 28))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(4)] * kernel_shared[(((((int)threadIdx.x) * 48) + 25))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(13)] * kernel_shared[(((((int)threadIdx.x) * 48) + 28))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(5)] * kernel_shared[(((((int)threadIdx.x) * 48) + 25))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(14)] * kernel_shared[(((((int)threadIdx.x) * 48) + 28))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(6)] * kernel_shared[(((((int)threadIdx.x) * 48) + 25))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(15)] * kernel_shared[(((((int)threadIdx.x) * 48) + 28))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(7)] * kernel_shared[(((((int)threadIdx.x) * 48) + 25))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(16)] * kernel_shared[(((((int)threadIdx.x) * 48) + 28))]));
           compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(2)] * kernel_shared[(((((int)threadIdx.x) * 48) + 2))]));
-          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(11)] * kernel_shared[(((((int)threadIdx.x) * 48) + 5))]));
           compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(3)] * kernel_shared[(((((int)threadIdx.x) * 48) + 2))]));
-          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(12)] * kernel_shared[(((((int)threadIdx.x) * 48) + 5))]));
           compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(4)] * kernel_shared[(((((int)threadIdx.x) * 48) + 2))]));
-          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(13)] * kernel_shared[(((((int)threadIdx.x) * 48) + 5))]));
           compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(5)] * kernel_shared[(((((int)threadIdx.x) * 48) + 2))]));
-          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(14)] * kernel_shared[(((((int)threadIdx.x) * 48) + 5))]));
           compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(6)] * kernel_shared[(((((int)threadIdx.x) * 48) + 2))]));
-          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(15)] * kernel_shared[(((((int)threadIdx.x) * 48) + 5))]));
           compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(7)] * kernel_shared[(((((int)threadIdx.x) * 48) + 2))]));
-          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(16)] * kernel_shared[(((((int)threadIdx.x) * 48) + 5))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(8)] * kernel_shared[(((((int)threadIdx.x) * 48) + 2))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(11)] * kernel_shared[(((((int)threadIdx.x) * 48) + 5))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(12)] * kernel_shared[(((((int)threadIdx.x) * 48) + 5))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(13)] * kernel_shared[(((((int)threadIdx.x) * 48) + 5))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(14)] * kernel_shared[(((((int)threadIdx.x) * 48) + 5))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(15)] * kernel_shared[(((((int)threadIdx.x) * 48) + 5))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(16)] * kernel_shared[(((((int)threadIdx.x) * 48) + 5))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(17)] * kernel_shared[(((((int)threadIdx.x) * 48) + 5))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(2)] * kernel_shared[(((((int)threadIdx.x) * 48) + 26))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(11)] * kernel_shared[(((((int)threadIdx.x) * 48) + 29))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(3)] * kernel_shared[(((((int)threadIdx.x) * 48) + 26))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(12)] * kernel_shared[(((((int)threadIdx.x) * 48) + 29))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(4)] * kernel_shared[(((((int)threadIdx.x) * 48) + 26))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(13)] * kernel_shared[(((((int)threadIdx.x) * 48) + 29))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(5)] * kernel_shared[(((((int)threadIdx.x) * 48) + 26))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(14)] * kernel_shared[(((((int)threadIdx.x) * 48) + 29))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(6)] * kernel_shared[(((((int)threadIdx.x) * 48) + 26))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(15)] * kernel_shared[(((((int)threadIdx.x) * 48) + 29))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(7)] * kernel_shared[(((((int)threadIdx.x) * 48) + 26))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(16)] * kernel_shared[(((((int)threadIdx.x) * 48) + 29))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(8)] * kernel_shared[(((((int)threadIdx.x) * 48) + 26))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(17)] * kernel_shared[(((((int)threadIdx.x) * 48) + 29))]));
           compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(18)] * kernel_shared[(((((int)threadIdx.x) * 48) + 6))]));
-          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(27)] * kernel_shared[(((((int)threadIdx.x) * 48) + 9))]));
           compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(19)] * kernel_shared[(((((int)threadIdx.x) * 48) + 6))]));
-          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(28)] * kernel_shared[(((((int)threadIdx.x) * 48) + 9))]));
           compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(20)] * kernel_shared[(((((int)threadIdx.x) * 48) + 6))]));
-          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(29)] * kernel_shared[(((((int)threadIdx.x) * 48) + 9))]));
           compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(21)] * kernel_shared[(((((int)threadIdx.x) * 48) + 6))]));
-          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(30)] * kernel_shared[(((((int)threadIdx.x) * 48) + 9))]));
           compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(22)] * kernel_shared[(((((int)threadIdx.x) * 48) + 6))]));
-          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(31)] * kernel_shared[(((((int)threadIdx.x) * 48) + 9))]));
           compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(23)] * kernel_shared[(((((int)threadIdx.x) * 48) + 6))]));
-          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(32)] * kernel_shared[(((((int)threadIdx.x) * 48) + 9))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(24)] * kernel_shared[(((((int)threadIdx.x) * 48) + 6))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(27)] * kernel_shared[(((((int)threadIdx.x) * 48) + 9))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(28)] * kernel_shared[(((((int)threadIdx.x) * 48) + 9))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(29)] * kernel_shared[(((((int)threadIdx.x) * 48) + 9))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(30)] * kernel_shared[(((((int)threadIdx.x) * 48) + 9))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(31)] * kernel_shared[(((((int)threadIdx.x) * 48) + 9))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(32)] * kernel_shared[(((((int)threadIdx.x) * 48) + 9))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(33)] * kernel_shared[(((((int)threadIdx.x) * 48) + 9))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(18)] * kernel_shared[(((((int)threadIdx.x) * 48) + 30))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(27)] * kernel_shared[(((((int)threadIdx.x) * 48) + 33))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(19)] * kernel_shared[(((((int)threadIdx.x) * 48) + 30))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(28)] * kernel_shared[(((((int)threadIdx.x) * 48) + 33))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(20)] * kernel_shared[(((((int)threadIdx.x) * 48) + 30))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(29)] * kernel_shared[(((((int)threadIdx.x) * 48) + 33))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(21)] * kernel_shared[(((((int)threadIdx.x) * 48) + 30))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(30)] * kernel_shared[(((((int)threadIdx.x) * 48) + 33))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(22)] * kernel_shared[(((((int)threadIdx.x) * 48) + 30))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(31)] * kernel_shared[(((((int)threadIdx.x) * 48) + 33))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(23)] * kernel_shared[(((((int)threadIdx.x) * 48) + 30))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(32)] * kernel_shared[(((((int)threadIdx.x) * 48) + 33))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(24)] * kernel_shared[(((((int)threadIdx.x) * 48) + 30))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(33)] * kernel_shared[(((((int)threadIdx.x) * 48) + 33))]));
           compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(19)] * kernel_shared[(((((int)threadIdx.x) * 48) + 7))]));
-          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(28)] * kernel_shared[(((((int)threadIdx.x) * 48) + 10))]));
           compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(20)] * kernel_shared[(((((int)threadIdx.x) * 48) + 7))]));
-          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(29)] * kernel_shared[(((((int)threadIdx.x) * 48) + 10))]));
           compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(21)] * kernel_shared[(((((int)threadIdx.x) * 48) + 7))]));
-          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(30)] * kernel_shared[(((((int)threadIdx.x) * 48) + 10))]));
           compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(22)] * kernel_shared[(((((int)threadIdx.x) * 48) + 7))]));
-          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(31)] * kernel_shared[(((((int)threadIdx.x) * 48) + 10))]));
           compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(23)] * kernel_shared[(((((int)threadIdx.x) * 48) + 7))]));
-          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(32)] * kernel_shared[(((((int)threadIdx.x) * 48) + 10))]));
           compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(24)] * kernel_shared[(((((int)threadIdx.x) * 48) + 7))]));
-          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(33)] * kernel_shared[(((((int)threadIdx.x) * 48) + 10))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(25)] * kernel_shared[(((((int)threadIdx.x) * 48) + 7))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(28)] * kernel_shared[(((((int)threadIdx.x) * 48) + 10))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(29)] * kernel_shared[(((((int)threadIdx.x) * 48) + 10))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(30)] * kernel_shared[(((((int)threadIdx.x) * 48) + 10))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(31)] * kernel_shared[(((((int)threadIdx.x) * 48) + 10))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(32)] * kernel_shared[(((((int)threadIdx.x) * 48) + 10))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(33)] * kernel_shared[(((((int)threadIdx.x) * 48) + 10))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(34)] * kernel_shared[(((((int)threadIdx.x) * 48) + 10))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(19)] * kernel_shared[(((((int)threadIdx.x) * 48) + 31))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(28)] * kernel_shared[(((((int)threadIdx.x) * 48) + 34))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(20)] * kernel_shared[(((((int)threadIdx.x) * 48) + 31))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(29)] * kernel_shared[(((((int)threadIdx.x) * 48) + 34))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(21)] * kernel_shared[(((((int)threadIdx.x) * 48) + 31))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(30)] * kernel_shared[(((((int)threadIdx.x) * 48) + 34))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(22)] * kernel_shared[(((((int)threadIdx.x) * 48) + 31))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(31)] * kernel_shared[(((((int)threadIdx.x) * 48) + 34))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(23)] * kernel_shared[(((((int)threadIdx.x) * 48) + 31))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(32)] * kernel_shared[(((((int)threadIdx.x) * 48) + 34))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(24)] * kernel_shared[(((((int)threadIdx.x) * 48) + 31))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(33)] * kernel_shared[(((((int)threadIdx.x) * 48) + 34))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(25)] * kernel_shared[(((((int)threadIdx.x) * 48) + 31))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(34)] * kernel_shared[(((((int)threadIdx.x) * 48) + 34))]));
           compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(20)] * kernel_shared[(((((int)threadIdx.x) * 48) + 8))]));
-          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(29)] * kernel_shared[(((((int)threadIdx.x) * 48) + 11))]));
           compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(21)] * kernel_shared[(((((int)threadIdx.x) * 48) + 8))]));
-          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(30)] * kernel_shared[(((((int)threadIdx.x) * 48) + 11))]));
           compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(22)] * kernel_shared[(((((int)threadIdx.x) * 48) + 8))]));
-          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(31)] * kernel_shared[(((((int)threadIdx.x) * 48) + 11))]));
           compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(23)] * kernel_shared[(((((int)threadIdx.x) * 48) + 8))]));
-          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(32)] * kernel_shared[(((((int)threadIdx.x) * 48) + 11))]));
           compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(24)] * kernel_shared[(((((int)threadIdx.x) * 48) + 8))]));
-          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(33)] * kernel_shared[(((((int)threadIdx.x) * 48) + 11))]));
           compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(25)] * kernel_shared[(((((int)threadIdx.x) * 48) + 8))]));
-          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(34)] * kernel_shared[(((((int)threadIdx.x) * 48) + 11))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(26)] * kernel_shared[(((((int)threadIdx.x) * 48) + 8))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(29)] * kernel_shared[(((((int)threadIdx.x) * 48) + 11))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(30)] * kernel_shared[(((((int)threadIdx.x) * 48) + 11))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(31)] * kernel_shared[(((((int)threadIdx.x) * 48) + 11))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(32)] * kernel_shared[(((((int)threadIdx.x) * 48) + 11))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(33)] * kernel_shared[(((((int)threadIdx.x) * 48) + 11))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(34)] * kernel_shared[(((((int)threadIdx.x) * 48) + 11))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(35)] * kernel_shared[(((((int)threadIdx.x) * 48) + 11))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(20)] * kernel_shared[(((((int)threadIdx.x) * 48) + 32))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(29)] * kernel_shared[(((((int)threadIdx.x) * 48) + 35))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(21)] * kernel_shared[(((((int)threadIdx.x) * 48) + 32))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(30)] * kernel_shared[(((((int)threadIdx.x) * 48) + 35))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(22)] * kernel_shared[(((((int)threadIdx.x) * 48) + 32))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(31)] * kernel_shared[(((((int)threadIdx.x) * 48) + 35))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(23)] * kernel_shared[(((((int)threadIdx.x) * 48) + 32))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(32)] * kernel_shared[(((((int)threadIdx.x) * 48) + 35))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(24)] * kernel_shared[(((((int)threadIdx.x) * 48) + 32))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(33)] * kernel_shared[(((((int)threadIdx.x) * 48) + 35))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(25)] * kernel_shared[(((((int)threadIdx.x) * 48) + 32))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(34)] * kernel_shared[(((((int)threadIdx.x) * 48) + 35))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(26)] * kernel_shared[(((((int)threadIdx.x) * 48) + 32))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(35)] * kernel_shared[(((((int)threadIdx.x) * 48) + 35))]));
           compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(36)] * kernel_shared[(((((int)threadIdx.x) * 48) + 12))]));
-          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(45)] * kernel_shared[(((((int)threadIdx.x) * 48) + 15))]));
           compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(37)] * kernel_shared[(((((int)threadIdx.x) * 48) + 12))]));
-          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(46)] * kernel_shared[(((((int)threadIdx.x) * 48) + 15))]));
           compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(38)] * kernel_shared[(((((int)threadIdx.x) * 48) + 12))]));
-          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(47)] * kernel_shared[(((((int)threadIdx.x) * 48) + 15))]));
           compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(39)] * kernel_shared[(((((int)threadIdx.x) * 48) + 12))]));
-          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(48)] * kernel_shared[(((((int)threadIdx.x) * 48) + 15))]));
           compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(40)] * kernel_shared[(((((int)threadIdx.x) * 48) + 12))]));
-          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(49)] * kernel_shared[(((((int)threadIdx.x) * 48) + 15))]));
           compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(41)] * kernel_shared[(((((int)threadIdx.x) * 48) + 12))]));
-          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(50)] * kernel_shared[(((((int)threadIdx.x) * 48) + 15))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(42)] * kernel_shared[(((((int)threadIdx.x) * 48) + 12))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(45)] * kernel_shared[(((((int)threadIdx.x) * 48) + 15))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(46)] * kernel_shared[(((((int)threadIdx.x) * 48) + 15))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(47)] * kernel_shared[(((((int)threadIdx.x) * 48) + 15))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(48)] * kernel_shared[(((((int)threadIdx.x) * 48) + 15))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(49)] * kernel_shared[(((((int)threadIdx.x) * 48) + 15))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(50)] * kernel_shared[(((((int)threadIdx.x) * 48) + 15))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(51)] * kernel_shared[(((((int)threadIdx.x) * 48) + 15))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(36)] * kernel_shared[(((((int)threadIdx.x) * 48) + 36))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(45)] * kernel_shared[(((((int)threadIdx.x) * 48) + 39))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(37)] * kernel_shared[(((((int)threadIdx.x) * 48) + 36))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(46)] * kernel_shared[(((((int)threadIdx.x) * 48) + 39))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(38)] * kernel_shared[(((((int)threadIdx.x) * 48) + 36))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(47)] * kernel_shared[(((((int)threadIdx.x) * 48) + 39))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(39)] * kernel_shared[(((((int)threadIdx.x) * 48) + 36))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(48)] * kernel_shared[(((((int)threadIdx.x) * 48) + 39))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(40)] * kernel_shared[(((((int)threadIdx.x) * 48) + 36))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(49)] * kernel_shared[(((((int)threadIdx.x) * 48) + 39))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(41)] * kernel_shared[(((((int)threadIdx.x) * 48) + 36))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(50)] * kernel_shared[(((((int)threadIdx.x) * 48) + 39))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(42)] * kernel_shared[(((((int)threadIdx.x) * 48) + 36))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(51)] * kernel_shared[(((((int)threadIdx.x) * 48) + 39))]));
           compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(37)] * kernel_shared[(((((int)threadIdx.x) * 48) + 13))]));
-          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(46)] * kernel_shared[(((((int)threadIdx.x) * 48) + 16))]));
           compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(38)] * kernel_shared[(((((int)threadIdx.x) * 48) + 13))]));
-          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(47)] * kernel_shared[(((((int)threadIdx.x) * 48) + 16))]));
           compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(39)] * kernel_shared[(((((int)threadIdx.x) * 48) + 13))]));
-          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(48)] * kernel_shared[(((((int)threadIdx.x) * 48) + 16))]));
           compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(40)] * kernel_shared[(((((int)threadIdx.x) * 48) + 13))]));
-          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(49)] * kernel_shared[(((((int)threadIdx.x) * 48) + 16))]));
           compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(41)] * kernel_shared[(((((int)threadIdx.x) * 48) + 13))]));
-          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(50)] * kernel_shared[(((((int)threadIdx.x) * 48) + 16))]));
           compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(42)] * kernel_shared[(((((int)threadIdx.x) * 48) + 13))]));
-          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(51)] * kernel_shared[(((((int)threadIdx.x) * 48) + 16))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(43)] * kernel_shared[(((((int)threadIdx.x) * 48) + 13))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(46)] * kernel_shared[(((((int)threadIdx.x) * 48) + 16))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(47)] * kernel_shared[(((((int)threadIdx.x) * 48) + 16))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(48)] * kernel_shared[(((((int)threadIdx.x) * 48) + 16))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(49)] * kernel_shared[(((((int)threadIdx.x) * 48) + 16))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(50)] * kernel_shared[(((((int)threadIdx.x) * 48) + 16))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(51)] * kernel_shared[(((((int)threadIdx.x) * 48) + 16))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(52)] * kernel_shared[(((((int)threadIdx.x) * 48) + 16))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(37)] * kernel_shared[(((((int)threadIdx.x) * 48) + 37))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(46)] * kernel_shared[(((((int)threadIdx.x) * 48) + 40))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(38)] * kernel_shared[(((((int)threadIdx.x) * 48) + 37))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(47)] * kernel_shared[(((((int)threadIdx.x) * 48) + 40))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(39)] * kernel_shared[(((((int)threadIdx.x) * 48) + 37))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(48)] * kernel_shared[(((((int)threadIdx.x) * 48) + 40))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(40)] * kernel_shared[(((((int)threadIdx.x) * 48) + 37))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(49)] * kernel_shared[(((((int)threadIdx.x) * 48) + 40))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(41)] * kernel_shared[(((((int)threadIdx.x) * 48) + 37))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(50)] * kernel_shared[(((((int)threadIdx.x) * 48) + 40))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(42)] * kernel_shared[(((((int)threadIdx.x) * 48) + 37))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(51)] * kernel_shared[(((((int)threadIdx.x) * 48) + 40))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(43)] * kernel_shared[(((((int)threadIdx.x) * 48) + 37))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(52)] * kernel_shared[(((((int)threadIdx.x) * 48) + 40))]));
           compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(38)] * kernel_shared[(((((int)threadIdx.x) * 48) + 14))]));
-          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(47)] * kernel_shared[(((((int)threadIdx.x) * 48) + 17))]));
           compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(39)] * kernel_shared[(((((int)threadIdx.x) * 48) + 14))]));
-          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(48)] * kernel_shared[(((((int)threadIdx.x) * 48) + 17))]));
           compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(40)] * kernel_shared[(((((int)threadIdx.x) * 48) + 14))]));
-          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(49)] * kernel_shared[(((((int)threadIdx.x) * 48) + 17))]));
           compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(41)] * kernel_shared[(((((int)threadIdx.x) * 48) + 14))]));
-          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(50)] * kernel_shared[(((((int)threadIdx.x) * 48) + 17))]));
           compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(42)] * kernel_shared[(((((int)threadIdx.x) * 48) + 14))]));
-          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(51)] * kernel_shared[(((((int)threadIdx.x) * 48) + 17))]));
           compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(43)] * kernel_shared[(((((int)threadIdx.x) * 48) + 14))]));
-          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(52)] * kernel_shared[(((((int)threadIdx.x) * 48) + 17))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(44)] * kernel_shared[(((((int)threadIdx.x) * 48) + 14))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(47)] * kernel_shared[(((((int)threadIdx.x) * 48) + 17))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(48)] * kernel_shared[(((((int)threadIdx.x) * 48) + 17))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(49)] * kernel_shared[(((((int)threadIdx.x) * 48) + 17))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(50)] * kernel_shared[(((((int)threadIdx.x) * 48) + 17))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(51)] * kernel_shared[(((((int)threadIdx.x) * 48) + 17))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(52)] * kernel_shared[(((((int)threadIdx.x) * 48) + 17))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(53)] * kernel_shared[(((((int)threadIdx.x) * 48) + 17))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(38)] * kernel_shared[(((((int)threadIdx.x) * 48) + 38))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(47)] * kernel_shared[(((((int)threadIdx.x) * 48) + 41))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(39)] * kernel_shared[(((((int)threadIdx.x) * 48) + 38))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(48)] * kernel_shared[(((((int)threadIdx.x) * 48) + 41))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(40)] * kernel_shared[(((((int)threadIdx.x) * 48) + 38))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(49)] * kernel_shared[(((((int)threadIdx.x) * 48) + 41))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(41)] * kernel_shared[(((((int)threadIdx.x) * 48) + 38))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(50)] * kernel_shared[(((((int)threadIdx.x) * 48) + 41))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(42)] * kernel_shared[(((((int)threadIdx.x) * 48) + 38))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(51)] * kernel_shared[(((((int)threadIdx.x) * 48) + 41))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(43)] * kernel_shared[(((((int)threadIdx.x) * 48) + 38))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(52)] * kernel_shared[(((((int)threadIdx.x) * 48) + 41))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(44)] * kernel_shared[(((((int)threadIdx.x) * 48) + 38))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(53)] * kernel_shared[(((((int)threadIdx.x) * 48) + 41))]));
           compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(54)] * kernel_shared[(((((int)threadIdx.x) * 48) + 18))]));
-          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(63)] * kernel_shared[(((((int)threadIdx.x) * 48) + 21))]));
           compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(55)] * kernel_shared[(((((int)threadIdx.x) * 48) + 18))]));
-          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(64)] * kernel_shared[(((((int)threadIdx.x) * 48) + 21))]));
           compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(56)] * kernel_shared[(((((int)threadIdx.x) * 48) + 18))]));
-          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(65)] * kernel_shared[(((((int)threadIdx.x) * 48) + 21))]));
           compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(57)] * kernel_shared[(((((int)threadIdx.x) * 48) + 18))]));
-          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(66)] * kernel_shared[(((((int)threadIdx.x) * 48) + 21))]));
           compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(58)] * kernel_shared[(((((int)threadIdx.x) * 48) + 18))]));
-          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(67)] * kernel_shared[(((((int)threadIdx.x) * 48) + 21))]));
           compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(59)] * kernel_shared[(((((int)threadIdx.x) * 48) + 18))]));
-          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(68)] * kernel_shared[(((((int)threadIdx.x) * 48) + 21))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(60)] * kernel_shared[(((((int)threadIdx.x) * 48) + 18))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(63)] * kernel_shared[(((((int)threadIdx.x) * 48) + 21))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(64)] * kernel_shared[(((((int)threadIdx.x) * 48) + 21))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(65)] * kernel_shared[(((((int)threadIdx.x) * 48) + 21))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(66)] * kernel_shared[(((((int)threadIdx.x) * 48) + 21))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(67)] * kernel_shared[(((((int)threadIdx.x) * 48) + 21))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(68)] * kernel_shared[(((((int)threadIdx.x) * 48) + 21))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(69)] * kernel_shared[(((((int)threadIdx.x) * 48) + 21))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(54)] * kernel_shared[(((((int)threadIdx.x) * 48) + 42))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(63)] * kernel_shared[(((((int)threadIdx.x) * 48) + 45))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(55)] * kernel_shared[(((((int)threadIdx.x) * 48) + 42))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(64)] * kernel_shared[(((((int)threadIdx.x) * 48) + 45))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(56)] * kernel_shared[(((((int)threadIdx.x) * 48) + 42))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(65)] * kernel_shared[(((((int)threadIdx.x) * 48) + 45))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(57)] * kernel_shared[(((((int)threadIdx.x) * 48) + 42))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(66)] * kernel_shared[(((((int)threadIdx.x) * 48) + 45))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(58)] * kernel_shared[(((((int)threadIdx.x) * 48) + 42))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(67)] * kernel_shared[(((((int)threadIdx.x) * 48) + 45))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(59)] * kernel_shared[(((((int)threadIdx.x) * 48) + 42))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(68)] * kernel_shared[(((((int)threadIdx.x) * 48) + 45))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(60)] * kernel_shared[(((((int)threadIdx.x) * 48) + 42))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(69)] * kernel_shared[(((((int)threadIdx.x) * 48) + 45))]));
           compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(55)] * kernel_shared[(((((int)threadIdx.x) * 48) + 19))]));
-          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(64)] * kernel_shared[(((((int)threadIdx.x) * 48) + 22))]));
           compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(56)] * kernel_shared[(((((int)threadIdx.x) * 48) + 19))]));
-          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(65)] * kernel_shared[(((((int)threadIdx.x) * 48) + 22))]));
           compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(57)] * kernel_shared[(((((int)threadIdx.x) * 48) + 19))]));
-          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(66)] * kernel_shared[(((((int)threadIdx.x) * 48) + 22))]));
           compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(58)] * kernel_shared[(((((int)threadIdx.x) * 48) + 19))]));
-          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(67)] * kernel_shared[(((((int)threadIdx.x) * 48) + 22))]));
           compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(59)] * kernel_shared[(((((int)threadIdx.x) * 48) + 19))]));
-          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(68)] * kernel_shared[(((((int)threadIdx.x) * 48) + 22))]));
           compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(60)] * kernel_shared[(((((int)threadIdx.x) * 48) + 19))]));
-          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(69)] * kernel_shared[(((((int)threadIdx.x) * 48) + 22))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(61)] * kernel_shared[(((((int)threadIdx.x) * 48) + 19))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(64)] * kernel_shared[(((((int)threadIdx.x) * 48) + 22))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(65)] * kernel_shared[(((((int)threadIdx.x) * 48) + 22))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(66)] * kernel_shared[(((((int)threadIdx.x) * 48) + 22))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(67)] * kernel_shared[(((((int)threadIdx.x) * 48) + 22))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(68)] * kernel_shared[(((((int)threadIdx.x) * 48) + 22))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(69)] * kernel_shared[(((((int)threadIdx.x) * 48) + 22))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(70)] * kernel_shared[(((((int)threadIdx.x) * 48) + 22))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(55)] * kernel_shared[(((((int)threadIdx.x) * 48) + 43))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(64)] * kernel_shared[(((((int)threadIdx.x) * 48) + 46))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(56)] * kernel_shared[(((((int)threadIdx.x) * 48) + 43))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(65)] * kernel_shared[(((((int)threadIdx.x) * 48) + 46))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(57)] * kernel_shared[(((((int)threadIdx.x) * 48) + 43))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(66)] * kernel_shared[(((((int)threadIdx.x) * 48) + 46))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(58)] * kernel_shared[(((((int)threadIdx.x) * 48) + 43))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(67)] * kernel_shared[(((((int)threadIdx.x) * 48) + 46))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(59)] * kernel_shared[(((((int)threadIdx.x) * 48) + 43))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(68)] * kernel_shared[(((((int)threadIdx.x) * 48) + 46))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(60)] * kernel_shared[(((((int)threadIdx.x) * 48) + 43))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(69)] * kernel_shared[(((((int)threadIdx.x) * 48) + 46))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(61)] * kernel_shared[(((((int)threadIdx.x) * 48) + 43))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(70)] * kernel_shared[(((((int)threadIdx.x) * 48) + 46))]));
           compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(56)] * kernel_shared[(((((int)threadIdx.x) * 48) + 20))]));
-          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(65)] * kernel_shared[(((((int)threadIdx.x) * 48) + 23))]));
           compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(57)] * kernel_shared[(((((int)threadIdx.x) * 48) + 20))]));
-          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(66)] * kernel_shared[(((((int)threadIdx.x) * 48) + 23))]));
           compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(58)] * kernel_shared[(((((int)threadIdx.x) * 48) + 20))]));
-          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(67)] * kernel_shared[(((((int)threadIdx.x) * 48) + 23))]));
           compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(59)] * kernel_shared[(((((int)threadIdx.x) * 48) + 20))]));
-          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(68)] * kernel_shared[(((((int)threadIdx.x) * 48) + 23))]));
           compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(60)] * kernel_shared[(((((int)threadIdx.x) * 48) + 20))]));
-          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(69)] * kernel_shared[(((((int)threadIdx.x) * 48) + 23))]));
           compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(61)] * kernel_shared[(((((int)threadIdx.x) * 48) + 20))]));
-          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(70)] * kernel_shared[(((((int)threadIdx.x) * 48) + 23))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(62)] * kernel_shared[(((((int)threadIdx.x) * 48) + 20))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(65)] * kernel_shared[(((((int)threadIdx.x) * 48) + 23))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(66)] * kernel_shared[(((((int)threadIdx.x) * 48) + 23))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(67)] * kernel_shared[(((((int)threadIdx.x) * 48) + 23))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(68)] * kernel_shared[(((((int)threadIdx.x) * 48) + 23))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(69)] * kernel_shared[(((((int)threadIdx.x) * 48) + 23))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(70)] * kernel_shared[(((((int)threadIdx.x) * 48) + 23))]));
           compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(71)] * kernel_shared[(((((int)threadIdx.x) * 48) + 23))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(56)] * kernel_shared[(((((int)threadIdx.x) * 48) + 44))]));
-          compute1[(7)] = (compute1[(7)] + (pad_temp_shared[(65)] * kernel_shared[(((((int)threadIdx.x) * 48) + 47))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(57)] * kernel_shared[(((((int)threadIdx.x) * 48) + 44))]));
-          compute1[(8)] = (compute1[(8)] + (pad_temp_shared[(66)] * kernel_shared[(((((int)threadIdx.x) * 48) + 47))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(58)] * kernel_shared[(((((int)threadIdx.x) * 48) + 44))]));
-          compute1[(9)] = (compute1[(9)] + (pad_temp_shared[(67)] * kernel_shared[(((((int)threadIdx.x) * 48) + 47))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(59)] * kernel_shared[(((((int)threadIdx.x) * 48) + 44))]));
-          compute1[(10)] = (compute1[(10)] + (pad_temp_shared[(68)] * kernel_shared[(((((int)threadIdx.x) * 48) + 47))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(60)] * kernel_shared[(((((int)threadIdx.x) * 48) + 44))]));
-          compute1[(11)] = (compute1[(11)] + (pad_temp_shared[(69)] * kernel_shared[(((((int)threadIdx.x) * 48) + 47))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(61)] * kernel_shared[(((((int)threadIdx.x) * 48) + 44))]));
-          compute1[(12)] = (compute1[(12)] + (pad_temp_shared[(70)] * kernel_shared[(((((int)threadIdx.x) * 48) + 47))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(62)] * kernel_shared[(((((int)threadIdx.x) * 48) + 44))]));
-          compute1[(13)] = (compute1[(13)] + (pad_temp_shared[(71)] * kernel_shared[(((((int)threadIdx.x) * 48) + 47))]));
-        }
-      }
-      for (int i1_inner = 0; i1_inner < 2; ++i1_inner) {
-        for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
-          compute[(((((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 98)) + (i1_inner * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner))] = max((compute1[(((i1_inner * 7) + i3_inner))] + bias[(((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) * 2)) + i1_inner))]), 0.000000e+00f);
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(72)] * kernel_shared[(((((int)threadIdx.x) * 48) + 24))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(73)] * kernel_shared[(((((int)threadIdx.x) * 48) + 24))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(74)] * kernel_shared[(((((int)threadIdx.x) * 48) + 24))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(75)] * kernel_shared[(((((int)threadIdx.x) * 48) + 24))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(76)] * kernel_shared[(((((int)threadIdx.x) * 48) + 24))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(77)] * kernel_shared[(((((int)threadIdx.x) * 48) + 24))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(78)] * kernel_shared[(((((int)threadIdx.x) * 48) + 24))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(81)] * kernel_shared[(((((int)threadIdx.x) * 48) + 27))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(82)] * kernel_shared[(((((int)threadIdx.x) * 48) + 27))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(83)] * kernel_shared[(((((int)threadIdx.x) * 48) + 27))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(84)] * kernel_shared[(((((int)threadIdx.x) * 48) + 27))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(85)] * kernel_shared[(((((int)threadIdx.x) * 48) + 27))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(86)] * kernel_shared[(((((int)threadIdx.x) * 48) + 27))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(87)] * kernel_shared[(((((int)threadIdx.x) * 48) + 27))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(73)] * kernel_shared[(((((int)threadIdx.x) * 48) + 25))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(74)] * kernel_shared[(((((int)threadIdx.x) * 48) + 25))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(75)] * kernel_shared[(((((int)threadIdx.x) * 48) + 25))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(76)] * kernel_shared[(((((int)threadIdx.x) * 48) + 25))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(77)] * kernel_shared[(((((int)threadIdx.x) * 48) + 25))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(78)] * kernel_shared[(((((int)threadIdx.x) * 48) + 25))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(79)] * kernel_shared[(((((int)threadIdx.x) * 48) + 25))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(82)] * kernel_shared[(((((int)threadIdx.x) * 48) + 28))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(83)] * kernel_shared[(((((int)threadIdx.x) * 48) + 28))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(84)] * kernel_shared[(((((int)threadIdx.x) * 48) + 28))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(85)] * kernel_shared[(((((int)threadIdx.x) * 48) + 28))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(86)] * kernel_shared[(((((int)threadIdx.x) * 48) + 28))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(87)] * kernel_shared[(((((int)threadIdx.x) * 48) + 28))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(88)] * kernel_shared[(((((int)threadIdx.x) * 48) + 28))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(74)] * kernel_shared[(((((int)threadIdx.x) * 48) + 26))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(75)] * kernel_shared[(((((int)threadIdx.x) * 48) + 26))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(76)] * kernel_shared[(((((int)threadIdx.x) * 48) + 26))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(77)] * kernel_shared[(((((int)threadIdx.x) * 48) + 26))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(78)] * kernel_shared[(((((int)threadIdx.x) * 48) + 26))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(79)] * kernel_shared[(((((int)threadIdx.x) * 48) + 26))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(80)] * kernel_shared[(((((int)threadIdx.x) * 48) + 26))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(83)] * kernel_shared[(((((int)threadIdx.x) * 48) + 29))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(84)] * kernel_shared[(((((int)threadIdx.x) * 48) + 29))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(85)] * kernel_shared[(((((int)threadIdx.x) * 48) + 29))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(86)] * kernel_shared[(((((int)threadIdx.x) * 48) + 29))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(87)] * kernel_shared[(((((int)threadIdx.x) * 48) + 29))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(88)] * kernel_shared[(((((int)threadIdx.x) * 48) + 29))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(89)] * kernel_shared[(((((int)threadIdx.x) * 48) + 29))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(90)] * kernel_shared[(((((int)threadIdx.x) * 48) + 30))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(91)] * kernel_shared[(((((int)threadIdx.x) * 48) + 30))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(92)] * kernel_shared[(((((int)threadIdx.x) * 48) + 30))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(93)] * kernel_shared[(((((int)threadIdx.x) * 48) + 30))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(94)] * kernel_shared[(((((int)threadIdx.x) * 48) + 30))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(95)] * kernel_shared[(((((int)threadIdx.x) * 48) + 30))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(96)] * kernel_shared[(((((int)threadIdx.x) * 48) + 30))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(99)] * kernel_shared[(((((int)threadIdx.x) * 48) + 33))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(100)] * kernel_shared[(((((int)threadIdx.x) * 48) + 33))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(101)] * kernel_shared[(((((int)threadIdx.x) * 48) + 33))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(102)] * kernel_shared[(((((int)threadIdx.x) * 48) + 33))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(103)] * kernel_shared[(((((int)threadIdx.x) * 48) + 33))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(104)] * kernel_shared[(((((int)threadIdx.x) * 48) + 33))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(105)] * kernel_shared[(((((int)threadIdx.x) * 48) + 33))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(91)] * kernel_shared[(((((int)threadIdx.x) * 48) + 31))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(92)] * kernel_shared[(((((int)threadIdx.x) * 48) + 31))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(93)] * kernel_shared[(((((int)threadIdx.x) * 48) + 31))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(94)] * kernel_shared[(((((int)threadIdx.x) * 48) + 31))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(95)] * kernel_shared[(((((int)threadIdx.x) * 48) + 31))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(96)] * kernel_shared[(((((int)threadIdx.x) * 48) + 31))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(97)] * kernel_shared[(((((int)threadIdx.x) * 48) + 31))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(100)] * kernel_shared[(((((int)threadIdx.x) * 48) + 34))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(101)] * kernel_shared[(((((int)threadIdx.x) * 48) + 34))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(102)] * kernel_shared[(((((int)threadIdx.x) * 48) + 34))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(103)] * kernel_shared[(((((int)threadIdx.x) * 48) + 34))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(104)] * kernel_shared[(((((int)threadIdx.x) * 48) + 34))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(105)] * kernel_shared[(((((int)threadIdx.x) * 48) + 34))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(106)] * kernel_shared[(((((int)threadIdx.x) * 48) + 34))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(92)] * kernel_shared[(((((int)threadIdx.x) * 48) + 32))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(93)] * kernel_shared[(((((int)threadIdx.x) * 48) + 32))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(94)] * kernel_shared[(((((int)threadIdx.x) * 48) + 32))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(95)] * kernel_shared[(((((int)threadIdx.x) * 48) + 32))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(96)] * kernel_shared[(((((int)threadIdx.x) * 48) + 32))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(97)] * kernel_shared[(((((int)threadIdx.x) * 48) + 32))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(98)] * kernel_shared[(((((int)threadIdx.x) * 48) + 32))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(101)] * kernel_shared[(((((int)threadIdx.x) * 48) + 35))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(102)] * kernel_shared[(((((int)threadIdx.x) * 48) + 35))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(103)] * kernel_shared[(((((int)threadIdx.x) * 48) + 35))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(104)] * kernel_shared[(((((int)threadIdx.x) * 48) + 35))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(105)] * kernel_shared[(((((int)threadIdx.x) * 48) + 35))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(106)] * kernel_shared[(((((int)threadIdx.x) * 48) + 35))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(107)] * kernel_shared[(((((int)threadIdx.x) * 48) + 35))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(108)] * kernel_shared[(((((int)threadIdx.x) * 48) + 36))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(109)] * kernel_shared[(((((int)threadIdx.x) * 48) + 36))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(110)] * kernel_shared[(((((int)threadIdx.x) * 48) + 36))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(111)] * kernel_shared[(((((int)threadIdx.x) * 48) + 36))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(112)] * kernel_shared[(((((int)threadIdx.x) * 48) + 36))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(113)] * kernel_shared[(((((int)threadIdx.x) * 48) + 36))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(114)] * kernel_shared[(((((int)threadIdx.x) * 48) + 36))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(117)] * kernel_shared[(((((int)threadIdx.x) * 48) + 39))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(118)] * kernel_shared[(((((int)threadIdx.x) * 48) + 39))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(119)] * kernel_shared[(((((int)threadIdx.x) * 48) + 39))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(120)] * kernel_shared[(((((int)threadIdx.x) * 48) + 39))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(121)] * kernel_shared[(((((int)threadIdx.x) * 48) + 39))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(122)] * kernel_shared[(((((int)threadIdx.x) * 48) + 39))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(123)] * kernel_shared[(((((int)threadIdx.x) * 48) + 39))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(109)] * kernel_shared[(((((int)threadIdx.x) * 48) + 37))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(110)] * kernel_shared[(((((int)threadIdx.x) * 48) + 37))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(111)] * kernel_shared[(((((int)threadIdx.x) * 48) + 37))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(112)] * kernel_shared[(((((int)threadIdx.x) * 48) + 37))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(113)] * kernel_shared[(((((int)threadIdx.x) * 48) + 37))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(114)] * kernel_shared[(((((int)threadIdx.x) * 48) + 37))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(115)] * kernel_shared[(((((int)threadIdx.x) * 48) + 37))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(118)] * kernel_shared[(((((int)threadIdx.x) * 48) + 40))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(119)] * kernel_shared[(((((int)threadIdx.x) * 48) + 40))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(120)] * kernel_shared[(((((int)threadIdx.x) * 48) + 40))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(121)] * kernel_shared[(((((int)threadIdx.x) * 48) + 40))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(122)] * kernel_shared[(((((int)threadIdx.x) * 48) + 40))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(123)] * kernel_shared[(((((int)threadIdx.x) * 48) + 40))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(124)] * kernel_shared[(((((int)threadIdx.x) * 48) + 40))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(110)] * kernel_shared[(((((int)threadIdx.x) * 48) + 38))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(111)] * kernel_shared[(((((int)threadIdx.x) * 48) + 38))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(112)] * kernel_shared[(((((int)threadIdx.x) * 48) + 38))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(113)] * kernel_shared[(((((int)threadIdx.x) * 48) + 38))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(114)] * kernel_shared[(((((int)threadIdx.x) * 48) + 38))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(115)] * kernel_shared[(((((int)threadIdx.x) * 48) + 38))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(116)] * kernel_shared[(((((int)threadIdx.x) * 48) + 38))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(119)] * kernel_shared[(((((int)threadIdx.x) * 48) + 41))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(120)] * kernel_shared[(((((int)threadIdx.x) * 48) + 41))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(121)] * kernel_shared[(((((int)threadIdx.x) * 48) + 41))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(122)] * kernel_shared[(((((int)threadIdx.x) * 48) + 41))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(123)] * kernel_shared[(((((int)threadIdx.x) * 48) + 41))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(124)] * kernel_shared[(((((int)threadIdx.x) * 48) + 41))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(125)] * kernel_shared[(((((int)threadIdx.x) * 48) + 41))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(126)] * kernel_shared[(((((int)threadIdx.x) * 48) + 42))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(127)] * kernel_shared[(((((int)threadIdx.x) * 48) + 42))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(128)] * kernel_shared[(((((int)threadIdx.x) * 48) + 42))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(129)] * kernel_shared[(((((int)threadIdx.x) * 48) + 42))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(130)] * kernel_shared[(((((int)threadIdx.x) * 48) + 42))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(131)] * kernel_shared[(((((int)threadIdx.x) * 48) + 42))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(132)] * kernel_shared[(((((int)threadIdx.x) * 48) + 42))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(135)] * kernel_shared[(((((int)threadIdx.x) * 48) + 45))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(136)] * kernel_shared[(((((int)threadIdx.x) * 48) + 45))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(137)] * kernel_shared[(((((int)threadIdx.x) * 48) + 45))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(138)] * kernel_shared[(((((int)threadIdx.x) * 48) + 45))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(139)] * kernel_shared[(((((int)threadIdx.x) * 48) + 45))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(140)] * kernel_shared[(((((int)threadIdx.x) * 48) + 45))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(141)] * kernel_shared[(((((int)threadIdx.x) * 48) + 45))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(127)] * kernel_shared[(((((int)threadIdx.x) * 48) + 43))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(128)] * kernel_shared[(((((int)threadIdx.x) * 48) + 43))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(129)] * kernel_shared[(((((int)threadIdx.x) * 48) + 43))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(130)] * kernel_shared[(((((int)threadIdx.x) * 48) + 43))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(131)] * kernel_shared[(((((int)threadIdx.x) * 48) + 43))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(132)] * kernel_shared[(((((int)threadIdx.x) * 48) + 43))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(133)] * kernel_shared[(((((int)threadIdx.x) * 48) + 43))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(136)] * kernel_shared[(((((int)threadIdx.x) * 48) + 46))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(137)] * kernel_shared[(((((int)threadIdx.x) * 48) + 46))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(138)] * kernel_shared[(((((int)threadIdx.x) * 48) + 46))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(139)] * kernel_shared[(((((int)threadIdx.x) * 48) + 46))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(140)] * kernel_shared[(((((int)threadIdx.x) * 48) + 46))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(141)] * kernel_shared[(((((int)threadIdx.x) * 48) + 46))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(142)] * kernel_shared[(((((int)threadIdx.x) * 48) + 46))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(128)] * kernel_shared[(((((int)threadIdx.x) * 48) + 44))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(129)] * kernel_shared[(((((int)threadIdx.x) * 48) + 44))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(130)] * kernel_shared[(((((int)threadIdx.x) * 48) + 44))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(131)] * kernel_shared[(((((int)threadIdx.x) * 48) + 44))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(132)] * kernel_shared[(((((int)threadIdx.x) * 48) + 44))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(133)] * kernel_shared[(((((int)threadIdx.x) * 48) + 44))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(134)] * kernel_shared[(((((int)threadIdx.x) * 48) + 44))]));
+          compute1[(0)] = (compute1[(0)] + (pad_temp_shared[(137)] * kernel_shared[(((((int)threadIdx.x) * 48) + 47))]));
+          compute1[(1)] = (compute1[(1)] + (pad_temp_shared[(138)] * kernel_shared[(((((int)threadIdx.x) * 48) + 47))]));
+          compute1[(2)] = (compute1[(2)] + (pad_temp_shared[(139)] * kernel_shared[(((((int)threadIdx.x) * 48) + 47))]));
+          compute1[(3)] = (compute1[(3)] + (pad_temp_shared[(140)] * kernel_shared[(((((int)threadIdx.x) * 48) + 47))]));
+          compute1[(4)] = (compute1[(4)] + (pad_temp_shared[(141)] * kernel_shared[(((((int)threadIdx.x) * 48) + 47))]));
+          compute1[(5)] = (compute1[(5)] + (pad_temp_shared[(142)] * kernel_shared[(((((int)threadIdx.x) * 48) + 47))]));
+          compute1[(6)] = (compute1[(6)] + (pad_temp_shared[(143)] * kernel_shared[(((((int)threadIdx.x) * 48) + 47))]));
         }
       }
+      compute[(((((((int)blockIdx.x) / 7) * 3136) + (((int)threadIdx.x) * 49)) + (((int)blockIdx.x) % 7)))] = max((compute1[(0)] + bias[((((((int)blockIdx.x) / 7) * 64) + ((int)threadIdx.x)))]), 0.000000e+00f);
+      compute[((((((((int)blockIdx.x) / 7) * 3136) + (((int)threadIdx.x) * 49)) + (((int)blockIdx.x) % 7)) + 7))] = max((compute1[(1)] + bias[((((((int)blockIdx.x) / 7) * 64) + ((int)threadIdx.x)))]), 0.000000e+00f);
+      compute[((((((((int)blockIdx.x) / 7) * 3136) + (((int)threadIdx.x) * 49)) + (((int)blockIdx.x) % 7)) + 14))] = max((compute1[(2)] + bias[((((((int)blockIdx.x) / 7) * 64) + ((int)threadIdx.x)))]), 0.000000e+00f);
+      compute[((((((((int)blockIdx.x) / 7) * 3136) + (((int)threadIdx.x) * 49)) + (((int)blockIdx.x) % 7)) + 21))] = max((compute1[(3)] + bias[((((((int)blockIdx.x) / 7) * 64) + ((int)threadIdx.x)))]), 0.000000e+00f);
+      compute[((((((((int)blockIdx.x) / 7) * 3136) + (((int)threadIdx.x) * 49)) + (((int)blockIdx.x) % 7)) + 28))] = max((compute1[(4)] + bias[((((((int)blockIdx.x) / 7) * 64) + ((int)threadIdx.x)))]), 0.000000e+00f);
+      compute[((((((((int)blockIdx.x) / 7) * 3136) + (((int)threadIdx.x) * 49)) + (((int)blockIdx.x) % 7)) + 35))] = max((compute1[(5)] + bias[((((((int)blockIdx.x) / 7) * 64) + ((int)threadIdx.x)))]), 0.000000e+00f);
+      compute[((((((((int)blockIdx.x) / 7) * 3136) + (((int)threadIdx.x) * 49)) + (((int)blockIdx.x) % 7)) + 42))] = max((compute1[(6)] + bias[((((((int)blockIdx.x) / 7) * 64) + ((int)threadIdx.x)))]), 0.000000e+00f);
     }
 
 
@@ -1346,7 +1324,7 @@ In the example below we resume the status and do 5 more trials.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  29.784 seconds)
+   **Total running time of the script:** ( 2 minutes  20.264 seconds)
 
 
 .. _sphx_glr_download_tutorials_auto_scheduler_tune_conv2d_layer_cuda.py:
diff --git a/docs/_sources/tutorials/auto_scheduler/tune_network_arm.rst.txt b/docs/_sources/tutorials/auto_scheduler/tune_network_arm.rst.txt
index c1f6837..5e5ee1d 100644
--- a/docs/_sources/tutorials/auto_scheduler/tune_network_arm.rst.txt
+++ b/docs/_sources/tutorials/auto_scheduler/tune_network_arm.rst.txt
@@ -15,7 +15,7 @@ Auto-tuning for specific devices and workloads is critical for getting the
 best performance. This is a tutorial on how to tune a whole neural
 network for ARM CPU with the auto-scheduler via RPC.
 
-To auto-tune a neural network, we partition the network into small subgraphs and 
+To auto-tune a neural network, we partition the network into small subgraphs and
 tune them independently. Each subgraph is treated as one search task.
 A task scheduler slices the time and dynamically allocates time resources to
 these tasks. The task scheduler predicts the impact of each task on the end-to-end
@@ -243,7 +243,7 @@ set :code:`use_ndk` to True if you use android phone.
 
     # Also replace this with the device key, rpc host and rpc port in your tracker
     device_key = "rasp4b-64"
-    rpc_host = "0.0.0.0"
+    rpc_host = "127.0.0.1"
     rpc_port = 9191
 
     # Set this to True if you use ndk tools for cross compiling
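For context, a minimal sketch of how the device key, RPC host and RPC port shown above are typically wired into an auto-scheduler measurement runner; the log file name and the trial/timeout numbers below are illustrative and not taken from the tutorial:

.. code-block:: python

    from tvm import auto_scheduler

    device_key = "rasp4b-64"
    rpc_host = "127.0.0.1"
    rpc_port = 9191
    log_file = "network_tuning.json"  # illustrative file name

    # Measurements are dispatched through the RPC tracker to the registered board.
    runner = auto_scheduler.RPCRunner(
        key=device_key,
        host=rpc_host,
        port=rpc_port,
        timeout=30,
        repeat=1,
        min_repeat_ms=200,
    )

    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=200,  # illustrative; real runs use many more trials
        runner=runner,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )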
diff --git a/docs/_sources/tutorials/auto_scheduler/tune_network_cuda.rst.txt b/docs/_sources/tutorials/auto_scheduler/tune_network_cuda.rst.txt
index f77020b..3ce11b8 100644
--- a/docs/_sources/tutorials/auto_scheduler/tune_network_cuda.rst.txt
+++ b/docs/_sources/tutorials/auto_scheduler/tune_network_cuda.rst.txt
@@ -414,10 +414,10 @@ The task scheduler will just optimize this objective.
     placeholder = PLACEHOLDER [1, 56, 56, 64]
     T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])
 
-    ========== Task 18  (workload key: ["a5612fdeb9db4d579a75ec225ea4c06a", 1, 112, 112, 64, 1, 1, 1, 64, 1, 56, 56, 64]) ==========
+    ========== Task 18  (workload key: ["64b98c71af70a904fdbb81d7d4188d84", 1, 112, 112, 64, 1, 1, 1, 64, 1, 56, 56, 64]) ==========
     placeholder = PLACEHOLDER [1, 112, 112, 64]
     pad_temp(ax0, ax1, ax2, ax3) = tir.if_then_else(((((ax1 >= 1) && (ax1 < 113)) && (ax2 >= 1)) && (ax2 < 113)), placeholder[ax0, (ax1 - 1), (ax2 - 1), ax3], -3.40282e+38f)
-    tensor(ax0, ax1, ax2, ax3) max= pad_temp[ax0, ((ax1*2) + dh), ((ax2*2) + dw), ax3]
+    tensor(ax0, ax1, ax2, ax3) max= pad_temp[ax0, ((ax1*2) + rv0), ((ax2*2) + rv1), ax3]
     placeholder = PLACEHOLDER [1, 1, 1, 64]
     T_add(ax0, ax1, ax2, ax3) = (tensor[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
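As a reading aid, the pooling stage printed in this task can be sketched with TVM's tensor-expression API roughly as follows; the 3x3 window and stride 2 are inferred from the indices above, and the padding stage is folded into a pre-padded placeholder to keep the sketch short:

.. code-block:: python

    from tvm import te

    # Pre-padded input standing in for the pad_temp stage shown above.
    pad_temp = te.placeholder((1, 114, 114, 64), name="pad_temp")
    rv0 = te.reduce_axis((0, 3), name="rv0")
    rv1 = te.reduce_axis((0, 3), name="rv1")

    # tensor(ax0, ax1, ax2, ax3) max= pad_temp[ax0, ax1*2 + rv0, ax2*2 + rv1, ax3]
    tensor = te.compute(
        (1, 56, 56, 64),
        lambda ax0, ax1, ax2, ax3: te.max(
            pad_temp[ax0, ax1 * 2 + rv0, ax2 * 2 + rv1, ax3], axis=[rv0, rv1]
        ),
        name="tensor",
    )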
@@ -614,7 +614,7 @@ so we can read the log file and load the best schedules.
 
     Compile...
     Evaluate inference time cost...
-    Mean inference time (std dev): 3.29 ms (0.02 ms)
+    Mean inference time (std dev): 3.15 ms (0.00 ms)
 
 
 
diff --git a/docs/_sources/tutorials/auto_scheduler/tune_network_mali.rst.txt b/docs/_sources/tutorials/auto_scheduler/tune_network_mali.rst.txt
index bdfa3db..cedf1c8 100644
--- a/docs/_sources/tutorials/auto_scheduler/tune_network_mali.rst.txt
+++ b/docs/_sources/tutorials/auto_scheduler/tune_network_mali.rst.txt
@@ -15,7 +15,7 @@ Auto-tuning for specific devices and workloads is critical for getting the
 best performance. This is a tutorial on how to tune a whole neural
 network for mali GPU with the auto-scheduler.
 
-To auto-tune a neural network, we partition the network into small subgraphs and 
+To auto-tune a neural network, we partition the network into small subgraphs and
 tune them independently. Each subgraph is treated as one search task.
 A task scheduler slices the time and dynamically allocates time resources to
 these tasks. The task scheduler predicts the impact of each task on the end-to-end
@@ -207,7 +207,7 @@ The task scheduler will just optimize this objective.
  .. code-block:: none
 
     Extract tasks...
-   ...2%, 0.01 MB, 40 KB/s, 0 seconds passed  [... model download progress log, condensed ...]  ...100%, 0.30 MB, 1398 KB/s, 0 seconds passed
+   ...2%, 0.01 MB, 43 KB/s, 0 seconds passed  [... model download progress log, condensed ...]  ...100%, 0.30 MB, 1494 KB/s, 0 seconds passed
     ========== Task 0  (workload key: ["d7b65649a4dd54becea0a52aabbc5af5", 1, 1000, 1, 1000]) ==========
     placeholder = PLACEHOLDER [1, 1000]
     T_softmax_maxelem(i0) max= placeholder[i0, k]
@@ -408,11 +408,11 @@ The task scheduler will just optimize this objective.
   .. code-block:: python
 
     from tvm.auto_scheduler.utils import request_remote
-    remote = request_remote(device_key, "0.0.0.0", 9190)
+    remote = request_remote(device_key, "127.0.0.1", 9190)
     dev = remote.cl()
     max_shared_memory_per_block = dev.max_shared_memory_per_block
     # There is no explicit local memory limitation
-    # so we can use INT32_MAX to disalbe the check on local_memory.
+    # so we can use INT32_MAX to disable the check on local_memory.
     max_local_memory_per_block = 2147483647 # INT32_MAX
     max_threads_per_block = dev.max_threads_per_block
     max_vthread_extent = int(dev.warp_size / 4) if int(dev.warp_size / 4) > 1 else dev.warp_size
@@ -459,7 +459,7 @@ Now, we set some options for tuning, launch the search tasks and evaluate the en
             num_measure_trials=200,  # change this to 20000 to achieve the best performance
             builder=auto_scheduler.LocalBuilder(build_func="ndk" if use_ndk else "default"),
             runner=auto_scheduler.RPCRunner(
-                device_key, host="0.0.0.0", port=9190, repeat=3, timeout=50
+                device_key, host="127.0.0.1", port=9190, repeat=3, timeout=50
             ),
             measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
         )
@@ -478,7 +478,7 @@ Now, we set some options for tuning, launch the search tasks and evaluate the en
         print("=============== Request Remote ===============")
         from tvm.auto_scheduler.utils import request_remote
 
-        remote = request_remote(device_key, "0.0.0.0", 9190)
+        remote = request_remote(device_key, "127.0.0.1", 9190)
         dev = remote.cl()
         from tvm.contrib import utils, ndk
 
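Following the request_remote change above, a sketch of how the compiled library is usually pushed to the phone and loaded back. It assumes ``lib`` is the ``relay.build`` output, ``remote`` is the RPC session obtained above, and ``TVM_NDK_CC`` points at the Android NDK compiler:

.. code-block:: python

    # Cross-compile with the NDK, upload to the device, and get the Mali OpenCL device.
    from tvm.contrib import utils, ndk

    tmp = utils.tempdir()
    lib_fname = tmp.relpath("net.so")
    lib.export_library(lib_fname, ndk.create_shared)  # requires TVM_NDK_CC in the environment
    remote.upload(lib_fname)
    rlib = remote.load_module("net.so")
    dev = remote.cl()  # OpenCL device on the phone
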
diff --git a/docs/_sources/tutorials/auto_scheduler/tune_network_x86.rst.txt b/docs/_sources/tutorials/auto_scheduler/tune_network_x86.rst.txt
index 4165fdc..6a4e308 100644
--- a/docs/_sources/tutorials/auto_scheduler/tune_network_x86.rst.txt
+++ b/docs/_sources/tutorials/auto_scheduler/tune_network_x86.rst.txt
@@ -418,10 +418,10 @@ The task scheduler will just optimize this objective.
     T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
 
-    ========== Task 23  (workload key: ["a5612fdeb9db4d579a75ec225ea4c06a", 1, 112, 112, 64, 1, 1, 1, 64, 1, 56, 56, 64]) ==========
+    ========== Task 23  (workload key: ["64b98c71af70a904fdbb81d7d4188d84", 1, 112, 112, 64, 1, 1, 1, 64, 1, 56, 56, 64]) ==========
     placeholder = PLACEHOLDER [1, 112, 112, 64]
     pad_temp(ax0, ax1, ax2, ax3) = tir.if_then_else(((((ax1 >= 1) && (ax1 < 113)) && (ax2 >= 1)) && (ax2 < 113)), placeholder[ax0, (ax1 - 1), (ax2 - 1), ax3], -3.40282e+38f)
-    tensor(ax0, ax1, ax2, ax3) max= pad_temp[ax0, ((ax1*2) + dh), ((ax2*2) + dw), ax3]
+    tensor(ax0, ax1, ax2, ax3) max= pad_temp[ax0, ((ax1*2) + rv0), ((ax2*2) + rv1), ax3]
     placeholder = PLACEHOLDER [1, 1, 1, 64]
     T_add(ax0, ax1, ax2, ax3) = (tensor[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
     T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
@@ -615,7 +615,7 @@ so we can read the log file and load the best schedules.
 
     Compile...
     Evaluate inference time cost...
-    Mean inference time (std dev): 569.29 ms (0.09 ms)
+    Mean inference time (std dev): 544.56 ms (0.27 ms)
 
 
 
diff --git a/docs/_sources/tutorials/auto_scheduler/tune_sparse_x86.rst.txt b/docs/_sources/tutorials/auto_scheduler/tune_sparse_x86.rst.txt
index 374a401..88fe6d7 100644
--- a/docs/_sources/tutorials/auto_scheduler/tune_sparse_x86.rst.txt
+++ b/docs/_sources/tutorials/auto_scheduler/tune_sparse_x86.rst.txt
@@ -318,6 +318,7 @@ file and apply it.
 
 
 
+
 We can lower the schedule to see the IR after auto-scheduling.
 The auto-scheduler correctly performs optimizations including multi-level tiling,
 layout transformation, parallelization, vectorization, unrolling, and operator fusion.
@@ -342,31 +343,35 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
     Lowered TIR:
     primfn(placeholder_5: handle, placeholder_6: handle, placeholder_7: handle, placeholder_8: handle, placeholder_9: handle, compute_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {placeholder_4: Buffer(placeholder_10: Pointer(int32), int32, [33], []),
-                 placeholder_3: Buffer(placeholder_11: Pointer(float32), float32, [4916, 16, 1], []),
+      buffers = {placeholder_4: Buffer(placeholder_10: Pointer(float32), float32, [128, 512], []),
+                 placeholder: Buffer(placeholder_11: Pointer(float32), float32, [4916, 16, 1], []),
+                 placeholder_1: Buffer(placeholder_12: Pointer(float32), float32, [128, 256], []),
                  compute: Buffer(compute_2: Pointer(float32), float32, [128, 512], []),
-                 placeholder: Buffer(placeholder_12: Pointer(int32), int32, [4916], []),
-                 placeholder_1: Buffer(placeholder_13: Pointer(float32), float32, [128, 256], []),
-                 placeholder_2: Buffer(placeholder_14: Pointer(float32), float32, [128, 512], [])}
-      buffer_map = {compute_1: compute, placeholder_7: placeholder, placeholder_5: placeholder_1, placeholder_9: placeholder_2, placeholder_6: placeholder_3, placeholder_8: placeholder_4} {
-      for (i0.outer.i1.outer.fused: int32, 0, 512) "parallel" {
+                 placeholder_2: Buffer(placeholder_13: Pointer(int32), int32, [4916], []),
+                 placeholder_3: Buffer(placeholder_14: Pointer(int32), int32, [33], [])}
+      buffer_map = {placeholder_6: placeholder, placeholder_5: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute} {
+      for (i0.outer.i1.outer.fused: int32, 0, 32) "parallel" {
         attr [compute_3: Pointer(float32)] "storage_scope" = "global";
-        allocate(compute_3, float32, [128]) {
+        allocate(compute_3, float32, [2048]) {
           for (i.outer.inner: int32, 0, 4) {
             for (nb_j.inner: int32, 0, 2) {
-              for (j.init: int32, 0, 16) {
-                compute_3[(((i.outer.inner*32) + (nb_j.inner*16)) + j.init)] = 0f32
+              for (i.inner.init: int32, 0, 16) {
+                for (j.init: int32, 0, 16) {
+                  compute_3[((((i.outer.inner*512) + (i.inner.init*32)) + (nb_j.inner*16)) + j.init)] = 0f32
+                }
               }
-              for (elem_idx: int32, 0, ((int32*)placeholder_10[(((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner) + 1)] - (int32*)placeholder_10[((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)])) {
-                for (j: int32, 0, 16) {
-                  compute_3[(((i.outer.inner*32) + (nb_j.inner*16)) + j)] = ((float32*)compute_3[(((i.outer.inner*32) + (nb_j.inner*16)) + j)] + ((float32*)placeholder_11[((((int32*)placeholder_10[((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)]*16) + (elem_idx*16)) + j)]*max((float32*)placeholder_13[(((floordiv(i0.outer.i1.outer.fused, 16)*1024) + (i.outer.inner*256)) + (int32*)placeholder_12[((int32*)placeholder_10[((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)] + el [...]
+              for (elem_idx: int32, 0, ((int32*)placeholder_14[(((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner) + 1)] - (int32*)placeholder_14[((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)])) {
+                for (i.inner: int32, 0, 16) {
+                  for (j: int32, 0, 16) {
+                    compute_3[((((i.outer.inner*512) + (i.inner*32)) + (nb_j.inner*16)) + j)] = ((float32*)compute_3[((((i.outer.inner*512) + (i.inner*32)) + (nb_j.inner*16)) + j)] + ((float32*)placeholder_11[((((int32*)placeholder_14[((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)]*16) + (elem_idx*16)) + j)]*max((float32*)placeholder_12[((((floordiv(i0.outer.i1.outer.fused, 16)*16384) + (i.outer.inner*4096)) + (i.inner*256)) + (int32*)placeholder_13[((int32*)placeholder_14[((fl [...]
+                  }
                 }
               }
             }
           }
-          for (i0.inner: int32, 0, 4) {
+          for (i0.inner: int32, 0, 64) {
             for (i1.inner: int32, 0, 32) {
-              compute_2[((((floordiv(i0.outer.i1.outer.fused, 16)*2048) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32)) + i1.inner)] = max(((float32*)compute_3[((i0.inner*32) + i1.inner)] + (float32*)placeholder_14[((((floordiv(i0.outer.i1.outer.fused, 16)*2048) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32)) + i1.inner)]), 0f32)
+              compute_2[((((floordiv(i0.outer.i1.outer.fused, 16)*32768) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32)) + i1.inner)] = max(((float32*)compute_3[((i0.inner*32) + i1.inner)] + (float32*)placeholder_10[((((floordiv(i0.outer.i1.outer.fused, 16)*32768) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32)) + i1.inner)]), 0f32)
             }
           }
         }
@@ -421,7 +426,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 1.743 ms
+    Execution time of this operator: 1.275 ms
 
 
 
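The lowered TIR shown above can be reproduced from the tuning log. A sketch, assuming ``task`` is the ``auto_scheduler.SearchTask`` and ``log_file`` is the record file written during the run:

.. code-block:: python

    # Load the best schedule found for the task and lower it to inspect the TIR.
    import tvm

    sch, args = task.apply_best(log_file)
    print(tvm.lower(sch, args, simple_mode=True))
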
diff --git a/docs/_sources/tutorials/autotvm/sg_execution_times.rst.txt b/docs/_sources/tutorials/autotvm/sg_execution_times.rst.txt
index e901c76..1a82314 100644
--- a/docs/_sources/tutorials/autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorials/autotvm/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:38.012** total execution time for **tutorials_autotvm** files:
+**00:37.722** total execution time for **tutorials_autotvm** files:
 
-- **00:37.200**: :ref:`sphx_glr_tutorials_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``)
-- **00:00.225**: :ref:`sphx_glr_tutorials_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``)
-- **00:00.202**: :ref:`sphx_glr_tutorials_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``)
-- **00:00.193**: :ref:`sphx_glr_tutorials_autotvm_tune_relay_mobile_gpu.py` (``tune_relay_mobile_gpu.py``)
-- **00:00.192**: :ref:`sphx_glr_tutorials_autotvm_tune_relay_arm.py` (``tune_relay_arm.py``)
+- **00:36.997**: :ref:`sphx_glr_tutorials_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``)
+- **00:00.209**: :ref:`sphx_glr_tutorials_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``)
+- **00:00.174**: :ref:`sphx_glr_tutorials_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``)
+- **00:00.171**: :ref:`sphx_glr_tutorials_autotvm_tune_relay_mobile_gpu.py` (``tune_relay_mobile_gpu.py``)
+- **00:00.171**: :ref:`sphx_glr_tutorials_autotvm_tune_relay_arm.py` (``tune_relay_arm.py``)
diff --git a/docs/_sources/tutorials/autotvm/tune_conv2d_cuda.rst.txt b/docs/_sources/tutorials/autotvm/tune_conv2d_cuda.rst.txt
index d2c7231..e92ca7b 100644
--- a/docs/_sources/tutorials/autotvm/tune_conv2d_cuda.rst.txt
+++ b/docs/_sources/tutorials/autotvm/tune_conv2d_cuda.rst.txt
@@ -242,26 +242,26 @@ for this template
        7 unroll_explicit: OtherOption([0, 1]) len=2
     )
     Get devices for measurement successfully!
-    No: 1   GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
-    No: 2   GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
-    No: 3   GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
-    No: 4   GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
-    No: 5   GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
-    No: 6   GFLOPS: 81.65/81.65     result: MeasureResult(costs=(0.0028354476052631582,), error_no=0, all_cost=1.8336963653564453, timestamp=1619468544.2176208)    [('tile_f', [-1, 1, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3754080
-    No: 7   GFLOPS: 0.00/81.65      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
-    No: 8   GFLOPS: 0.00/81.65      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
-    No: 9   GFLOPS: 0.00/81.65      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
-    No: 10  GFLOPS: 0.00/81.65      result: MeasureResult(costs=(TimeoutError(),), error_no=6, all_cost=10, timestamp=1619468555.1799824)   [('tile_f', [-1, 32, 2, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4691833
-    No: 11  GFLOPS: 0.00/81.65      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
-    No: 12  GFLOPS: 0.00/81.65      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
-    No: 13  GFLOPS: 0.00/81.65      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
-    No: 14  GFLOPS: 0.00/81.65      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
-    No: 15  GFLOPS: 0.00/81.65      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
-    No: 16  GFLOPS: 0.00/81.65      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
-    No: 17  GFLOPS: 0.00/81.65      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
-    No: 18  GFLOPS: 0.00/81.65      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
-    No: 19  GFLOPS: 0.95/81.65      result: MeasureResult(costs=(0.24381311025,), error_no=0, all_cost=6.927755832672119, timestamp=1619468564.9183898)     [('tile_f', [-1, 8, 2, 16]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6390073
-    No: 20  GFLOPS: 143.16/143.16   result: MeasureResult(costs=(0.0016170250156249998,), error_no=0, all_cost=1.5111267566680908, timestamp=1619468565.976795)     [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
+    No: 1   GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
+    No: 2   GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
+    No: 3   GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
+    No: 4   GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
+    No: 5   GFLOPS: 0.00/0.00       result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
+    No: 6   GFLOPS: 54.15/54.15     result: MeasureResult(costs=(0.004275541291666666,), error_no=0, all_cost=1.7125194072723389, timestamp=1620242423.3596973)     [('tile_f', [-1, 1, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3754080
+    No: 7   GFLOPS: 0.00/54.15      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
+    No: 8   GFLOPS: 0.00/54.15      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
+    No: 9   GFLOPS: 0.00/54.15      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
+    No: 10  GFLOPS: 0.00/54.15      result: MeasureResult(costs=(TimeoutError(),), error_no=6, all_cost=10, timestamp=1620242434.2559874)   [('tile_f', [-1, 32, 2, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4691833
+    No: 11  GFLOPS: 0.00/54.15      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
+    No: 12  GFLOPS: 0.00/54.15      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
+    No: 13  GFLOPS: 0.00/54.15      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
+    No: 14  GFLOPS: 0.00/54.15      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
+    No: 15  GFLOPS: 0.00/54.15      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
+    No: 16  GFLOPS: 0.00/54.15      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
+    No: 17  GFLOPS: 0.00/54.15      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
+    No: 18  GFLOPS: 0.00/54.15      result: MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  20: TVMFuncCall\n        at /workspace/src/runtime/c_runtime_api.cc:474\n  19: tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const\n        at /workspace/include/tvm/runtime/packed_func.h:1150\n  18: std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRet [...]
+    No: 19  GFLOPS: 0.95/54.15      result: MeasureResult(costs=(0.243582077,), error_no=0, all_cost=6.715496063232422, timestamp=1620242444.0091836)       [('tile_f', [-1, 8, 2, 16]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6390073
+    No: 20  GFLOPS: 148.89/148.89   result: MeasureResult(costs=(0.0015548858640776699,), error_no=0, all_cost=1.588977336883545, timestamp=1620242445.0803642)     [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
 
 
 
@@ -314,7 +314,7 @@ and measure running time.
 
     Best config:
     [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
-    Time cost of this operator: 0.002018
+    Time cost of this operator: 0.001969
 
 
 
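The best config above is normally re-applied from the log when building the final operator. A sketch, where ``conv2d_template`` and its arguments are hypothetical stand-ins for the tuned autotvm template, and ``conv2d.log`` for the record file:

.. code-block:: python

    # Re-use the best recorded config to instantiate and build the operator.
    import tvm
    from tvm import autotvm

    with autotvm.apply_history_best("conv2d.log"):
        with tvm.target.Target("cuda"):
            # conv2d_template / N, H, W, ... are placeholders for the tuned template
            # and the workload parameters used during tuning.
            s, arg_bufs = conv2d_template(N, H, W, CO, CI, KH, KW, strides, padding)
            func = tvm.build(s, arg_bufs)
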
diff --git a/docs/_sources/tutorials/autotvm/tune_relay_arm.rst.txt b/docs/_sources/tutorials/autotvm/tune_relay_arm.rst.txt
index ee1a79e..56b3a82 100644
--- a/docs/_sources/tutorials/autotvm/tune_relay_arm.rst.txt
+++ b/docs/_sources/tutorials/autotvm/tune_relay_arm.rst.txt
@@ -234,7 +234,7 @@ set :code:`use_android` to True if you use android phone.
             builder=autotvm.LocalBuilder(build_func="ndk" if use_android else "default"),
             runner=autotvm.RPCRunner(
                 device_key,
-                host="0.0.0.0",
+                host="127.0.0.1",
                 port=9190,
                 number=5,
                 timeout=10,
@@ -369,7 +369,7 @@ Finally, we launch tuning jobs and evaluate the end-to-end performance.
 
             # upload module to device
             print("Upload...")
-            remote = autotvm.measure.request_remote(device_key, "0.0.0.0", 9190, timeout=10000)
+            remote = autotvm.measure.request_remote(device_key, "127.0.0.1", 9190, timeout=10000)
             remote.upload(tmp.relpath(filename))
             rlib = remote.load_module(filename)
 
diff --git a/docs/_sources/tutorials/autotvm/tune_relay_cuda.rst.txt b/docs/_sources/tutorials/autotvm/tune_relay_cuda.rst.txt
index 58566d4..1cb0526 100644
--- a/docs/_sources/tutorials/autotvm/tune_relay_cuda.rst.txt
+++ b/docs/_sources/tutorials/autotvm/tune_relay_cuda.rst.txt
@@ -374,13 +374,13 @@ we need to add `--no-fork` to the argument list.)
 
 .. code-block:: bash
 
-    python -m tvm.exec.rpc_server --tracker=0.0.0.0:9190 --key=1080ti
+    python -m tvm.exec.rpc_server --tracker=127.0.0.1:9190 --key=1080ti
 
 After registering devices, we can confirm this by querying the rpc_tracker
 
 .. code-block:: bash
 
-  python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190
+  python -m tvm.exec.query_rpc_tracker --host=127.0.0.1 --port=9190
 
 For example, if we have four 1080ti, two titanx and one gfx900, the output can be
 
@@ -411,7 +411,7 @@ to replace the corresponding part above.
             builder=autotvm.LocalBuilder(timeout=10),
             runner=autotvm.RPCRunner(
                 "1080ti",  # change the device key to your key
-                "0.0.0.0",
+                "127.0.0.1",
                 9190,
                 number=20,
                 repeat=3,
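
The RPCRunner shown above is usually combined with a LocalBuilder into a measure_option that the tuner consumes. A sketch using the same device key and tracker address as in this tutorial:

.. code-block:: python

    # Build locally, measure remotely through the tracker at 127.0.0.1:9190.
    from tvm import autotvm

    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(timeout=10),
        runner=autotvm.RPCRunner("1080ti", "127.0.0.1", 9190, number=20, repeat=3),
    )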
diff --git a/docs/_sources/tutorials/autotvm/tune_relay_mobile_gpu.rst.txt b/docs/_sources/tutorials/autotvm/tune_relay_mobile_gpu.rst.txt
index 692f314..c350e96 100644
--- a/docs/_sources/tutorials/autotvm/tune_relay_mobile_gpu.rst.txt
+++ b/docs/_sources/tutorials/autotvm/tune_relay_mobile_gpu.rst.txt
@@ -234,7 +234,7 @@ set :code:`use_android` to True if you use android phone.
             builder=autotvm.LocalBuilder(build_func="ndk" if use_android else "default"),
             runner=autotvm.RPCRunner(
                 device_key,
-                host="0.0.0.0",
+                host="127.0.0.1",
                 port=9190,
                 number=10,
                 timeout=5,
@@ -365,7 +365,7 @@ Finally, we launch tuning jobs and evaluate the end-to-end performance.
 
             # upload module to device
             print("Upload...")
-            remote = autotvm.measure.request_remote(device_key, "0.0.0.0", 9190, timeout=10000)
+            remote = autotvm.measure.request_remote(device_key, "127.0.0.1", 9190, timeout=10000)
             remote.upload(tmp.relpath(filename))
             rlib = remote.load_module(filename)
 
diff --git a/docs/_sources/tutorials/dev/low_level_custom_pass.rst.txt b/docs/_sources/tutorials/dev/low_level_custom_pass.rst.txt
index 58a3eb2..0208a9b 100644
--- a/docs/_sources/tutorials/dev/low_level_custom_pass.rst.txt
+++ b/docs/_sources/tutorials/dev/low_level_custom_pass.rst.txt
@@ -74,8 +74,8 @@ our customized lowering pass to manipulate the IR directly instead of using sche
 
     primfn(a_1: handle, b_1: handle, c_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {b: Buffer(b_2: Pointer(float32), float32, [128], []),
-                 c: Buffer(c_2: Pointer(float32), float32, [128], []),
+      buffers = {c: Buffer(c_2: Pointer(float32), float32, [128], []),
+                 b: Buffer(b_2: Pointer(float32), float32, [128], []),
                  a: Buffer(a_2: Pointer(float32), float32, [128], [])}
       buffer_map = {a_1: a, b_1: b, c_1: c} {
       for (i: int32, 0, 128) {
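
The primfn above comes from a plain 128-element vector add. A sketch of how such IR can be obtained for inspection before writing the custom pass:

.. code-block:: python

    # Build a simple vector-add compute and print its lowered IR.
    import tvm
    from tvm import te

    n = 128
    a = te.placeholder((n,), name="a")
    b = te.placeholder((n,), name="b")
    c = te.compute((n,), lambda i: a[i] + b[i], name="c")
    sch = te.create_schedule(c.op)
    print(tvm.lower(sch, [a, b, c], simple_mode=True))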
diff --git a/docs/_sources/tutorials/dev/sg_execution_times.rst.txt b/docs/_sources/tutorials/dev/sg_execution_times.rst.txt
index aa219e1..7d0e90e 100644
--- a/docs/_sources/tutorials/dev/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorials/dev/sg_execution_times.rst.txt
@@ -5,8 +5,8 @@
 
 Computation times
 =================
-**00:29.059** total execution time for **tutorials_dev** files:
+**00:28.312** total execution time for **tutorials_dev** files:
 
-- **00:28.401**: :ref:`sphx_glr_tutorials_dev_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``)
-- **00:00.432**: :ref:`sphx_glr_tutorials_dev_use_pass_infra.py` (``use_pass_infra.py``)
-- **00:00.226**: :ref:`sphx_glr_tutorials_dev_low_level_custom_pass.py` (``low_level_custom_pass.py``)
+- **00:27.699**: :ref:`sphx_glr_tutorials_dev_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``)
+- **00:00.407**: :ref:`sphx_glr_tutorials_dev_use_pass_infra.py` (``use_pass_infra.py``)
+- **00:00.207**: :ref:`sphx_glr_tutorials_dev_low_level_custom_pass.py` (``low_level_custom_pass.py``)
diff --git a/docs/_sources/tutorials/frontend/deploy_model_on_android.rst.txt b/docs/_sources/tutorials/frontend/deploy_model_on_android.rst.txt
index 58f885f..d03d00a 100644
--- a/docs/_sources/tutorials/frontend/deploy_model_on_android.rst.txt
+++ b/docs/_sources/tutorials/frontend/deploy_model_on_android.rst.txt
@@ -349,7 +349,7 @@ to the remote android device.
 .. code-block:: default
 
 
-    tracker_host = os.environ.get("TVM_TRACKER_HOST", "0.0.0.0")
+    tracker_host = os.environ.get("TVM_TRACKER_HOST", "127.0.0.1")
     tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))
     key = "android"
 
@@ -417,7 +417,7 @@ Execute on TVM
 
     TVM prediction top-1: tiger cat
     Evaluate inference time cost...
-    Mean inference time (std dev): 6.30 ms (0.08 ms)
+    Mean inference time (std dev): 10.72 ms (0.20 ms)
 
 
 
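For reference, a sketch of how the tracker_host/tracker_port values above are used to obtain an RPC session to the phone registered with the ``android`` key; it assumes the tracker and the Android RPC app are already running:

.. code-block:: python

    # Connect to the tracker and request the device registered under "android".
    import os
    from tvm import rpc

    tracker_host = os.environ.get("TVM_TRACKER_HOST", "127.0.0.1")
    tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))
    tracker = rpc.connect_tracker(tracker_host, tracker_port)
    remote = tracker.request("android", priority=0, session_timeout=60)
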
diff --git a/docs/_sources/tutorials/frontend/deploy_object_detection_pytorch.rst.txt b/docs/_sources/tutorials/frontend/deploy_object_detection_pytorch.rst.txt
index f2fecb4..01c6bd2 100644
--- a/docs/_sources/tutorials/frontend/deploy_object_detection_pytorch.rst.txt
+++ b/docs/_sources/tutorials/frontend/deploy_object_detection_pytorch.rst.txt
@@ -246,7 +246,7 @@ Get boxes with score larger than 0.9
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 5 minutes  31.108 seconds)
+   **Total running time of the script:** ( 5 minutes  47.128 seconds)
 
 
 .. _sphx_glr_download_tutorials_frontend_deploy_object_detection_pytorch.py:
diff --git a/docs/_sources/tutorials/frontend/deploy_prequantized.rst.txt b/docs/_sources/tutorials/frontend/deploy_prequantized.rst.txt
index 20e59fc..e29310f 100644
--- a/docs/_sources/tutorials/frontend/deploy_prequantized.rst.txt
+++ b/docs/_sources/tutorials/frontend/deploy_prequantized.rst.txt
@@ -350,7 +350,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
  .. code-block:: none
 
-    Elapsed average ms: 21.566911189999995
+    Elapsed average ms: 38.79874884
 
 
 
diff --git a/docs/_sources/tutorials/frontend/deploy_prequantized_tflite.rst.txt b/docs/_sources/tutorials/frontend/deploy_prequantized_tflite.rst.txt
index 5a6ae66..bbb21f0 100644
--- a/docs/_sources/tutorials/frontend/deploy_prequantized_tflite.rst.txt
+++ b/docs/_sources/tutorials/frontend/deploy_prequantized_tflite.rst.txt
@@ -368,7 +368,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
  .. code-block:: none
 
-    Elapsed average ms: 42.03731231999999
+    Elapsed average ms: 69.77875007
 
 
 
@@ -401,7 +401,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 2 minutes  17.919 seconds)
+   **Total running time of the script:** ( 2 minutes  23.146 seconds)
 
 
 .. _sphx_glr_download_tutorials_frontend_deploy_prequantized_tflite.py:
diff --git a/docs/_sources/tutorials/frontend/deploy_sparse.rst.txt b/docs/_sources/tutorials/frontend/deploy_sparse.rst.txt
index e0d0139..0a1035c 100644
--- a/docs/_sources/tutorials/frontend/deploy_sparse.rst.txt
+++ b/docs/_sources/tutorials/frontend/deploy_sparse.rst.txt
@@ -219,7 +219,7 @@ to reimport from tensorflow each time this script is run.
                 with open(os.path.join(abs_path, relay_params), "wb") as fo:
                     fo.write(runtime.save_param_dict(params))
 
-        return mod, params, shape_dict
+        return mod, dict(params.items()), shape_dict
 
 
 
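The changed return value above hands the parameters back as a plain Python dict; downstream they are consumed by ``relay.build`` as usual. A sketch, assuming ``mod`` and ``params`` come from the loading function above:

.. code-block:: python

    # Build the relay module with the returned parameter dict.
    import tvm
    from tvm import relay

    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target="llvm", params=params)
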
diff --git a/docs/_sources/tutorials/frontend/deploy_ssd_gluoncv.rst.txt b/docs/_sources/tutorials/frontend/deploy_ssd_gluoncv.rst.txt
index 323d9e0..42d88f8 100644
--- a/docs/_sources/tutorials/frontend/deploy_ssd_gluoncv.rst.txt
+++ b/docs/_sources/tutorials/frontend/deploy_ssd_gluoncv.rst.txt
@@ -199,7 +199,7 @@ Display result
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  58.987 seconds)
+   **Total running time of the script:** ( 2 minutes  3.697 seconds)
 
 
 .. _sphx_glr_download_tutorials_frontend_deploy_ssd_gluoncv.py:
diff --git a/docs/_sources/tutorials/frontend/from_onnx.rst.txt b/docs/_sources/tutorials/frontend/from_onnx.rst.txt
index 9cbf850..c2f7d5f 100644
--- a/docs/_sources/tutorials/frontend/from_onnx.rst.txt
+++ b/docs/_sources/tutorials/frontend/from_onnx.rst.txt
@@ -142,7 +142,7 @@ provides a static definition of the input size.
 
  .. code-block:: none
 
-    /workspace/docs/../python/tvm/relay/frontend/onnx.py:3270: UserWarning: Mismatched attribute type in ' : kernel_shape'
+    /workspace/docs/../python/tvm/relay/frontend/onnx.py:3378: UserWarning: Mismatched attribute type in ' : kernel_shape'
 
     ==> Context: Bad node spec: input: "1" input: "2" output: "11" op_type: "Conv" attribute { name: "kernel_shape" ints: 5 ints: 5 } attribute { name: "strides" ints: 1 ints: 1 } attribute { name: "pads" ints: 2 ints: 2 ints: 2 ints: 2 } attribute { name: "dilations" ints: 1 ints: 1 } attribute { name: "group" i: 1 }
       warnings.warn(str(e))
diff --git a/docs/_sources/tutorials/frontend/from_tensorflow.rst.txt b/docs/_sources/tutorials/frontend/from_tensorflow.rst.txt
index b3e7178..cad5d8e 100644
--- a/docs/_sources/tutorials/frontend/from_tensorflow.rst.txt
+++ b/docs/_sources/tutorials/frontend/from_tensorflow.rst.txt
@@ -193,7 +193,7 @@ Results:
 
  .. code-block:: none
 
-    /workspace/docs/../python/tvm/relay/frontend/tensorflow.py:3396: UserWarning: Ignore the passed shape. Shape in graphdef will be used for operator DecodeJpeg/contents.
+    /workspace/docs/../python/tvm/relay/frontend/tensorflow.py:3401: UserWarning: Ignore the passed shape. Shape in graphdef will be used for operator DecodeJpeg/contents.
       "will be used for operator %s." % node.name
     /workspace/docs/../python/tvm/relay/frontend/tensorflow.py:899: UserWarning: DecodeJpeg: It's a pass through, please handle preprocessing before input
       warnings.warn("DecodeJpeg: It's a pass through, please handle preprocessing before input")
@@ -221,150 +221,6 @@ Results:
 
 
 
-.. rst-class:: sphx-glr-script-out
-
- Out:
-
- .. code-block:: none
-
-    conv2d NHWC layout is not optimized for x86 with autotvm.
-    [the warning above was repeated for every conv2d in the model; removed output block condensed]
-
 
 
 Execute the portable graph on TVM
diff --git a/docs/_sources/tutorials/frontend/sg_execution_times.rst.txt b/docs/_sources/tutorials/frontend/sg_execution_times.rst.txt
index e69326d..6bee43e 100644
--- a/docs/_sources/tutorials/frontend/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorials/frontend/sg_execution_times.rst.txt
@@ -5,24 +5,24 @@
 
 Computation times
 =================
-**13:54.613** total execution time for **tutorials_frontend** files:
+**15:11.992** total execution time for **tutorials_frontend** files:
 
-- **05:31.108**: :ref:`sphx_glr_tutorials_frontend_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``)
-- **02:17.919**: :ref:`sphx_glr_tutorials_frontend_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``)
-- **01:58.987**: :ref:`sphx_glr_tutorials_frontend_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``)
-- **00:42.205**: :ref:`sphx_glr_tutorials_frontend_from_tensorflow.py` (``from_tensorflow.py``)
-- **00:32.477**: :ref:`sphx_glr_tutorials_frontend_deploy_prequantized.py` (``deploy_prequantized.py``)
-- **00:28.860**: :ref:`sphx_glr_tutorials_frontend_deploy_quantized.py` (``deploy_quantized.py``)
-- **00:25.102**: :ref:`sphx_glr_tutorials_frontend_from_tflite.py` (``from_tflite.py``)
-- **00:24.157**: :ref:`sphx_glr_tutorials_frontend_from_darknet.py` (``from_darknet.py``)
-- **00:16.706**: :ref:`sphx_glr_tutorials_frontend_from_caffe2.py` (``from_caffe2.py``)
-- **00:15.660**: :ref:`sphx_glr_tutorials_frontend_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``)
-- **00:13.339**: :ref:`sphx_glr_tutorials_frontend_deploy_model_on_android.py` (``deploy_model_on_android.py``)
-- **00:11.716**: :ref:`sphx_glr_tutorials_frontend_from_pytorch.py` (``from_pytorch.py``)
-- **00:10.467**: :ref:`sphx_glr_tutorials_frontend_from_mxnet.py` (``from_mxnet.py``)
-- **00:10.196**: :ref:`sphx_glr_tutorials_frontend_from_keras.py` (``from_keras.py``)
-- **00:09.411**: :ref:`sphx_glr_tutorials_frontend_from_coreml.py` (``from_coreml.py``)
-- **00:03.145**: :ref:`sphx_glr_tutorials_frontend_using_external_lib.py` (``using_external_lib.py``)
-- **00:01.723**: :ref:`sphx_glr_tutorials_frontend_from_onnx.py` (``from_onnx.py``)
-- **00:01.212**: :ref:`sphx_glr_tutorials_frontend_build_gcn.py` (``build_gcn.py``)
-- **00:00.223**: :ref:`sphx_glr_tutorials_frontend_deploy_sparse.py` (``deploy_sparse.py``)
+- **05:47.128**: :ref:`sphx_glr_tutorials_frontend_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``)
+- **02:23.146**: :ref:`sphx_glr_tutorials_frontend_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``)
+- **02:03.697**: :ref:`sphx_glr_tutorials_frontend_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``)
+- **00:44.982**: :ref:`sphx_glr_tutorials_frontend_from_tensorflow.py` (``from_tensorflow.py``)
+- **00:44.916**: :ref:`sphx_glr_tutorials_frontend_deploy_prequantized.py` (``deploy_prequantized.py``)
+- **00:38.742**: :ref:`sphx_glr_tutorials_frontend_deploy_quantized.py` (``deploy_quantized.py``)
+- **00:29.018**: :ref:`sphx_glr_tutorials_frontend_from_darknet.py` (``from_darknet.py``)
+- **00:25.636**: :ref:`sphx_glr_tutorials_frontend_from_tflite.py` (``from_tflite.py``)
+- **00:21.957**: :ref:`sphx_glr_tutorials_frontend_from_caffe2.py` (``from_caffe2.py``)
+- **00:18.333**: :ref:`sphx_glr_tutorials_frontend_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``)
+- **00:18.069**: :ref:`sphx_glr_tutorials_frontend_deploy_model_on_android.py` (``deploy_model_on_android.py``)
+- **00:14.551**: :ref:`sphx_glr_tutorials_frontend_from_pytorch.py` (``from_pytorch.py``)
+- **00:12.998**: :ref:`sphx_glr_tutorials_frontend_from_coreml.py` (``from_coreml.py``)
+- **00:12.393**: :ref:`sphx_glr_tutorials_frontend_from_mxnet.py` (``from_mxnet.py``)
+- **00:09.859**: :ref:`sphx_glr_tutorials_frontend_from_keras.py` (``from_keras.py``)
+- **00:03.190**: :ref:`sphx_glr_tutorials_frontend_using_external_lib.py` (``using_external_lib.py``)
+- **00:01.944**: :ref:`sphx_glr_tutorials_frontend_from_onnx.py` (``from_onnx.py``)
+- **00:01.225**: :ref:`sphx_glr_tutorials_frontend_build_gcn.py` (``build_gcn.py``)
+- **00:00.208**: :ref:`sphx_glr_tutorials_frontend_deploy_sparse.py` (``deploy_sparse.py``)
diff --git a/docs/_sources/tutorials/get_started/auto_tuning_with_python.rst.txt b/docs/_sources/tutorials/get_started/auto_tuning_with_python.rst.txt
index 4a0537d..523cba0 100644
--- a/docs/_sources/tutorials/get_started/auto_tuning_with_python.rst.txt
+++ b/docs/_sources/tutorials/get_started/auto_tuning_with_python.rst.txt
@@ -229,21 +229,7 @@ runtime module from the library.
 
  .. code-block:: none
 
-
    ...47%, 0.01 MB, 197 KB/s, 0 seconds passed
    ...94%, 0.02 MB, 391 KB/s, 0 seconds passed
    ...100%, 0.02 MB, 584 KB/s, 0 seconds passed
-    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('conv2d_NCHWc.x86', ('TENSOR', (1, 256, 56, 56), 'float32'), ('TENSOR', (128, 256, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'NCHW', 'NCHW', 'float32'). A fallback configuration is used, which may bring great performance regression.
-    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('conv2d_NCHWc.x86', ('TENSOR', (1, 128, 56, 56), 'float32'), ('TENSOR', (128, 128, 3, 3), 'float32'), (2, 2), (1, 1, 1, 1), (1, 1), 'NCHW', 'NCHW', 'float32'). A fallback configuration is used, which may bring great performance regression.
-    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('conv2d_NCHWc.x86', ('TENSOR', (1, 512, 28, 28), 'float32'), ('TENSOR', (256, 512, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'NCHW', 'NCHW', 'float32'). A fallback configuration is used, which may bring great performance regression.
-    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('conv2d_NCHWc.x86', ('TENSOR', (1, 256, 28, 28), 'float32'), ('TENSOR', (256, 256, 3, 3), 'float32'), (2, 2), (1, 1, 1, 1), (1, 1), 'NCHW', 'NCHW', 'float32'). A fallback configuration is used, which may bring great performance regression.
-    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('conv2d_NCHWc.x86', ('TENSOR', (1, 1024, 14, 14), 'float32'), ('TENSOR', (512, 1024, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'NCHW', 'NCHW', 'float32'). A fallback configuration is used, which may bring great performance regression.
-    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('conv2d_NCHWc.x86', ('TENSOR', (1, 512, 14, 14), 'float32'), ('TENSOR', (512, 512, 3, 3), 'float32'), (2, 2), (1, 1, 1, 1), (1, 1), 'NCHW', 'NCHW', 'float32'). A fallback configuration is used, which may bring great performance regression.
-    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('dense_pack.x86', ('TENSOR', (1, 2048), 'float32'), ('TENSOR', (1000, 2048), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
-    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('dense_pack.x86', ('TENSOR', (1, 2048), 'float32'), ('TENSOR', (100, 2048, 10), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
-    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('conv2d_NCHWc.x86', ('TENSOR', (1, 64, 14, 14, 8), 'float32'), ('TENSOR', (64, 64, 3, 3, 8, 8), 'float32'), (2, 2), (1, 1, 1, 1), (1, 1), 'NCHW8c', 'NCHW8c', 'float32'). A fallback configuration is used, which may bring great performance regression.
-    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('conv2d_NCHWc.x86', ('TENSOR', (1, 128, 14, 14, 8), 'float32'), ('TENSOR', (64, 128, 1, 1, 8, 8), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'NCHW8c', 'NCHW8c', 'float32'). A fallback configuration is used, which may bring great performance regression.
-    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('conv2d_NCHWc.x86', ('TENSOR', (1, 32, 28, 28, 8), 'float32'), ('TENSOR', (32, 32, 3, 3, 8, 8), 'float32'), (2, 2), (1, 1, 1, 1), (1, 1), 'NCHW8c', 'NCHW8c', 'float32'). A fallback configuration is used, which may bring great performance regression.
-    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('conv2d_NCHWc.x86', ('TENSOR', (1, 64, 28, 28, 8), 'float32'), ('TENSOR', (32, 64, 1, 1, 8, 8), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'NCHW8c', 'NCHW8c', 'float32'). A fallback configuration is used, which may bring great performance regression.
-    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('conv2d_NCHWc.x86', ('TENSOR', (1, 16, 56, 56, 8), 'float32'), ('TENSOR', (16, 16, 3, 3, 8, 8), 'float32'), (2, 2), (1, 1, 1, 1), (1, 1), 'NCHW8c', 'NCHW8c', 'float32'). A fallback configuration is used, which may bring great performance regression.
-    Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('conv2d_NCHWc.x86', ('TENSOR', (1, 32, 56, 56, 8), 'float32'), ('TENSOR', (16, 32, 1, 1, 8, 8), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'NCHW8c', 'NCHW8c', 'float32'). A fallback configuration is used, which may bring great performance regression.
+
    ...47%, 0.01 MB, 196 KB/s, 0 seconds passed
    ...94%, 0.02 MB, 389 KB/s, 0 seconds passed
    ...100%, 0.02 MB, 581 KB/s, 0 seconds passed
 
 
 
@@ -310,7 +296,7 @@ standard deviation.
 
  .. code-block:: none
 
-    {'mean': 116.61725386977196, 'median': 116.69999185251072, 'std': 0.4373409880996617}
+    {'mean': 225.59652527037542, 'median': 225.24130374658853, 'std': 0.927784740794048}
 
 
 
@@ -504,31 +490,31 @@ In the simplest form, tuning requires you to provide three things:
 
  .. code-block:: none
 
-
    [Task  1/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  1/25]  Current/Best:    2.47/  41.64 GFLOPS | Progress: (8/10) | 12.78 s
    [Task  1/25]  Current/Best:    0.00/  41.64 GFLOPS | Progress: (10/10) | 22.98 s Done.
-
    [Task  2/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  2/25]  Current/Best:   34.75/  34.75 GFLOPS | Progress: (8/10) | 13.38 s
    [Task  2/25]  Current/Best:   27.45/  34.75 GFLOPS | Progress: (10/10) | 23.91 s Done.
-
    [Task  3/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  3/25]  Current/Best:   66.06/  95.57 GFLOPS | Progress: (8/10) | 6.77 s
    [Task  3/25]  Current/Best:   23.55/  95.57 GFLOPS | Progress: (10/10) | 8.51 s Done.
-
    [Task  4/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  4/25]  Current/Best:   42.72/  85.54 GFLOPS | Progress: (8/10) | 6.02 s
    [Task  4/25]  Current/Best:   40.94/  85.54 GFLOPS | Progress: (10/10) | 9.24 s Done.
-
    [Task  5/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  5/25]  Current/Best:   36.27/  89.74 GFLOPS | Progress: (8/10) | 4.46 s
    [Task  5/25]  Current/Best:   31.12/  89.74 GFLOPS | Progress: (10/10) | 5.36 s Done.
-
    [Task  6/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  6/25]  Current/Best:   91.39/  91.39 GFLOPS | Progress: (8/10) | 6.52 s
    [Task  6/25]  Current/Best:   47.82/  91.39 GFLOPS | Progress: (10/10) | 7.82 s Done.
-
    [Task  7/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  7/25]  Current/Best:   67.43/  67.43 GFLOPS | Progress: (8/10) | 5.05 s
    [Task  7/25]  Current/Best:   27.15/  67.43 GFLOPS | Progress: (10/10) | 7.86 s Done.
-
    [Task  8/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  8/25]  Current/Best:   24.10/  58.69 GFLOPS | Progress: (8/10) | 5.81 s
    [Task  8/25]  Current/Best:   92.63/  92.63 GFLOPS | Progress: (10/10) | 7.21 s Done.
-
    [Task  9/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  9/25]  Current/Best:   25.20/  95.32 GFLOPS | Progress: (8/10) | 4.54 s
    [Task  9/25]  Current/Best:   28.38/  95.32 GFLOPS | Progress: (10/10) | 5.41 s Done.
-
    [Task 10/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 10/25]  Current/Best:   59.85/  80.49 GFLOPS | Progress: (8/10) | 3.97 s
    [Task 10/25]  Current/Best:   12.33/  80.49 GFLOPS | Progress: (10/10) | 5.08 s Done.
-
    [Task 11/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 11/25]  Current/Best:   38.21/  86.10 GFLOPS | Progress: (8/10) | 4.69 s
    [Task 11/25]  Current/Best:   81.00/  86.10 GFLOPS | Progress: (10/10) | 6.75 s Done.
-
    [Task 12/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 12/25]  Current/Best:   65.01/  67.53 GFLOPS | Progress: (8/10) | 6.11 s
    [Task 12/25]  Current/Best:   53.60/  67.53 GFLOPS | Progress: (10/10) | 7.00 s Done.
-
    [Task 13/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 13/25]  Current/Best:   74.07/  74.07 GFLOPS | Progress: (8/10) | 3.39 s
    [Task 13/25]  Current/Best:   30.13/  74.07 GFLOPS | Progress: (10/10) | 4.58 s Done.
-
    [Task 14/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 14/25]  Current/Best:   27.52/  88.13 GFLOPS | Progress: (8/10) | 3.24 s
    [Task 14/25]  Current/Best:   72.62/  88.13 GFLOPS | Progress: (10/10) | 4.56 s Done.
-
    [Task 15/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 15/25]  Current/Best:   87.68/  87.68 GFLOPS | Progress: (8/10) | 3.70 s
    [Task 15/25]  Current/Best:   40.67/  87.68 GFLOPS | Progress: (10/10) | 5.24 s Done.
-
    [Task 16/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 16/25]  Current/Best:   84.71/  93.38 GFLOPS | Progress: (8/10) | 3.49 s
    [Task 16/25]  Current/Best:   12.72/  93.38 GFLOPS | Progress: (10/10) | 4.54 s Done.
-
    [Task 17/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 17/25]  Current/Best:   54.95/  58.79 GFLOPS | Progress: (8/10) | 5.54 s
    [Task 17/25]  Current/Best:   55.53/  62.90 GFLOPS | Progress: (10/10) | 7.61 s Done.
-
    [Task 18/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 18/25]  Current/Best:   46.00/  66.74 GFLOPS | Progress: (8/10) | 4.97 s
    [Task 18/25]  Current/Best:   31.68/  66.74 GFLOPS | Progress: (10/10) | 7.06 s Done.
-
    [Task 19/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 19/25]  Current/Best:   42.06/  96.40 GFLOPS | Progress: (8/10) | 3.48 s
    [Task 19/25]  Current/Best:   43.79/  96.40 GFLOPS | Progress: (10/10) | 4.34 s Done.
-
    [Task 20/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 20/25]  Current/Best:   55.24/  60.53 GFLOPS | Progress: (8/10) | 3.76 s
    [Task 20/25]  Current/Best:   22.41/  60.53 GFLOPS | Progress: (10/10) | 6.62 s Done.
-
    [Task 21/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 21/25]  Current/Best:   68.04/  82.73 GFLOPS | Progress: (8/10) | 4.59 s
    [Task 21/25]  Current/Best:   25.97/  82.73 GFLOPS | Progress: (10/10) | 5.89 s Done.
-
    [Task 22/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 22/25]  Current/Best:   53.42/  90.49 GFLOPS | Progress: (8/10) | 7.30 s
    [Task 22/25]  Current/Best:   25.25/  90.49 GFLOPS | Progress: (10/10) | 8.85 s Done.
-
    [Task 23/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 23/25]  Current/Best:   90.80/  90.80 GFLOPS | Progress: (8/10) | 5.29 s
    [Task 23/25]  Current/Best:   21.70/  90.80 GFLOPS | Progress: (10/10) | 15.85 s Done.
-
    [Task 24/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 24/25]  Current/Best:   39.03/  82.10 GFLOPS | Progress: (8/10) | 5.57 s
    [Task 24/25]  Current/Best:   21.93/  82.10 GFLOPS | Progress: (10/10) | 9.29 s Done.
-
    [Task 25/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 25/25]  Current/Best:   32.55/  66.52 GFLOPS | Progress: (8/10) | 4.69 s
    [Task 25/25]  Current/Best:   81.08/  81.08 GFLOPS | Progress: (10/10) | 6.38 s Done.
+
    [Task  1/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  1/25]  Current/Best:    2.60/  11.09 GFLOPS | Progress: (4/10) | 11.49 s
    [Task  1/25]  Current/Best:   18.80/  18.80 GFLOPS | Progress: (8/10) | 22.26 s
    [Task  1/25]  Current/Best:   21.19/  23.94 GFLOPS | Progress: (10/10) | 22.84 s Done.
+
    [Task  2/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  2/25]  Current/Best:    8.17/  20.44 GFLOPS | Progress: (4/10) | 2.56 s
    [Task  2/25]  Current/Best:    8.23/  20.44 GFLOPS | Progress: (8/10) | 5.26 s
    [Task  2/25]  Current/Best:   12.49/  20.44 GFLOPS | Progress: (10/10) | 10.81 s Done.
+
    [Task  3/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  3/25]  Current/Best:   17.79/  48.71 GFLOPS | Progress: (4/10) | 2.02 s
    [Task  3/25]  Current/Best:   22.90/  48.71 GFLOPS | Progress: (8/10) | 6.87 s
    [Task  3/25]  Current/Best:   43.33/  48.71 GFLOPS | Progress: (10/10) | 7.82 s Done.
+
    [Task  4/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  4/25]  Current/Best:   15.37/  49.78 GFLOPS | Progress: (4/10) | 4.26 s
    [Task  4/25]  Current/Best:   18.49/  49.78 GFLOPS | Progress: (8/10) | 6.43 s
    [Task  4/25]  Current/Best:   22.16/  49.78 GFLOPS | Progress: (10/10) | 7.32 s Done.
+
    [Task  5/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  5/25]  Current/Best:   17.21/  48.57 GFLOPS | Progress: (4/10) | 5.11 s
    [Task  5/25]  Current/Best:   27.82/  48.57 GFLOPS | Progress: (8/10) | 7.46 s
    [Task  5/25]  Current/Best:   20.97/  48.57 GFLOPS | Progress: (10/10) | 10.21 s Done.
+
    [Task  6/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  6/25]  Current/Best:   25.14/  37.00 GFLOPS | Progress: (4/10) | 4.88 s
    [Task  6/25]  Current/Best:   42.58/  42.58 GFLOPS | Progress: (8/10) | 8.10 s
    [Task  6/25]  Current/Best:   27.71/  42.58 GFLOPS | Progress: (10/10) | 9.23 s Done.
+
    [Task  7/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  7/25]  Current/Best:   27.53/  36.35 GFLOPS | Progress: (4/10) | 2.95 s
    [Task  7/25]  Current/Best:   27.24/  38.54 GFLOPS | Progress: (8/10) | 4.92 s
    [Task  7/25]  Current/Best:   37.83/  38.54 GFLOPS | Progress: (10/10) | 5.68 s Done.
+
    [Task  8/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  8/25]  Current/Best:   39.95/  39.95 GFLOPS | Progress: (4/10) | 3.57 s
    [Task  8/25]  Current/Best:   30.47/  45.57 GFLOPS | Progress: (8/10) | 5.60 s
    [Task  8/25]  Current/Best:   14.18/  45.57 GFLOPS | Progress: (10/10) | 6.31 s Done.
+
    [Task  9/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  9/25]  Current/Best:    6.23/  41.29 GFLOPS | Progress: (4/10) | 3.13 s
    [Task  9/25]  Current/Best:   39.13/  41.29 GFLOPS | Progress: (8/10) | 5.43 s
    [Task  9/25]  Current/Best:   12.32/  41.29 GFLOPS | Progress: (10/10) | 6.34 s Done.
+
    [Task 10/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 10/25]  Current/Best:   40.27/  40.27 GFLOPS | Progress: (4/10) | 3.70 s
    [Task 10/25]  Current/Best:   37.78/  42.86 GFLOPS | Progress: (8/10) | 4.91 s
    [Task 10/25]  Current/Best:   22.32/  42.86 GFLOPS | Progress: (10/10) | 5.55 s Done.
+
    [Task 11/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 11/25]  Current/Best:   14.21/  31.72 GFLOPS | Progress: (4/10) | 2.78 s
    [Task 11/25]  Current/Best:   45.54/  45.54 GFLOPS | Progress: (8/10) | 5.95 s
    [Task 11/25]  Current/Best:   42.64/  45.54 GFLOPS | Progress: (10/10) | 6.70 s Done.
+
    [Task 12/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 12/25]  Current/Best:   21.01/  39.08 GFLOPS | Progress: (4/10) | 2.35 s
    [Task 12/25]  Current/Best:   26.49/  47.65 GFLOPS | Progress: (8/10) | 5.64 s
    [Task 12/25]  Current/Best:   30.03/  47.65 GFLOPS | Progress: (10/10) | 7.05 s Done.
+
    [Task 13/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 13/25]  Current/Best:   22.76/  42.24 GFLOPS | Progress: (4/10) | 11.53 s
    [Task 13/25]  Current/Best:   37.60/  42.24 GFLOPS | Progress: (8/10) | 14.79 s
    [Task 13/25]  Current/Best:   20.74/  42.24 GFLOPS | Progress: (10/10) | 16.81 s Done.
+
    [Task 14/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 14/25]  Current/Best:   12.17/  35.13 GFLOPS | Progress: (4/10) | 2.63 s
    [Task 14/25]  Current/Best:   43.52/  43.52 GFLOPS | Progress: (8/10) | 4.66 s
    [Task 14/25]  Current/Best:   20.67/  43.52 GFLOPS | Progress: (10/10) | 5.70 s Done.
+
    [Task 15/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 15/25]  Current/Best:   31.44/  47.13 GFLOPS | Progress: (4/10) | 2.06 s
    [Task 15/25]  Current/Best:   23.42/  47.13 GFLOPS | Progress: (8/10) | 3.32 s
    [Task 15/25]  Current/Best:   15.07/  47.13 GFLOPS | Progress: (10/10) | 4.26 s Done.
+
    [Task 16/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 16/25]  Current/Best:   26.71/  30.86 GFLOPS | Progress: (4/10) | 3.15 s
    [Task 16/25]  Current/Best:   43.52/  43.52 GFLOPS | Progress: (8/10) | 5.14 s
    [Task 16/25]  Current/Best:   36.01/  47.93 GFLOPS | Progress: (10/10) | 5.90 s Done.
+
    [Task 17/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 17/25]  Current/Best:   26.48/  29.09 GFLOPS | Progress: (4/10) | 2.96 s
    [Task 17/25]  Current/Best:   27.94/  39.45 GFLOPS | Progress: (8/10) | 5.09 s
    [Task 17/25]  Current/Best:   28.88/  39.45 GFLOPS | Progress: (10/10) | 6.56 s Done.
+
    [Task 18/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 18/25]  Current/Best:   11.93/  29.03 GFLOPS | Progress: (4/10) | 4.12 s
    [Task 18/25]  Current/Best:   35.41/  46.49 GFLOPS | Progress: (8/10) | 7.07 s
    [Task 18/25]  Current/Best:   27.88/  46.49 GFLOPS | Progress: (10/10) | 8.38 s Done.
+
    [Task 19/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 19/25]  Current/Best:   31.03/  44.06 GFLOPS | Progress: (4/10) | 2.56 s
    [Task 19/25]  Current/Best:   22.37/  44.06 GFLOPS | Progress: (8/10) | 4.28 s
    [Task 19/25]  Current/Best:   28.55/  44.06 GFLOPS | Progress: (10/10) | 5.25 s Done.
+
    [Task 20/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 20/25]  Current/Best:    9.99/  37.63 GFLOPS | Progress: (4/10) | 2.39 s
    [Task 20/25]  Current/Best:   12.62/  37.63 GFLOPS | Progress: (8/10) | 4.14 s
    [Task 20/25]  Current/Best:    8.84/  37.63 GFLOPS | Progress: (10/10) | 5.20 s Done.
+
    [Task 21/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 21/25]  Current/Best:   18.23/  35.35 GFLOPS | Progress: (4/10) | 2.29 s
    [Task 21/25]  Current/Best:   40.33/  40.33 GFLOPS | Progress: (8/10) | 3.68 s
    [Task 21/25]  Current/Best:   19.23/  40.33 GFLOPS | Progress: (10/10) | 4.92 s Done.
+
    [Task 22/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 22/25]  Current/Best:   23.56/  24.31 GFLOPS | Progress: (4/10) | 2.53 s
    [Task 22/25]  Current/Best:   30.03/  47.50 GFLOPS | Progress: (8/10) | 4.24 s
    [Task 22/25]  Current/Best:   33.91/  47.50 GFLOPS | Progress: (10/10) | 5.32 s Done.
+
    [Task 23/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 23/25]  Current/Best:   26.64/  45.66 GFLOPS | Progress: (4/10) | 2.48 s
    [Task 23/25]  Current/Best:    6.83/  45.66 GFLOPS | Progress: (8/10) | 6.54 s
    [Task 23/25]  Current/Best:   27.63/  45.66 GFLOPS | Progress: (10/10) | 9.89 s Done.
+
    [Task 24/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 24/25]  Current/Best:   27.19/  33.53 GFLOPS | Progress: (4/10) | 5.88 s
    [Task 24/25]  Current/Best:   35.87/  38.58 GFLOPS | Progress: (8/10) | 7.40 s
    [Task 24/25]  Current/Best:   38.21/  38.58 GFLOPS | Progress: (10/10) | 8.16 s Done.
+
    [Task 25/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 25/25]  Current/Best:   11.88/  46.89 GFLOPS | Progress: (4/10) | 3.07 s
    [Task 25/25]  Current/Best:   41.79/  46.89 GFLOPS | Progress: (8/10) | 4.52 s
    [Task 25/25]  Current/Best:   27.92/  46.89 GFLOPS | Progress: (10/10) | 5.55 s Done.
 
 
 
@@ -671,8 +657,8 @@ improvement in comparing the optimized model to the unoptimized model.
 
  .. code-block:: none
 
-    optimized: {'mean': 91.3871666014893, 'median': 91.30017985007726, 'std': 0.2475099150760818}
-    unoptimized: {'mean': 116.61725386977196, 'median': 116.69999185251072, 'std': 0.4373409880996617}
+    optimized: {'mean': 172.70571838016622, 'median': 172.77562514937017, 'std': 0.16911500694668277}
+    unoptimized: {'mean': 225.59652527037542, 'median': 225.24130374658853, 'std': 0.927784740794048}
 
 
 
@@ -692,7 +678,7 @@ profiling/benchmarking.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 4 minutes  37.163 seconds)
+   **Total running time of the script:** ( 4 minutes  53.432 seconds)
 
 
 .. _sphx_glr_download_tutorials_get_started_auto_tuning_with_python.py:
diff --git a/docs/_sources/tutorials/get_started/cross_compilation_and_rpc.rst.txt b/docs/_sources/tutorials/get_started/cross_compilation_and_rpc.rst.txt
index 190b2b3..1406b28 100644
--- a/docs/_sources/tutorials/get_started/cross_compilation_and_rpc.rst.txt
+++ b/docs/_sources/tutorials/get_started/cross_compilation_and_rpc.rst.txt
@@ -235,7 +235,7 @@ device and returns the measured cost. Network overhead is excluded.
 
  .. code-block:: none
 
-    1.21e-07 secs/op
+    1.228e-07 secs/op
 
 
 
diff --git a/docs/_sources/tutorials/get_started/relay_quick_start.rst.txt b/docs/_sources/tutorials/get_started/relay_quick_start.rst.txt
index f6c32ad..9de9765 100644
--- a/docs/_sources/tutorials/get_started/relay_quick_start.rst.txt
+++ b/docs/_sources/tutorials/get_started/relay_quick_start.rst.txt
@@ -225,7 +225,7 @@ in this example. Then the machine code will be generated as the module library.
 
  .. code-block:: none
 
-
    ...1%, 0.01 MB, 41 KB/s, 0 seconds passed
    ...3%, 0.02 MB, 82 KB/s, 0 seconds passed
    ...5%, 0.02 MB, 123 KB/s, 0 seconds passed
    ...6%, 0.03 MB, 164 KB/s, 0 seconds passed
    ...8%, 0.04 MB, 205 KB/s, 0 seconds passed
    ...10%, 0.05 MB, 238 KB/s, 0 seconds passed
    ...11%, 0.05 MB, 277 KB/s, 0 seconds passed
    ...13%, 0.06 MB, 316 KB/s, 0 seconds passed
    ...15%, 0.07 MB, 355 KB/s, 0 seconds passed
    ...16%, 0.08 MB, 394 KB/s, 0 seconds passed
    ...18%, 0.09 MB, 431 KB/s, 0 seconds passed
    ...20%, 0.09 MB, 457 KB/s, 0 seconds passed
    ...21%, 0.10 MB, 495 KB/s, 0 seconds passed
    ...23%, 0.11 MB, 533 KB/s, 0 seconds passed
    ...25%, 0.12 MB, 569 KB/s, 0 seconds passed
    ...26%, 0.12 MB, 607 KB/s, 0 seconds passed
    ...28%, 0.13 MB, 642 KB/s, 0 seconds passed
    ...30%, 0.14 MB, 680 KB/s, 0 seconds passed
    ...31%, 0.15 MB, 716 KB/s, 0 seconds passed
    ...33%, 0.16 MB, 754 KB/s, 0 seconds passed
    ...35%, 0.16 MB, 788 KB/s, 0 seconds passed
    ...36%, 0.17 MB, 825 KB/s, 0 seconds passed
    ...38%, 0.18 MB, 862 KB/s, 0 seconds passed
    ...40%, 0.19 MB, 899 KB/s, 0 seconds passed
    ...41%, 0.20 MB, 936 KB/s, 0 seconds passed
    ...43%, 0.20 MB, 970 KB/s, 0 seconds passed
    ...45%, 0.21 MB, 1007 KB/s, 0 seconds passed
    ...46%, 0.22 MB, 1044 KB/s, 0 seconds passed
    ...48%, 0.23 MB, 1061 KB/s, 0 seconds passed
    ...50%, 0.23 MB, 1097 KB/s, 0 seconds passed
    ...51%, 0.24 MB, 1133 KB/s, 0 seconds passed
    ...53%, 0.25 MB, 1169 KB/s, 0 seconds passed
    ...55%, 0.26 MB, 1202 KB/s, 0 seconds passed
    ...56%, 0.27 MB, 1238 KB/s, 0 seconds passed
    ...58%, 0.27 MB, 1274 KB/s, 0 seconds passed
    ...60%, 0.28 MB, 1310 KB/s, 0 seconds passed
    ...61%, 0.29 MB, 1343 KB/s, 0 seconds passed
    ...63%, 0.30 MB, 1379 KB/s, 0 seconds passed
    ...65%, 0.30 MB, 1411 KB/s, 0 seconds passed
    ...66%, 0.31 MB, 1447 KB/s, 0 seconds passed
    ...68%, 0.32 MB, 1483 KB/s, 0 seconds passed
    ...70%, 0.33 MB, 1519 KB/s, 0 seconds passed
    ...71%, 0.34 MB, 1554 KB/s, 0 seconds passed
    ...73%, 0.34 MB, 1590 KB/s, 0 seconds passed
    ...75%, 0.35 MB, 1623 KB/s, 0 seconds passed
    ...76%, 0.36 MB, 1658 KB/s, 0 seconds passed
    ...78%, 0.37 MB, 1690 KB/s, 0 seconds passed
    ...80%, 0.38 MB, 1726 KB/s, 0 seconds passed
    ...81%, 0.38 MB, 1761 KB/s, 0 seconds passed
    ...83%, 0.39 MB, 1796 KB/s, 0 seconds passed
    ...85%, 0.40 MB, 1832 KB/s, 0 seconds passed
    ...86%, 0.41 MB, 1867 KB/s, 0 seconds passed
    ...88%, 0.41 MB, 1902 KB/s, 0 seconds passed
    ...90%, 0.42 MB, 1932 KB/s, 0 seconds passed
    ...91%, 0.43 MB, 1968 KB/s, 0 seconds passed
    ...93%, 0.44 MB, 2003 KB/s, 0 seconds passed
    ...95%, 0.45 MB, 2038 KB/s, 0 seconds passed
    ...96%, 0.45 MB, 2065 KB/s, 0 seconds passed
    ...98%, 0.46 MB, 2100 KB/s, 0 seconds passed
    ...100%, 0.47 MB, 2132 KB/s, 0 seconds passed
+
    ...1%, 0.01 MB, 45 KB/s, 0 seconds passed
    ...3%, 0.02 MB, 91 KB/s, 0 seconds passed
    ...5%, 0.02 MB, 136 KB/s, 0 seconds passed
    ...6%, 0.03 MB, 182 KB/s, 0 seconds passed
    ...8%, 0.04 MB, 226 KB/s, 0 seconds passed
    ...10%, 0.05 MB, 265 KB/s, 0 seconds passed
    ...11%, 0.05 MB, 307 KB/s, 0 seconds passed
    ...13%, 0.06 MB, 349 KB/s, 0 seconds passed
    ...15%, 0.07 MB, 392 KB/s, 0 seconds passed
    ...16%, 0.08 MB, 434 KB/s, 0 seconds passed
    ...18%, 0.09 MB, 475 KB/s, 0 seconds passed
    ...20%, 0.09 MB, 511 KB/s, 0 seconds passed
    ...21%, 0.10 MB, 552 KB/s, 0 seconds passed
    ...23%, 0.11 MB, 594 KB/s, 0 seconds passed
    ...25%, 0.12 MB, 634 KB/s, 0 seconds passed
    ...26%, 0.12 MB, 672 KB/s, 0 seconds passed
    ...28%, 0.13 MB, 710 KB/s, 0 seconds passed
    ...30%, 0.14 MB, 752 KB/s, 0 seconds passed
    ...31%, 0.15 MB, 790 KB/s, 0 seconds passed
    ...33%, 0.16 MB, 828 KB/s, 0 seconds passed
    ...35%, 0.16 MB, 865 KB/s, 0 seconds passed
    ...36%, 0.17 MB, 900 KB/s, 0 seconds passed
    ...38%, 0.18 MB, 939 KB/s, 0 seconds passed
    ...40%, 0.19 MB, 978 KB/s, 0 seconds passed
    ...41%, 0.20 MB, 1012 KB/s, 0 seconds passed
    ...43%, 0.20 MB, 1047 KB/s, 0 seconds passed
    ...45%, 0.21 MB, 1082 KB/s, 0 seconds passed
    ...46%, 0.22 MB, 1122 KB/s, 0 seconds passed
    ...48%, 0.23 MB, 1158 KB/s, 0 seconds passed
    ...50%, 0.23 MB, 1192 KB/s, 0 seconds passed
    ...51%, 0.24 MB, 1224 KB/s, 0 seconds passed
    ...53%, 0.25 MB, 1263 KB/s, 0 seconds passed
    ...55%, 0.26 MB, 1297 KB/s, 0 seconds passed
    ...56%, 0.27 MB, 1329 KB/s, 0 seconds passed
    ...58%, 0.27 MB, 1361 KB/s, 0 seconds passed
    ...60%, 0.28 MB, 1394 KB/s, 0 seconds passed
    ...61%, 0.29 MB, 1432 KB/s, 0 seconds passed
    ...63%, 0.30 MB, 1459 KB/s, 0 seconds passed
    ...65%, 0.30 MB, 1497 KB/s, 0 seconds passed
    ...66%, 0.31 MB, 1526 KB/s, 0 seconds passed
    ...68%, 0.32 MB, 1563 KB/s, 0 seconds passed
    ...70%, 0.33 MB, 1586 KB/s, 0 seconds passed
    ...71%, 0.34 MB, 1623 KB/s, 0 seconds passed
    ...73%, 0.34 MB, 1654 KB/s, 0 seconds passed
    ...75%, 0.35 MB, 1691 KB/s, 0 seconds passed
    ...76%, 0.36 MB, 1721 KB/s, 0 seconds passed
    ...78%, 0.37 MB, 1758 KB/s, 0 seconds passed
    ...80%, 0.38 MB, 1781 KB/s, 0 seconds passed
    ...81%, 0.38 MB, 1817 KB/s, 0 seconds passed
    ...83%, 0.39 MB, 1843 KB/s, 0 seconds passed
    ...85%, 0.40 MB, 1879 KB/s, 0 seconds passed
    ...86%, 0.41 MB, 1910 KB/s, 0 seconds passed
    ...88%, 0.41 MB, 1946 KB/s, 0 seconds passed
    ...90%, 0.42 MB, 1972 KB/s, 0 seconds passed
    ...91%, 0.43 MB, 2007 KB/s, 0 seconds passed
    ...93%, 0.44 MB, 2035 KB/s, 0 seconds passed
    ...95%, 0.45 MB, 2071 KB/s, 0 seconds passed
    ...96%, 0.45 MB, 2095 KB/s, 0 seconds passed
    ...98%, 0.46 MB, 2130 KB/s, 0 seconds passed
    ...100%, 0.47 MB, 2164 KB/s, 0 seconds passed
 
 
 
diff --git a/docs/_sources/tutorials/get_started/sg_execution_times.rst.txt b/docs/_sources/tutorials/get_started/sg_execution_times.rst.txt
index 79c9241..00b7eed 100644
--- a/docs/_sources/tutorials/get_started/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorials/get_started/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
 
 Computation times
 =================
-**06:29.934** total execution time for **tutorials_get_started** files:
+**07:07.655** total execution time for **tutorials_get_started** files:
 
-- **04:37.163**: :ref:`sphx_glr_tutorials_get_started_auto_tuning_with_python.py` (``auto_tuning_with_python.py``)
-- **00:56.087**: :ref:`sphx_glr_tutorials_get_started_tensor_expr_get_started.py` (``tensor_expr_get_started.py``)
-- **00:22.911**: :ref:`sphx_glr_tutorials_get_started_tune_matmul_x86.py` (``tune_matmul_x86.py``)
-- **00:17.161**: :ref:`sphx_glr_tutorials_get_started_relay_quick_start.py` (``relay_quick_start.py``)
-- **00:16.289**: :ref:`sphx_glr_tutorials_get_started_autotvm_matmul.py` (``autotvm_matmul.py``)
-- **00:00.195**: :ref:`sphx_glr_tutorials_get_started_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``)
-- **00:00.046**: :ref:`sphx_glr_tutorials_get_started_introduction.py` (``introduction.py``)
-- **00:00.042**: :ref:`sphx_glr_tutorials_get_started_install.py` (``install.py``)
-- **00:00.040**: :ref:`sphx_glr_tutorials_get_started_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``)
+- **04:53.432**: :ref:`sphx_glr_tutorials_get_started_auto_tuning_with_python.py` (``auto_tuning_with_python.py``)
+- **01:02.464**: :ref:`sphx_glr_tutorials_get_started_tensor_expr_get_started.py` (``tensor_expr_get_started.py``)
+- **00:34.423**: :ref:`sphx_glr_tutorials_get_started_tune_matmul_x86.py` (``tune_matmul_x86.py``)
+- **00:19.062**: :ref:`sphx_glr_tutorials_get_started_relay_quick_start.py` (``relay_quick_start.py``)
+- **00:18.018**: :ref:`sphx_glr_tutorials_get_started_autotvm_matmul.py` (``autotvm_matmul.py``)
+- **00:00.162**: :ref:`sphx_glr_tutorials_get_started_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``)
+- **00:00.040**: :ref:`sphx_glr_tutorials_get_started_introduction.py` (``introduction.py``)
+- **00:00.027**: :ref:`sphx_glr_tutorials_get_started_install.py` (``install.py``)
+- **00:00.026**: :ref:`sphx_glr_tutorials_get_started_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``)
diff --git a/docs/_sources/tutorials/get_started/tensor_expr_get_started.rst.txt b/docs/_sources/tutorials/get_started/tensor_expr_get_started.rst.txt
index 470656f..bc0a605 100644
--- a/docs/_sources/tutorials/get_started/tensor_expr_get_started.rst.txt
+++ b/docs/_sources/tutorials/get_started/tensor_expr_get_started.rst.txt
@@ -236,8 +236,8 @@ helper function to run a profile of the TVM generated code.
 
  .. code-block:: none
 
-    Numpy running time: 0.000007
-    naive: 0.000006
+    Numpy running time: 0.000014
+    naive: 0.000011
 
 
 
@@ -327,7 +327,7 @@ compile and run this new schedule with the parallel operation applied:
 
  .. code-block:: none
 
-    parallel: 0.000003
+    parallel: 0.000005
 
 
 
@@ -380,7 +380,7 @@ factor to be the number of threads on your CPU.
 
  .. code-block:: none
 
-    vector: 0.000009
+    vector: 0.000017
     primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
       buffers = {C: Buffer(C_2: Pointer(float32), float32, [n: int32], [stride: int32], type="auto"),
@@ -428,10 +428,10 @@ We can now compare the different schedules
  .. code-block:: none
 
                 Operator                  Timing             Performance
-                   numpy    6.980659672990442e-06                    1.0
-                   naive              5.6905e-06      0.8151808377104637
-                parallel    3.3419999999999995e-06    0.4787513152848378
-                  vector              8.8547e-06      1.2684617808057013
+                   numpy    1.3710560160689056e-05                   1.0
+                   naive    1.0926500000000001e-05    0.7969404511515498
+                parallel    5.0590999999999996e-06    0.3689929470938365
+                  vector             1.65544e-05       1.207419668195965
 
 
 
@@ -601,7 +601,7 @@ The following code first performs the following steps:
 
  .. code-block:: none
 
-    ['myadd.o', 'myadd.so']
+    ['myadd.so', 'myadd.o']
 
 
 
@@ -816,7 +816,7 @@ matrix multiplication.
 
  .. code-block:: none
 
-    Numpy running time: 0.008237
+    Numpy running time: 0.012868
 
 
 
@@ -872,7 +872,7 @@ optimizations.
 
  .. code-block:: none
 
-    none: 3.238522
+    none: 3.508450
 
 
 
@@ -968,7 +968,7 @@ schedule.
 
  .. code-block:: none
 
-    blocking: 0.314021
+    blocking: 0.289656
 
 
 
@@ -994,8 +994,8 @@ internal representation and compare it to the original:
 
     primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], []),
-                 C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
+      buffers = {C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
+                 B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], []),
                  A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], [])}
       buffer_map = {A_1: A, B_1: B, C_1: C} {
       for (x.outer: int32, 0, 32) {
@@ -1056,11 +1056,11 @@ already cache friendly from our previous optimizations.
 
  .. code-block:: none
 
-    vectorization: 0.325936
+    vectorization: 0.324798
     primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
-                 B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], []),
+      buffers = {B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], []),
+                 C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
                  A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], [])}
       buffer_map = {A_1: A, B_1: B, C_1: C} {
       for (x.outer: int32, 0, 32) {
@@ -1124,7 +1124,7 @@ more cache friendly.
 
  .. code-block:: none
 
-    loop permutation: 0.114230
+    loop permutation: 0.109318
     primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
       buffers = {C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
@@ -1217,7 +1217,7 @@ optimized schedule.
 
  .. code-block:: none
 
-    array packing: 0.101902
+    array packing: 0.201209
     primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
       buffers = {C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
@@ -1305,7 +1305,7 @@ to `C` when all the block results are ready.
 
  .. code-block:: none
 
-    block caching: 0.107511
+    block caching: 0.185369
     primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
       buffers = {C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
@@ -1384,11 +1384,11 @@ of thread-level parallelization.
 
  .. code-block:: none
 
-    parallelization: 0.033461
+    parallelization: 0.057319
     primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
-                 B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], []),
+      buffers = {B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], []),
+                 C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
                  A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], [])}
       buffer_map = {A_1: A, B_1: B, C_1: C} {
       attr [packedB: Pointer(float32x32)] "storage_scope" = "global";
@@ -1458,13 +1458,13 @@ working, we can compare the results.
  .. code-block:: none
 
                 Operator                  Timing             Performance
-                    none             3.238521834                     1.0
-                blocking            0.3140211334     0.09696434036763699
-           vectorization            0.3259355685      0.1006433135877385
-        loop permutation            0.1142302892    0.035272354195898865
-           array packing            0.1019016828     0.03146549198161126
-           block caching            0.1075105775     0.03319742246950063
-         parallelization            0.0334612482    0.010332259566294466
+                    none      3.5084501237999994                     1.0
+                blocking            0.2896558851     0.08255949917460248
+           vectorization     0.32479784629999997     0.09257587676583862
+        loop permutation     0.10931828029999999     0.03115856758470816
+           array packing            0.2012089377    0.057349807065825056
+           block caching            0.1853688535     0.05283496899172879
+         parallelization            0.0573185166     0.01633727559960817
 
 
 
@@ -1499,6 +1499,11 @@ operations with tunable parameters that allows you to automatically optimize
 the computation for specific platforms.
 
 
+.. rst-class:: sphx-glr-timing
+
+   **Total running time of the script:** ( 1 minutes  2.464 seconds)
+
+
 .. _sphx_glr_download_tutorials_get_started_tensor_expr_get_started.py:
 
 
diff --git a/docs/_sources/tutorials/get_started/tune_matmul_x86.rst.txt b/docs/_sources/tutorials/get_started/tune_matmul_x86.rst.txt
index cb4b870..bdec232 100644
--- a/docs/_sources/tutorials/get_started/tune_matmul_x86.rst.txt
+++ b/docs/_sources/tutorials/get_started/tune_matmul_x86.rst.txt
@@ -186,6 +186,7 @@ trials, we can load the best schedule from the log file and apply it.
 
 
 
+
 Inspecting the Optimized Schedule
 ---------------------------------
 We can lower the schedule to see the IR after auto-scheduling.  The
@@ -213,55 +214,77 @@ operator fusion.
     Lowered TIR:
     primfn(A_1: handle, B_1: handle, C_1: handle, out_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
-                 out: Buffer(out_2: Pointer(float32), float32, [1024, 1024], []),
+      buffers = {out: Buffer(out_2: Pointer(float32), float32, [1024, 1024], []),
+                 C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
                  B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], []),
                  A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], [])}
       buffer_map = {A_1: A, B_1: B, C_1: C, out_1: out} {
       attr [auto_scheduler_layout_transform: Pointer(float32)] "storage_scope" = "global";
-      allocate(auto_scheduler_layout_transform, float32, [1048576]) {
+      allocate(auto_scheduler_layout_transform, float32, [1048576]);
+      attr [matmul: Pointer(float32)] "storage_scope" = "global";
+      allocate(matmul, float32, [1048576]) {
         for (ax0.ax1.fused.ax2.fused: int32, 0, 128) "parallel" {
-          for (ax4: int32, 0, 256) {
-            for (ax6: int32, 0, 4) {
-              for (ax7: int32, 0, 8) {
-                auto_scheduler_layout_transform[((((ax0.ax1.fused.ax2.fused*8192) + (ax4*32)) + (ax6*8)) + ax7)] = (float32*)B_2[((((ax4*4096) + (ax6*1024)) + (ax0.ax1.fused.ax2.fused*8)) + ax7)]
+          for (ax3: int32, 0, 128) {
+            for (ax4: int32, 0, 8) {
+              for (ax5: int32, 0, 8) {
+                auto_scheduler_layout_transform[((((ax0.ax1.fused.ax2.fused*8192) + (ax3*64)) + (ax4*8)) + ax5)] = (float32*)B_2[((((ax0.ax1.fused.ax2.fused*8192) + (ax4*1024)) + (ax3*8)) + ax5)]
               }
             }
           }
         }
-        for (i.outer.outer.j.outer.outer.fused: int32, 0, 4096) "parallel" {
-          attr [matmul: Pointer(float32x8)] "storage_scope" = "global";
-          allocate(matmul, float32x8, [4]);
-          for (i.outer.inner: int32, 0, 2) {
-            for (j.outer.inner: int32, 0, 4) {
-              matmul[ramp(0, 1, 8)] = broadcast(0f32, 8)
-              matmul[ramp(8, 1, 8)] = broadcast(0f32, 8)
-              matmul[ramp(16, 1, 8)] = broadcast(0f32, 8)
-              matmul[ramp(24, 1, 8)] = broadcast(0f32, 8)
-              for (k.outer: int32, 0, 256) {
-                matmul[ramp(0, 1, 8)] = ((float32x8*)matmul[ramp(0, 1, 8)] + (broadcast((float32*)A_2[(((floordiv(i.outer.outer.j.outer.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (k.outer*4))], 8)*(float32x8*)auto_scheduler_layout_transform[ramp((((floormod(i.outer.outer.j.outer.outer.fused, 32)*32768) + (j.outer.inner*8192)) + (k.outer*32)), 1, 8)]))
-                matmul[ramp(8, 1, 8)] = ((float32x8*)matmul[ramp(8, 1, 8)] + (broadcast((float32*)A_2[((((floordiv(i.outer.outer.j.outer.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (k.outer*4)) + 1024)], 8)*(float32x8*)auto_scheduler_layout_transform[ramp((((floormod(i.outer.outer.j.outer.outer.fused, 32)*32768) + (j.outer.inner*8192)) + (k.outer*32)), 1, 8)]))
-                matmul[ramp(16, 1, 8)] = ((float32x8*)matmul[ramp(16, 1, 8)] + (broadcast((float32*)A_2[((((floordiv(i.outer.outer.j.outer.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (k.outer*4)) + 2048)], 8)*(float32x8*)auto_scheduler_layout_transform[ramp((((floormod(i.outer.outer.j.outer.outer.fused, 32)*32768) + (j.outer.inner*8192)) + (k.outer*32)), 1, 8)]))
-                matmul[ramp(24, 1, 8)] = ((float32x8*)matmul[ramp(24, 1, 8)] + (broadcast((float32*)A_2[((((floordiv(i.outer.outer.j.outer.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (k.outer*4)) + 3072)], 8)*(float32x8*)auto_scheduler_layout_transform[ramp((((floormod(i.outer.outer.j.outer.outer.fused, 32)*32768) + (j.outer.inner*8192)) + (k.outer*32)), 1, 8)]))
-                matmul[ramp(0, 1, 8)] = ((float32x8*)matmul[ramp(0, 1, 8)] + (broadcast((float32*)A_2[((((floordiv(i.outer.outer.j.outer.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (k.outer*4)) + 1)], 8)*(float32x8*)auto_scheduler_layout_transform[ramp(((((floormod(i.outer.outer.j.outer.outer.fused, 32)*32768) + (j.outer.inner*8192)) + (k.outer*32)) + 8), 1, 8)]))
-                matmul[ramp(8, 1, 8)] = ((float32x8*)matmul[ramp(8, 1, 8)] + (broadcast((float32*)A_2[((((floordiv(i.outer.outer.j.outer.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (k.outer*4)) + 1025)], 8)*(float32x8*)auto_scheduler_layout_transform[ramp(((((floormod(i.outer.outer.j.outer.outer.fused, 32)*32768) + (j.outer.inner*8192)) + (k.outer*32)) + 8), 1, 8)]))
-                matmul[ramp(16, 1, 8)] = ((float32x8*)matmul[ramp(16, 1, 8)] + (broadcast((float32*)A_2[((((floordiv(i.outer.outer.j.outer.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (k.outer*4)) + 2049)], 8)*(float32x8*)auto_scheduler_layout_transform[ramp(((((floormod(i.outer.outer.j.outer.outer.fused, 32)*32768) + (j.outer.inner*8192)) + (k.outer*32)) + 8), 1, 8)]))
-                matmul[ramp(24, 1, 8)] = ((float32x8*)matmul[ramp(24, 1, 8)] + (broadcast((float32*)A_2[((((floordiv(i.outer.outer.j.outer.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (k.outer*4)) + 3073)], 8)*(float32x8*)auto_scheduler_layout_transform[ramp(((((floormod(i.outer.outer.j.outer.outer.fused, 32)*32768) + (j.outer.inner*8192)) + (k.outer*32)) + 8), 1, 8)]))
-                matmul[ramp(0, 1, 8)] = ((float32x8*)matmul[ramp(0, 1, 8)] + (broadcast((float32*)A_2[((((floordiv(i.outer.outer.j.outer.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (k.outer*4)) + 2)], 8)*(float32x8*)auto_scheduler_layout_transform[ramp(((((floormod(i.outer.outer.j.outer.outer.fused, 32)*32768) + (j.outer.inner*8192)) + (k.outer*32)) + 16), 1, 8)]))
-                matmul[ramp(8, 1, 8)] = ((float32x8*)matmul[ramp(8, 1, 8)] + (broadcast((float32*)A_2[((((floordiv(i.outer.outer.j.outer.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (k.outer*4)) + 1026)], 8)*(float32x8*)auto_scheduler_layout_transform[ramp(((((floormod(i.outer.outer.j.outer.outer.fused, 32)*32768) + (j.outer.inner*8192)) + (k.outer*32)) + 16), 1, 8)]))
-                matmul[ramp(16, 1, 8)] = ((float32x8*)matmul[ramp(16, 1, 8)] + (broadcast((float32*)A_2[((((floordiv(i.outer.outer.j.outer.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (k.outer*4)) + 2050)], 8)*(float32x8*)auto_scheduler_layout_transform[ramp(((((floormod(i.outer.outer.j.outer.outer.fused, 32)*32768) + (j.outer.inner*8192)) + (k.outer*32)) + 16), 1, 8)]))
-                matmul[ramp(24, 1, 8)] = ((float32x8*)matmul[ramp(24, 1, 8)] + (broadcast((float32*)A_2[((((floordiv(i.outer.outer.j.outer.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (k.outer*4)) + 3074)], 8)*(float32x8*)auto_scheduler_layout_transform[ramp(((((floormod(i.outer.outer.j.outer.outer.fused, 32)*32768) + (j.outer.inner*8192)) + (k.outer*32)) + 16), 1, 8)]))
-                matmul[ramp(0, 1, 8)] = ((float32x8*)matmul[ramp(0, 1, 8)] + (broadcast((float32*)A_2[((((floordiv(i.outer.outer.j.outer.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (k.outer*4)) + 3)], 8)*(float32x8*)auto_scheduler_layout_transform[ramp(((((floormod(i.outer.outer.j.outer.outer.fused, 32)*32768) + (j.outer.inner*8192)) + (k.outer*32)) + 24), 1, 8)]))
-                matmul[ramp(8, 1, 8)] = ((float32x8*)matmul[ramp(8, 1, 8)] + (broadcast((float32*)A_2[((((floordiv(i.outer.outer.j.outer.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (k.outer*4)) + 1027)], 8)*(float32x8*)auto_scheduler_layout_transform[ramp(((((floormod(i.outer.outer.j.outer.outer.fused, 32)*32768) + (j.outer.inner*8192)) + (k.outer*32)) + 24), 1, 8)]))
-                matmul[ramp(16, 1, 8)] = ((float32x8*)matmul[ramp(16, 1, 8)] + (broadcast((float32*)A_2[((((floordiv(i.outer.outer.j.outer.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (k.outer*4)) + 2051)], 8)*(float32x8*)auto_scheduler_layout_transform[ramp(((((floormod(i.outer.outer.j.outer.outer.fused, 32)*32768) + (j.outer.inner*8192)) + (k.outer*32)) + 24), 1, 8)]))
-                matmul[ramp(24, 1, 8)] = ((float32x8*)matmul[ramp(24, 1, 8)] + (broadcast((float32*)A_2[((((floordiv(i.outer.outer.j.outer.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (k.outer*4)) + 3075)], 8)*(float32x8*)auto_scheduler_layout_transform[ramp(((((floormod(i.outer.outer.j.outer.outer.fused, 32)*32768) + (j.outer.inner*8192)) + (k.outer*32)) + 24), 1, 8)]))
-              }
-              for (i.inner: int32, 0, 4) {
-                out_2[ramp((((((floordiv(i.outer.outer.j.outer.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*1024)) + (floormod(i.outer.outer.j.outer.outer.fused, 32)*32)) + (j.outer.inner*8)), 1, 8)] = ((float32x8*)matmul[ramp((i.inner*8), 1, 8)] + (float32x8*)C_2[ramp((((((floordiv(i.outer.outer.j.outer.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*1024)) + (floormod(i.outer.outer.j.outer.outer.fused, 32)*32)) + (j.outer.inner*8)), 1, 8)])
+        for (i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused: int32, 0, 16) "parallel" {
+          for (i.outer.inner.init: int32, 0, 16) {
+            for (j.outer.inner.init: int32, 0, 128) {
+              matmul[ramp((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner.init*4096)) + (j.outer.inner.init*8)), 1, 8)] = broadcast(0f32, 8)
+              matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner.init*4096)) + (j.outer.inner.init*8)) + 1024), 1, 8)] = broadcast(0f32, 8)
+              matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner.init*4096)) + (j.outer.inner.init*8)) + 2048), 1, 8)] = broadcast(0f32, 8)
+              matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner.init*4096)) + (j.outer.inner.init*8)) + 3072), 1, 8)] = broadcast(0f32, 8)
+            }
+          }
+          for (k.outer: int32, 0, 128) {
+            for (i.outer.inner: int32, 0, 16) {
+              for (j.outer.inner: int32, 0, 128) {
+                matmul[ramp((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)), 1, 8)] = ((float32x8*)matmul[ramp((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)), 1, 8)] + (broadcast((float32*)A_2[(((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))], 8)*(float32x8*)au [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 1024), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 1024), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 2048), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 2048), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 3072), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 3072), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)), 1, 8)] = ((float32x8*)matmul[ramp((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8)) + 1)], 8)*(float32 [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 1024), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 1024), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 2048), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 2048), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 3072), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 3072), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)), 1, 8)] = ((float32x8*)matmul[ramp((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8)) + 2)], 8)*(float32 [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 1024), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 1024), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 2048), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 2048), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 3072), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 3072), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)), 1, 8)] = ((float32x8*)matmul[ramp((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8)) + 3)], 8)*(float32 [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 1024), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 1024), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 2048), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 2048), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 3072), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 3072), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)), 1, 8)] = ((float32x8*)matmul[ramp((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8)) + 4)], 8)*(float32 [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 1024), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 1024), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 2048), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 2048), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 3072), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 3072), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)), 1, 8)] = ((float32x8*)matmul[ramp((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8)) + 5)], 8)*(float32 [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 1024), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 1024), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 2048), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 2048), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 3072), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 3072), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)), 1, 8)] = ((float32x8*)matmul[ramp((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8)) + 6)], 8)*(float32 [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 1024), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 1024), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 2048), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 2048), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 3072), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 3072), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)), 1, 8)] = ((float32x8*)matmul[ramp((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8)) + 7)], 8)*(float32 [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 1024), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 1024), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 2048), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 2048), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
+                matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 3072), 1, 8)] = ((float32x8*)matmul[ramp(((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (j.outer.inner*8)) + 3072), 1, 8)] + (broadcast((float32*)A_2[((((i.outer.outer.outer.j.outer.outer.outer.fused.i.outer.outer.inner.fused*65536) + (i.outer.inner*4096)) + (k.outer*8))  [...]
               }
             }
           }
         }
+        for (i: int32, 0, 1024) "parallel" {
+          for (j: int32, 0, 1024) {
+            out_2[((i*1024) + j)] = ((float32*)matmul[((i*1024) + j)] + (float32*)C_2[((i*1024) + j)])
+          }
+        }
       }
     }
 
@@ -311,7 +334,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 22.551 ms
+    Execution time of this operator: 44.151 ms
 
 
 
@@ -346,25 +369,19 @@ the behavior of the auto-scheduler.
     matmul_i, matmul_j, matmul_k = tuple(matmul.op.axis) + tuple(matmul.op.reduce_axis)
     out_i, out_j = tuple(out.op.axis) + tuple(out.op.reduce_axis)
     matmul_i_o_i, matmul_i_i = s[matmul].split(matmul_i, factor=4)
-    matmul_i_o_o_i, matmul_i_o_i = s[matmul].split(matmul_i_o_i, factor=1)
-    matmul_i_o_o_o, matmul_i_o_o_i = s[matmul].split(matmul_i_o_o_i, factor=2)
+    matmul_i_o_o_i, matmul_i_o_i = s[matmul].split(matmul_i_o_i, factor=16)
+    matmul_i_o_o_o, matmul_i_o_o_i = s[matmul].split(matmul_i_o_o_i, factor=16)
     matmul_j_o_i, matmul_j_i = s[matmul].split(matmul_j, factor=8)
-    matmul_j_o_o_i, matmul_j_o_i = s[matmul].split(matmul_j_o_i, factor=1)
-    matmul_j_o_o_o, matmul_j_o_o_i = s[matmul].split(matmul_j_o_o_i, factor=4)
-    matmul_k_o, matmul_k_i = s[matmul].split(matmul_k, factor=4)
+    matmul_j_o_o_i, matmul_j_o_i = s[matmul].split(matmul_j_o_i, factor=128)
+    matmul_j_o_o_o, matmul_j_o_o_i = s[matmul].split(matmul_j_o_o_i, factor=1)
+    matmul_k_o, matmul_k_i = s[matmul].split(matmul_k, factor=8)
     s[matmul].reorder(matmul_i_o_o_o, matmul_j_o_o_o, matmul_i_o_o_i, matmul_j_o_o_i, matmul_k_o, matmul_i_o_i, matmul_j_o_i, matmul_k_i, matmul_i_i, matmul_j_i)
-    out_i_o_i, out_i_i = s[out].split(out_i, factor=4)
-    out_i_o_o, out_i_o_i = s[out].split(out_i_o_i, factor=2)
-    out_j_o_i, out_j_i = s[out].split(out_j, factor=8)
-    out_j_o_o, out_j_o_i = s[out].split(out_j_o_i, factor=4)
-    s[out].reorder(out_i_o_o, out_j_o_o, out_i_o_i, out_j_o_i, out_i_i, out_j_i)
-    s[matmul].compute_at(s[out], out_j_o_i)
-    out_i_o_o_j_o_o_fused = s[out].fuse(out_i_o_o, out_j_o_o)
-    s[out].parallel(out_i_o_o_j_o_o_fused)
-    s[matmul].pragma(matmul_i_o_o_o, "auto_unroll_max_step", 512)
-    s[matmul].pragma(matmul_i_o_o_o, "unroll_explicit", True)
+    matmul_i_o_o_o_j_o_o_o_fused_i_o_o_i_fused = s[matmul].fuse(matmul_i_o_o_o, matmul_j_o_o_o, matmul_i_o_o_i)
+    s[matmul].parallel(matmul_i_o_o_o_j_o_o_o_fused_i_o_o_i_fused)
+    s[out].parallel(out_i)
+    s[matmul].pragma(matmul_i_o_o_o_j_o_o_o_fused_i_o_o_i_fused, "auto_unroll_max_step", 64)
+    s[matmul].pragma(matmul_i_o_o_o_j_o_o_o_fused_i_o_o_i_fused, "unroll_explicit", True)
     s[matmul].vectorize(matmul_j_i)
-    s[out].vectorize(out_j_i)
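
    For reference, a schedule trace like the one above is what TVM's auto-scheduler prints back
    from a tuning log. A minimal sketch of reproducing it, assuming the standard matmul-plus-bias
    workload from the auto-scheduler matmul tutorial and a previously written log file
    (`matmul.json` is an assumed name, not taken from this build log):

        import tvm
        from tvm import auto_scheduler, te

        @auto_scheduler.register_workload
        def matmul_add(N, L, M, dtype):
            # 1024x1024 matmul followed by an elementwise add, matching the loops above
            A = te.placeholder((N, L), name="A", dtype=dtype)
            B = te.placeholder((L, M), name="B", dtype=dtype)
            C = te.placeholder((N, M), name="C", dtype=dtype)
            k = te.reduce_axis((0, L), name="k")
            matmul = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="matmul")
            out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out")
            return [A, B, C, out]

        log_file = "matmul.json"  # assumed tuning log from an earlier search
        task = auto_scheduler.SearchTask(
            func=matmul_add, args=(1024, 1024, 1024, "float32"), target=tvm.target.Target("llvm")
        )
        task.print_best(log_file, print_mode="schedule")  # prints a python schedule like the one above
        sch, args = task.apply_best(log_file)             # rebuild the te.Schedule from the best record
        print(tvm.lower(sch, args, simple_mode=True))

    The "Execution time" figure above then comes from timing a module built from that recovered schedule.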
 
 
 
diff --git a/docs/_sources/tutorials/index.rst.txt b/docs/_sources/tutorials/index.rst.txt
index 7da2c06..d06ea7a 100644
--- a/docs/_sources/tutorials/index.rst.txt
+++ b/docs/_sources/tutorials/index.rst.txt
@@ -837,26 +837,6 @@ Optimize Tensor Operators
    :hidden:
 
    /tutorials/optimize/opt_conv_tensorcore
-
-.. raw:: html
-
-    <div class="sphx-glr-thumbcontainer" tooltip="In this tutorial, we will demonstrate how to write a high performance matmul schedule on Volta/...">
-
-.. only:: html
-
-    .. figure:: /tutorials/optimize/images/thumb/sphx_glr_opt_matmul_auto_tensorcore_thumb.png
-
-        :ref:`sphx_glr_tutorials_optimize_opt_matmul_auto_tensorcore.py`
-
-.. raw:: html
-
-    </div>
-
-
-.. toctree::
-   :hidden:
-
-   /tutorials/optimize/opt_matmul_auto_tensorcore
 .. raw:: html
 
     <div style='clear:both'></div>
diff --git a/docs/_sources/tutorials/language/intrin_math.rst.txt b/docs/_sources/tutorials/language/intrin_math.rst.txt
index b21915b..07fadde 100644
--- a/docs/_sources/tutorials/language/intrin_math.rst.txt
+++ b/docs/_sources/tutorials/language/intrin_math.rst.txt
@@ -180,6 +180,7 @@ The same te.exp can also be used for float64 data types.
 
  .. code-block:: none
 
+    // Function: myexp_kernel0
     __kernel void myexp_kernel0(__global float* restrict B, __global float* restrict A, int n, int stride, int stride1) {
       if (((int)get_group_id(0)) < (n >> 6)) {
         B[((((((int)get_group_id(0)) * 64) + ((int)get_local_id(0))) * stride1))] = exp(A[((((((int)get_group_id(0)) * 64) + ((int)get_local_id(0))) * stride))]);
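
    The kernel shown here is the OpenCL output of a simple te.exp computation. A minimal sketch of
    regenerating it, assuming an OpenCL-enabled TVM build (variable names are illustrative):

        import tvm
        from tvm import te

        n = te.var("n")
        A = te.placeholder((n,), name="A")
        B = te.compute(A.shape, lambda i: te.exp(A[i]), name="B")

        s = te.create_schedule(B.op)
        bx, tx = s[B].split(B.op.axis[0], factor=64)       # 64 threads per work-group, as above
        s[B].bind(bx, te.thread_axis("blockIdx.x"))
        s[B].bind(tx, te.thread_axis("threadIdx.x"))
        fopencl = tvm.build(s, [A, B], "opencl", name="myexp")
        print(fopencl.imported_modules[0].get_source())     # prints the __kernel shown above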
diff --git a/docs/_sources/tutorials/language/schedule_primitives.rst.txt b/docs/_sources/tutorials/language/schedule_primitives.rst.txt
index a549ff4..cb411e8 100644
--- a/docs/_sources/tutorials/language/schedule_primitives.rst.txt
+++ b/docs/_sources/tutorials/language/schedule_primitives.rst.txt
@@ -533,15 +533,15 @@ compute_root
 
     primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {C: Buffer(C_2: Pointer(float32), float32, [m: int32], [stride: int32], type="auto"),
-                 B: Buffer(B_2: Pointer(float32), float32, [m], [stride_1: int32], type="auto"),
+      buffers = {B: Buffer(B_2: Pointer(float32), float32, [m: int32], [stride: int32], type="auto"),
+                 C: Buffer(C_2: Pointer(float32), float32, [m], [stride_1: int32], type="auto"),
                  A: Buffer(A_2: Pointer(float32), float32, [m], [stride_2: int32], type="auto")}
       buffer_map = {A_1: A, B_1: B, C_1: C} {
       for (i: int32, 0, m) {
-        B_2[(i*stride_1)] = ((float32*)A_2[(i*stride_2)] + 1f32)
+        B_2[(i*stride)] = ((float32*)A_2[(i*stride_2)] + 1f32)
       }
       for (i_1: int32, 0, m) {
-        C_2[(i_1*stride)] = ((float32*)B_2[(i_1*stride_1)]*2f32)
+        C_2[(i_1*stride_1)] = ((float32*)B_2[(i_1*stride)]*2f32)
       }
     }
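
    The two separate lowered loops above come from the tutorial's compute_root example; a minimal
    sketch of the schedule that produces them (only the buffer/stride naming differs between the
    two builds in this diff):

        import tvm
        from tvm import te

        m = te.var("m")
        A = te.placeholder((m,), name="A")
        B = te.compute((m,), lambda i: A[i] + 1, name="B")
        C = te.compute((m,), lambda i: B[i] * 2, name="C")

        s = te.create_schedule(C.op)
        s[B].compute_at(s[C], C.op.axis[0])
        s[B].compute_root()                   # move B back to the root: two separate loops
        print(tvm.lower(s, [A, B, C], simple_mode=True))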
 
diff --git a/docs/_sources/tutorials/language/sg_execution_times.rst.txt b/docs/_sources/tutorials/language/sg_execution_times.rst.txt
index c1b191f..182c276 100644
--- a/docs/_sources/tutorials/language/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorials/language/sg_execution_times.rst.txt
@@ -5,13 +5,13 @@
 
 Computation times
 =================
-**00:04.536** total execution time for **tutorials_language** files:
+**00:04.179** total execution time for **tutorials_language** files:
 
-- **00:01.548**: :ref:`sphx_glr_tutorials_language_intrin_math.py` (``intrin_math.py``)
-- **00:00.781**: :ref:`sphx_glr_tutorials_language_tensorize.py` (``tensorize.py``)
-- **00:00.599**: :ref:`sphx_glr_tutorials_language_scan.py` (``scan.py``)
-- **00:00.565**: :ref:`sphx_glr_tutorials_language_reduction.py` (``reduction.py``)
-- **00:00.330**: :ref:`sphx_glr_tutorials_language_extern_op.py` (``extern_op.py``)
-- **00:00.255**: :ref:`sphx_glr_tutorials_language_schedule_primitives.py` (``schedule_primitives.py``)
-- **00:00.240**: :ref:`sphx_glr_tutorials_language_tuple_inputs.py` (``tuple_inputs.py``)
-- **00:00.218**: :ref:`sphx_glr_tutorials_language_tedd.py` (``tedd.py``)
+- **00:01.396**: :ref:`sphx_glr_tutorials_language_intrin_math.py` (``intrin_math.py``)
+- **00:00.737**: :ref:`sphx_glr_tutorials_language_tensorize.py` (``tensorize.py``)
+- **00:00.545**: :ref:`sphx_glr_tutorials_language_scan.py` (``scan.py``)
+- **00:00.504**: :ref:`sphx_glr_tutorials_language_reduction.py` (``reduction.py``)
+- **00:00.354**: :ref:`sphx_glr_tutorials_language_extern_op.py` (``extern_op.py``)
+- **00:00.234**: :ref:`sphx_glr_tutorials_language_schedule_primitives.py` (``schedule_primitives.py``)
+- **00:00.224**: :ref:`sphx_glr_tutorials_language_tuple_inputs.py` (``tuple_inputs.py``)
+- **00:00.184**: :ref:`sphx_glr_tutorials_language_tedd.py` (``tedd.py``)
diff --git a/docs/_sources/tutorials/language/tensorize.rst.txt b/docs/_sources/tutorials/language/tensorize.rst.txt
index 49d2d34..8930f8c 100644
--- a/docs/_sources/tutorials/language/tensorize.rst.txt
+++ b/docs/_sources/tutorials/language/tensorize.rst.txt
@@ -120,8 +120,8 @@ Thus we break down the matmul loops to make the innermost loops a (16x64) GEMV.
 
     primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {B: Buffer(B_2: Pointer(float32), float32, [512, 64], []),
-                 C: Buffer(C_2: Pointer(float32), float32, [1024, 512], []),
+      buffers = {C: Buffer(C_2: Pointer(float32), float32, [1024, 512], []),
+                 B: Buffer(B_2: Pointer(float32), float32, [512, 64], []),
                  A: Buffer(A_2: Pointer(float32), float32, [1024, 64], [])}
       buffer_map = {A_1: A, B_1: B, C_1: C} {
       for (i: int32, 0, 1024) {
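
    For context, the buffers above belong to the tensorize tutorial's matmul, whose loops are
    split so the innermost loops form a (16x64) GEMV that a hand-written intrinsic can later
    replace. A minimal sketch of that split (shapes follow the buffers shown; names are
    illustrative):

        import tvm
        from tvm import te

        N, M, L = 1024, 512, 64
        A = te.placeholder((N, L), name="A")
        B = te.placeholder((M, L), name="B")
        k = te.reduce_axis((0, L), name="k")
        C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[j, k], axis=k), name="C")

        s = te.create_schedule(C.op)
        x, y = C.op.axis
        (z,) = C.op.reduce_axis
        yo, yi = s[C].split(y, factor=16)   # inner (yi, z) loops become a 16x64 GEMV
        s[C].reorder(x, yo, yi, z)
        print(tvm.lower(s, [A, B, C], simple_mode=True))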
@@ -313,8 +313,8 @@ The importing needs to happen before the tensorized GEMV being executed.
                  B: Buffer(B_2: Pointer(float32), float32, [512, 64], []),
                  A: Buffer(A_2: Pointer(float32), float32, [1024, 64], [])}
       buffer_map = {A_1: A, B_1: B, C_1: C} {
-      attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmps6h100br/input0.cc'
-    source_filename = "/tmp/tmps6h100br/input0.cc"
+      attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmp5fwbjad6/input0.cc'
+    source_filename = "/tmp/tmp5fwbjad6/input0.cc"
     target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
     target triple = "x86_64-pc-linux-gnu"
 
diff --git a/docs/_sources/tutorials/language/tuple_inputs.rst.txt b/docs/_sources/tutorials/language/tuple_inputs.rst.txt
index d6dc7bc..2865cb3 100644
--- a/docs/_sources/tutorials/language/tuple_inputs.rst.txt
+++ b/docs/_sources/tutorials/language/tuple_inputs.rst.txt
@@ -64,15 +64,15 @@ together in the next schedule procedure.
 
     primfn(A0_1: handle, A1_1: handle, B.v0_1: handle, B.v1_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {B.v0: Buffer(B.v0_2: Pointer(float32), float32, [m: int32, n: int32], [stride: int32, stride_1: int32], type="auto"),
-                 B.v1: Buffer(B.v1_2: Pointer(float32), float32, [m, n], [stride_2: int32, stride_3: int32], type="auto"),
+      buffers = {B.v1: Buffer(B.v1_2: Pointer(float32), float32, [m: int32, n: int32], [stride: int32, stride_1: int32], type="auto"),
+                 B.v0: Buffer(B.v0_2: Pointer(float32), float32, [m, n], [stride_2: int32, stride_3: int32], type="auto"),
                  A1: Buffer(A1_2: Pointer(float32), float32, [m, n], [stride_4: int32, stride_5: int32], type="auto"),
                  A0: Buffer(A0_2: Pointer(float32), float32, [m, n], [stride_6: int32, stride_7: int32], type="auto")}
       buffer_map = {A0_1: A0, A1_1: A1, B.v0_1: B.v0, B.v1_1: B.v1} {
       for (i: int32, 0, m) {
         for (j: int32, 0, n) {
-          B.v0_2[((i*stride) + (j*stride_1))] = ((float32*)A0_2[((i*stride_6) + (j*stride_7))] + 2f32)
-          B.v1_2[((i*stride_2) + (j*stride_3))] = ((float32*)A1_2[((i*stride_4) + (j*stride_5))]*3f32)
+          B.v0_2[((i*stride_2) + (j*stride_3))] = ((float32*)A0_2[((i*stride_6) + (j*stride_7))] + 2f32)
+          B.v1_2[((i*stride) + (j*stride_1))] = ((float32*)A1_2[((i*stride_4) + (j*stride_5))]*3f32)
         }
       }
     }
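
    The B.v0/B.v1 buffers above come from a single te.compute that returns a tuple; a minimal
    sketch of that computation (the diff only swaps which output binds to which stride names):

        import tvm
        from tvm import te

        m, n = te.var("m"), te.var("n")
        A0 = te.placeholder((m, n), name="A0")
        A1 = te.placeholder((m, n), name="A1")
        # one compute op with two outputs, as in the tuple_inputs tutorial
        B0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] + 2, A1[i, j] * 3), name="B")

        s = te.create_schedule(B0.op)
        print(tvm.lower(s, [A0, A1, B0, B1], simple_mode=True))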
@@ -193,8 +193,8 @@ in terms of operation.
 
     primfn(A0_1: handle, A1_1: handle, C_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {C: Buffer(C_2: Pointer(float32), float32, [m: int32, n: int32], [stride: int32, stride_1: int32], type="auto"),
-                 A1: Buffer(A1_2: Pointer(float32), float32, [m, n], [stride_2: int32, stride_3: int32], type="auto"),
+      buffers = {A1: Buffer(A1_2: Pointer(float32), float32, [m: int32, n: int32], [stride: int32, stride_1: int32], type="auto"),
+                 C: Buffer(C_2: Pointer(float32), float32, [m, n], [stride_2: int32, stride_3: int32], type="auto"),
                  A0: Buffer(A0_2: Pointer(float32), float32, [m, n], [stride_4: int32, stride_5: int32], type="auto")}
       buffer_map = {A0_1: A0, A1_1: A1, C_1: C} {
       attr [B.v0: Pointer(float32)] "storage_scope" = "global";
@@ -207,7 +207,7 @@ in terms of operation.
           B.v1[j] = ((float32*)A0_2[((i*stride_4) + (j*stride_5))]*3f32)
         }
         for (j_1: int32, 0, n) {
-          C_2[((i*stride) + (j_1*stride_1))] = ((float32*)A1_2[((i*stride_2) + (j_1*stride_3))] + (float32*)B.v0[j_1])
+          C_2[((i*stride_2) + (j_1*stride_3))] = ((float32*)A1_2[((i*stride) + (j_1*stride_1))] + (float32*)B.v0[j_1])
         }
       }
     }
diff --git a/docs/_sources/tutorials/micro/sg_execution_times.rst.txt b/docs/_sources/tutorials/micro/sg_execution_times.rst.txt
index 01cbced..e0036d0 100644
--- a/docs/_sources/tutorials/micro/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorials/micro/sg_execution_times.rst.txt
@@ -5,7 +5,7 @@
 
 Computation times
 =================
-**00:06.237** total execution time for **tutorials_micro** files:
+**00:05.521** total execution time for **tutorials_micro** files:
 
-- **00:06.013**: :ref:`sphx_glr_tutorials_micro_micro_tflite.py` (``micro_tflite.py``)
-- **00:00.225**: :ref:`sphx_glr_tutorials_micro_micro_reference_vm.py` (``micro_reference_vm.py``)
+- **00:05.317**: :ref:`sphx_glr_tutorials_micro_micro_tflite.py` (``micro_tflite.py``)
+- **00:00.204**: :ref:`sphx_glr_tutorials_micro_micro_reference_vm.py` (``micro_reference_vm.py``)
diff --git a/docs/_sources/tutorials/optimize/opt_conv_cuda.rst.txt b/docs/_sources/tutorials/optimize/opt_conv_cuda.rst.txt
index 4750ae7..08c0e4e 100644
--- a/docs/_sources/tutorials/optimize/opt_conv_cuda.rst.txt
+++ b/docs/_sources/tutorials/optimize/opt_conv_cuda.rst.txt
@@ -296,7 +296,7 @@ latency of convolution.
 
  .. code-block:: none
 
-    Convolution: 53.383167 ms
+    Convolution: 48.012702 ms
 
 
 
diff --git a/docs/_sources/tutorials/optimize/opt_conv_tensorcore.rst.txt b/docs/_sources/tutorials/optimize/opt_conv_tensorcore.rst.txt
index b8faee1..8ac6c47 100644
--- a/docs/_sources/tutorials/optimize/opt_conv_tensorcore.rst.txt
+++ b/docs/_sources/tutorials/optimize/opt_conv_tensorcore.rst.txt
@@ -624,7 +624,7 @@ be able to run on our build server
 
  .. code-block:: none
 
-    conv2d with tensor core: 7.025254 ms
+    conv2d with tensor core: 7.240717 ms
 
 
 
diff --git a/docs/_sources/tutorials/optimize/opt_gemm.rst.txt b/docs/_sources/tutorials/optimize/opt_gemm.rst.txt
index 15967bb..d3ab8f2 100644
--- a/docs/_sources/tutorials/optimize/opt_gemm.rst.txt
+++ b/docs/_sources/tutorials/optimize/opt_gemm.rst.txt
@@ -118,8 +118,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
 
  .. code-block:: none
 
-    Numpy running time: 0.008044
-    Baseline: 3.548576
+    Numpy running time: 0.013055
+    Baseline: 3.513028
 
 
 
@@ -206,7 +206,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
 
  .. code-block:: none
 
-    Opt1: 0.306843
+    Opt1: 0.289864
 
 
 
@@ -300,7 +300,7 @@ In this tutorial, we chose to vectorize the inner loop row data since it is cach
 
  .. code-block:: none
 
-    Opt2: 0.320415
+    Opt2: 0.318925
 
 
 
@@ -389,7 +389,7 @@ the access pattern for A matrix is more cache friendly.
 
  .. code-block:: none
 
-    Opt3: 0.120079
+    Opt3: 0.110932
 
 
 
@@ -499,7 +499,7 @@ the corresponding value from the packed array.
 
  .. code-block:: none
 
-    Opt4: 0.105430
+    Opt4: 0.217146
 
 
 
@@ -609,7 +609,7 @@ write to C when all the block results are ready.
 
  .. code-block:: none
 
-    Opt5: 0.113378
+    Opt5: 0.192204
 
 
 
@@ -725,7 +725,7 @@ Furthermore, we can also utilize multi-core processors to do the thread-level par
 
  .. code-block:: none
 
-    Opt6: 0.035298
+    Opt6: 0.058951
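
    For context, the Opt1-Opt6 numbers in this file come from the opt_gemm tutorial's successive
    schedule rewrites (blocking, vectorization, loop permutation, packing, a write cache, and
    parallelization). A minimal sketch of the first step, blocking with a 32x32 tile so each
    block's working set (32*32*4 bytes = 4 KB) fits well inside a 32 KB L1 cache; the split
    factors below are the tutorial's defaults, not values read from this log:

        import tvm
        from tvm import te

        M = N = K = 1024
        bn = 32
        A = te.placeholder((M, K), name="A")
        B = te.placeholder((K, N), name="B")
        k = te.reduce_axis((0, K), name="k")
        C = te.compute((M, N), lambda x, y: te.sum(A[x, k] * B[k, y], axis=k), name="C")

        s = te.create_schedule(C.op)
        xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
        (kaxis,) = s[C].op.reduce_axis
        ko, ki = s[C].split(kaxis, factor=4)
        s[C].reorder(xo, yo, ko, ki, xi, yi)   # hoist the blocked reduction outside the tile
        func = tvm.build(s, [A, B, C], target="llvm", name="mmult")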
 
 
 
diff --git a/docs/_sources/tutorials/optimize/opt_matmul_auto_tensorcore.rst.txt b/docs/_sources/tutorials/optimize/opt_matmul_auto_tensorcore.rst.txt
deleted file mode 100644
index 330386a..0000000
--- a/docs/_sources/tutorials/optimize/opt_matmul_auto_tensorcore.rst.txt
+++ /dev/null
@@ -1,585 +0,0 @@
-.. note::
-    :class: sphx-glr-download-link-note
-
-    Click :ref:`here <sphx_glr_download_tutorials_optimize_opt_matmul_auto_tensorcore.py>` to download the full example code
-.. rst-class:: sphx-glr-example-title
-
-.. _sphx_glr_tutorials_optimize_opt_matmul_auto_tensorcore.py:
-
-
-.. _opt-matmul-auto-tensorcore:
-
-How to optimize matmul with Auto TensorCore CodeGen
-===================================================
-**Author**: `Minmin Sun <https://github.com/minminsun>`_,             `Lanbo Li <https://github.com/Orion34C>`_,             `Chenfan Jia <https://github.com/jcf94>`_,             `Jun Yang <https://github.com/yangjunpro>`_
-
-In this tutorial, we will demonstrate how to write a high performance matmul
-schedule on Volta/Turing GPUs with TVM Auto TensorCore CodeGen.
-This is a transparent solution to generate tensorcore kernel
-with most transformations done in ir passes.
-Users can also write schedule with tensorization to generate TensorCore code.
-Both solutions use the same tensorcore intrinsics.
-Please refer to :ref:`opt-conv-tensorcore` tutorial for more details.
-
-Preparation and Algorithm
--------------------------
-2 kinds of input data types are supported: float16 and int8.
-For float16, the accumulator is float32.
-For int8, the accumulator is int32.
-For data layouts, 'N' means non-transpose while 'T' means transpose.
-
-
-.. code-block:: default
-
-
-    import logging
-    import sys
-
-    import numpy as np
-    import tvm
-    from tvm import te
-
-    from tvm import autotvm
-    from tvm.contrib import nvcc
-    import tvm.testing
-
-
-    def matmul_nn(A, B, L, dtype="float16", layout="NN"):
-        k = te.reduce_axis((0, L), name="k")
-        if dtype == "float16":
-            out_type = "float"
-        elif dtype == "int8":
-            out_type = "int"
-        elif dtype == "int4" or dtype == "int1":
-            out_type = "int"
-        if layout == "NN":
-            return te.compute(
-                (N, M), lambda i, j: te.sum(A[i, k].astype(out_type) * B[k, j].astype(out_type), axis=k)
-            )
-        if layout == "NT":
-            return te.compute(
-                (N, M), lambda i, j: te.sum(A[k, i].astype(out_type) * B[k, j].astype(out_type), axis=k)
-            )
-        if layout == "TN":
-            return te.compute(
-                (N, M), lambda i, j: te.sum(A[i, k].astype(out_type) * B[j, k].astype(out_type), axis=k)
-            )
-        if layout == "TT":
-            return te.compute(
-                (N, M), lambda i, j: te.sum(A[k, i].astype(out_type) * B[j, k].astype(out_type), axis=k)
-            )
-
-
-
-
-
-
-
-
-Scheduling the Computation
---------------------------
-This schedule is no different than a non-tensorcore matmul schedule on GPU.
-Please refer to :ref:`opt-gemm` tutorial for basics of optimizing matmul schedule.
-When the "tensor_core" pragma is set, the "rewrite for tensorcore" ir pass
-will automatically transform the schedule for tensorcore codegen,
-otherwise normal CUDA code, with lower performance but equal functionality, will be generated.
-
-.. note::
-
-  *Requirements of TensorCore*
-
-  Note that in the following 2 cases, even though the "tensor_core" pragma is set, TVM will still fall back to normal CUDA codegen:
-  (1) The m, n or k of input matrices is not multiple of 16;
-  (2) The warp tile size is not 16x16x16 on CUDA9, or not one of {16x16x16, 32x8x16, 8x32x16} on CUDA version >= 10.0.
-
-In this schedule, storage_align is used to reduce bank conflicts of shared memory. Please refer to this
-`doc <https://tvm.apache.org/docs/api/python/te.html#tvm.te.Stage.storage_align>`_
-for the usage of storage_align primitive. In short, we need to add an offset to some shared memory buffer
-to reduce bank conflicts.
-According to the `wmma doc <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#wmma-description>`_,
-the stride of load_matrix_sync must be a multiple of 16 bytes,
-so we choose 8 as offset for float16 and 16 as offset for int8.
-
-We use AutoTVM to search for best configurations in this schedule.
-
-
-.. code-block:: default
-
-
-
-    @autotvm.template("tutorial/auto_tensorcore/test_gemm")
-    def test_gemm(N, L, M, dtype, layout):
-        if layout == "NN":
-            shape_a = (N, L)
-            shape_b = (L, M)
-        elif layout == "NT":
-            shape_a = (L, N)
-            shape_b = (L, M)
-        elif layout == "TN":
-            shape_a = (N, L)
-            shape_b = (M, L)
-        elif layout == "TT":
-            shape_a = (L, N)
-            shape_b = (M, L)
-        else:
-            print("Unsupported layout:", layout)
-            sys.exit(1)
-        A = te.placeholder(shape_a, name="A", dtype=dtype)
-        B = te.placeholder(shape_b, name="B", dtype=dtype)
-        C = matmul_nn(A, B, L, dtype, layout)
-
-        s = te.create_schedule(C.op)
-        y, x = s[C].op.axis
-        k = s[C].op.reduce_axis[0]
-
-        # storage_align params
-        factor = 16
-        offset = 8
-        if dtype == "int8":
-            factor = 32
-            offset = 16
-        elif dtype == "int4":
-            factor = 64
-            offset = 32
-        elif dtype == "int1":
-            factor = 256
-            offset = 128
-
-        # create cache stages
-        AA = s.cache_read(A, "shared", [C])
-        if layout == "NN" or layout == "TN":
-            s[AA].storage_align(AA.op.axis[0], factor, offset)
-        AL = s.cache_read(AA, "local", [C])
-        BB = s.cache_read(B, "shared", [C])
-        if layout == "TT" or layout == "NT":
-            s[BB].storage_align(BB.op.axis[0], factor, offset)
-        BL = s.cache_read(BB, "local", [C])
-        CL = s.cache_write(C, "local")
-
-        # autotvm search space definition
-        cfg = autotvm.get_config()
-
-        cfg.define_knob("bx", [2, 4, 8])
-        cfg.define_knob("by", [8, 16, 32, 64])
-        cfg.define_knob("step_k", [1, 2, 4, 8, 16, 32])
-        cfg.define_knob("v", [4, 8, 16, 32])
-        by = cfg["by"].val
-        bx = cfg["bx"].val
-        step_k = cfg["step_k"].val
-        v = cfg["v"].val
-
-        # thread tile
-        TX = 8
-        TY = 1
-        if dtype == "int4" or dtype == "int1":
-            TX = 2
-        # warp tile
-        warp_tile_m = 16  # it could also be 8 or 32 on CUDA version >= 10.0
-        warp_tile_k = 16  # it must be 16 for fp16/int8 data type
-        if dtype == "int4":
-            warp_tile_m = 8
-            warp_tile_k = 32
-        elif dtype == "int1":
-            warp_tile_m = 8
-            warp_tile_k = 128
-        # block tile
-        tile_x = bx * TX
-        tile_y = by * TY
-
-        yo, ty = s[C].split(y, tile_y)
-        ty, yi = s[C].split(ty, TY)
-
-        # schedule for C stage
-        xo, xi = s[C].split(x, tile_x)
-        WX = min(warp_tile_m, tile_x)
-        tz, xi = s[C].split(xi, WX)
-        tx, xi = s[C].split(xi, TX)
-        s[C].reorder(yo, xo, tz, ty, tx, yi, xi)
-        s[C].bind(yo, te.thread_axis("blockIdx.y"))
-        s[C].bind(xo, te.thread_axis("blockIdx.x"))
-        s[C].bind(ty, te.thread_axis("threadIdx.y"))
-        s[C].bind(tz, te.thread_axis("threadIdx.z"))
-        s[C].bind(tx, te.thread_axis("threadIdx.x"))
-
-        # schedule for CL stage
-        ko, ki = s[CL].split(k, step_k * warp_tile_k)
-        kl, ki = s[CL].split(ki, warp_tile_k)
-        s[CL].compute_at(s[C], tx)
-        yo, xo = CL.op.axis
-        s[CL].reorder(ko, kl, ki, yo, xo)
-
-        # schedule for AA stage
-        s[AA].compute_at(s[CL], ko)
-        xo, xi = s[AA].split(s[AA].op.axis[1], factor=bx * v)
-        tz, tx = s[AA].split(xi, factor=(WX // TX) * v)
-        tx, vec = s[AA].split(tx, factor=v)
-        fused = s[AA].fuse(s[AA].op.axis[0], xo)
-        _, ty = s[AA].split(fused, factor=by)
-        s[AA].bind(ty, te.thread_axis("threadIdx.y"))
-        s[AA].bind(tz, te.thread_axis("threadIdx.z"))
-        s[AA].bind(tx, te.thread_axis("threadIdx.x"))
-        # vectorization is very important for float16/int8 inputs
-        s[AA].vectorize(vec)
-
-        # schedule for BB stage
-        s[BB].compute_at(s[CL], ko)
-        xo, xi = s[BB].split(s[BB].op.axis[1], factor=bx * v)
-        tz, tx = s[BB].split(xi, factor=(WX // TX) * v)
-        tx, vec = s[BB].split(tx, factor=v)
-        fused = s[BB].fuse(s[BB].op.axis[0], xo)
-        _, ty = s[BB].split(fused, factor=by)
-        s[BB].bind(ty, te.thread_axis("threadIdx.y"))
-        s[BB].bind(tz, te.thread_axis("threadIdx.z"))
-        s[BB].bind(tx, te.thread_axis("threadIdx.x"))
-        s[BB].vectorize(vec)
-
-        s[AL].compute_at(s[CL], kl)
-        s[BL].compute_at(s[CL], kl)
-
-        # set the 'tensor_core' pragma for tensorcore codegen
-        s[CL].pragma(ko, "tensor_core")
-
-        return s, [A, B, C]
-
-
-
-
-
-
-
-
-AutoTune and Test
------------------
-Finally we use a tuner to tune the schedule, generate code with best config
-and run the kernel to compare with numpy to check whether the results are correct.
-
-
-.. code-block:: default
-
-
-    # check whether the gpu has tensorcore
-    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-        raise Exception("skip building this tutorial because cuda is not enabled..")
-
-    dev = tvm.gpu()
-    if not nvcc.have_tensorcore(dev.compute_version):
-        raise Exception("the gpu has no tensorcore, skipping...")
-
-    M, N, L = 512, 32, 512
-    dtype = "float16"
-    layout = "NN"
-    if len(sys.argv) >= 4:
-        M, N, L = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3])
-    if len(sys.argv) >= 5:
-        dtype = sys.argv[4]
-    if len(sys.argv) >= 6:
-        layout = sys.argv[5]
-
-    # check whether the current gpu arch supports the current dtype's wmma codegen
-    cuda_compute_capability = tvm.runtime._ffi_api.GetDeviceAttr(2, 0, 4)
-    major, minor = nvcc.parse_compute_version(cuda_compute_capability)
-    if dtype == "int8":
-        assert major == 7 and minor >= 2
-    elif dtype == "int4" or dtype == "int1":
-        # int4/int1 only support layout TN
-        assert major == 7 and minor == 5 and layout == "TN"
-
-
-    def tune_and_evaluate(M, N, L, dtype, layout):
-        task = autotvm.task.create(
-            "tutorial/auto_tensorcore/test_gemm", args=(N, L, M, dtype, layout), target="cuda"
-        )
-        print(task.config_space)
-
-        logging.getLogger("autotvm").setLevel(logging.DEBUG)
-        logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout))
-
-        measure_option = autotvm.measure_option(builder="local", runner=autotvm.LocalRunner(number=5))
-
-        tuner = autotvm.tuner.XGBTuner(task)
-        tuner.tune(
-            n_trial=1000,
-            measure_option=measure_option,
-            callbacks=[autotvm.callback.log_to_file("matmul.log")],
-        )
-
-        dispatch_context = autotvm.apply_history_best("matmul.log")
-        best_config = dispatch_context.query(task.target, task.workload)
-        print("\nBest config:")
-        print(best_config)
-        with autotvm.apply_history_best("matmul.log"):
-            with tvm.target.Target("cuda"):
-                s, arg_bufs = test_gemm(N, L, M, dtype, layout)
-                print(tvm.lower(s, arg_bufs, simple_mode=True))
-                func = tvm.build(s, arg_bufs)
-        dev_module = func.imported_modules[0]
-        print(dev_module.get_source())
-
-        # check correctness
-        if layout == "NN":
-            shape_a = (N, L)
-            shape_b = (L, M)
-        elif layout == "NT":
-            shape_a = (L, N)
-            shape_b = (L, M)
-        elif layout == "TN":
-            shape_a = (N, L)
-            shape_b = (M, L)
-        elif layout == "TT":
-            shape_a = (L, N)
-            shape_b = (M, L)
-
-        a_np = None
-        b_np = None
-        c_np = None
-        c_np_type = None
-        if dtype == "float16":
-            c_np_type = np.float32
-            a_np = np.random.uniform(size=shape_a).astype(np.float16)
-            b_np = np.random.uniform(size=shape_b).astype(np.float16)
-            if layout == "NN":
-                c_np = np.dot(a_np, b_np)
-            elif layout == "NT":
-                c_np = np.dot(a_np.T, b_np)
-            elif layout == "TN":
-                c_np = np.dot(a_np, b_np.T)
-            elif layout == "TT":
-                c_np = np.dot(a_np.T, b_np.T)
-        elif dtype == "int8":
-            c_np_type = np.int32
-            a_np = np.random.randint(low=-128, high=127, size=shape_a).astype(np.int8)
-            b_np = np.random.randint(low=-128, high=127, size=shape_b).astype(np.int8)
-            if layout == "NN":
-                c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32))
-            elif layout == "NT":
-                c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32))
-            elif layout == "TN":
-                c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32).T)
-            elif layout == "TT":
-                c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32).T)
-        elif dtype == "int4":
-            c_np_type = np.int32
-            a_np_int = np.random.randint(low=-8, high=7, size=shape_a).astype(np.int32)
-            b_np_int = np.random.randint(low=-8, high=7, size=shape_b).astype(np.int32)
-            # "TN"
-            c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T)
-            a_np = np.zeros(shape=(N, int(L / 8)), dtype=np.int32)
-            b_np = np.zeros(shape=(M, int(L / 8)), dtype=np.int32)
-            # a_np --> col_major
-            for i in range(N):
-                for j in range(int(L / 8)):
-                    for k in range(8):
-                        a_np[i, j] = a_np[i, j] | ((a_np_int[i, j * 8 + k] & 0xF) << ((7 - k) * 4))
-
-            # b_np --> row_major
-            for i in range(M):
-                for j in range(int(L / 8)):
-                    for k in range(8):
-                        b_np[i, j] = b_np[i, j] | ((b_np_int[i, j * 8 + k] & 0xF) << ((7 - k) * 4))
-        elif dtype == "int1":
-            c_np_type = np.int32
-            a_np_int = np.random.randint(low=0, high=1, size=shape_a).astype(np.int32)
-            b_np_int = np.random.randint(low=0, high=1, size=shape_b).astype(np.int32)
-            # "TN"
-            c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T)
-            a_np = np.zeros(shape=(N, int(L / 32)), dtype=np.int32)
-            b_np = np.zeros(shape=(M, int(L / 32)), dtype=np.int32)
-            for i in range(N):
-                for j in range(int(L / 32)):
-                    for k in range(32):
-                        a_np[i, j] = a_np[i, j] | ((a_np_int[i, j * 32 + k] & 0xF) << (31 - k))
-
-            for i in range(M):
-                for j in range(int(L / 32)):
-                    for k in range(32):
-                        b_np[i, j] = b_np[i, j] | ((b_np_int[i, j * 32 + k] & 0xF) << (31 - k))
-
-        c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np_type), device=dev)
-        a_tvm = tvm.nd.array(a_np, device=dev)
-        b_tvm = tvm.nd.array(b_np, device=dev)
-        func(a_tvm, b_tvm, c_tvm)
-
-        tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-3)
-
-        evaluator = func.time_evaluator(func.entry_name, dev, number=100)
-        print("Time cost of this operator: %f" % evaluator(a_tvm, b_tvm, c_tvm).mean)
-
-
-    # We do not run the tuning in our webpage server since it takes some time.
-    # Uncomment the following line to run it by yourself.
-
-    # tune_and_evaluate(M, N, L, dtype, layout)
-
-
-
-
-
-
-
-Sample Output
--------------
-.. code-block:: bash
-
-   Best config:
-   [('bx', 4), ('by', 32), ('step_k', 16), ('v', 8)],,None,40
-   Finish loading 162 records
-   produce compute {
-     // attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = 1
-     // attr [compute.local] storage_scope = "wmma.accumulator"
-     allocate compute.local[float32 * 256]
-     // attr [A.shared] storage_scope = "shared"
-     allocate A.shared[float16 * 8448]
-     // attr [B.shared] storage_scope = "shared"
-     allocate B.shared[float16 * 8192]
-     // attr [A.shared.local] storage_scope = "wmma.matrix_b"
-     allocate A.shared.local[float16 * 256]
-     // attr [B.shared.local] storage_scope = "wmma.matrix_a"
-     allocate B.shared.local[float16 * 256]
-     // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 16
-     // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 2
-     // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 32
-     // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 2
-     produce compute.local {
-       for (j.c.init, 0, 1) {
-         tvm_fill_fragment(compute.local, 16, 16, 16, 0, 0f)
-       }
-       // attr [iter_var(k.outer, )] pragma_tensor_core = 1
-       for (k.outer, 0, 2) {
-         produce A.shared {
-           for (ax0.ax1.outer.fused.outer, 0, 8) {
-             // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 32
-             // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 2
-             // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 2
-             A.shared[ramp((((((ax0.ax1.outer.fused.outer*1056) + (floordiv(threadIdx.y, 8)*264)) + (floormod(threadIdx.y, 8)*32)) + (threadIdx.z*16)) + (threadIdx.x*8)), 1, 8)] = A[ramp(((((((ax0.ax1.outer.fused.outer*2048) + (floordiv(threadIdx.y, 8)*512)) + (k.outer*256)) + (floormod(threadIdx.y, 8)*32)) + (threadIdx.z*16)) + (threadIdx.x*8)), 1, 8)]
-           }
-         }
-         produce B.shared {
-           for (ax0.ax1.outer.fused.outer, 0, 8) {
-             // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 32
-             // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 2
-             // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 2
-             B.shared[ramp(((((ax0.ax1.outer.fused.outer*1024) + (threadIdx.y*32)) + (threadIdx.z*16)) + (threadIdx.x*8)), 1, 8)] = B[ramp(((((((k.outer*131072) + (ax0.ax1.outer.fused.outer*16384)) + (threadIdx.y*512)) + (blockIdx.x*32)) + (threadIdx.z*16)) + (threadIdx.x*8)), 1, 8)]
-           }
-         }
-         for (k.inner.outer, 0, 16) {
-           produce A.shared.local {
-             for (ax1, 0, 1) {
-               tvm_load_matrix_sync(A.shared.local, 16, 16, 16, 0, &(A.shared[(((threadIdx.y/16)*4224) + (k.inner.outer*16))]), 264, "col_major")
-             }
-           }
-           produce B.shared.local {
-             for (ax0, 0, 1) {
-               for (ax1, 0, 1) {
-                 tvm_load_matrix_sync(B.shared.local, 16, 16, 16, 0, &(B.shared[((k.inner.outer*512) + (threadIdx.z*16))]), 32, "col_major")
-               }
-             }
-           }
-           for (k.inner.inner, 0, 1) {
-             for (j.c, 0, 1) {
-               tvm_mma_sync(compute.local, 0, B.shared.local, 0, A.shared.local, 0, compute.local, 0)
-             }
-           }
-         }
-       }
-     }
-     for (j.inner.inner.inner, 0, 1) {
-       tvm_store_matrix_sync(compute.local, 16, 16, 16, 0, &(compute[((((threadIdx.y/16)*8192) + (blockIdx.x*32)) + (threadIdx.z*16))]), 512, "col_major")
-     }
-   }
-
-   #include <cuda_fp16.h>
-   __device__ half max(const half a, const half b)
-   {
-     return __hgt(__half(a), __half(b)) ? a : b;
-   }
-   __device__ half min(const half a, const half b)
-   {
-     return __hlt(__half(a), __half(b)) ? a : b;
-   }
-   __device__ half operator+(const volatile __half &a,  const volatile __half &b)
-   {
-     return __hadd(a, b);
-   }
-   __device__ half operator<=(const volatile __half &a,  const volatile __half &b)
-   {
-     return __hlt(a, b);
-   }
-   __device__ half operator*(const volatile __half &a,  const volatile __half &b)
-   {
-     return __hmul(a, b);
-   }
-   #include <mma.h>
-   extern "C" __global__ void default_function_kernel0( half* __restrict__ A,  half* __restrict__ B,  float* __restrict__ compute) {
-     nvcuda::wmma::fragment<nvcuda::wmma::accumulator, 16, 16, 16, float> compute_local[1];
-     __shared__ half A_shared[8448];
-     __shared__ half B_shared[8192];
-     nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, 16, 16, 16, half, nvcuda::wmma::col_major> A_shared_local[1];
-     nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, 16, 16, 16, half, nvcuda::wmma::col_major> B_shared_local[1];
-     for (int j_c_init = 0; j_c_init < 1; ++j_c_init) {
-       (void)nvcuda::wmma::fill_fragment(compute_local[0], 0.000000e+00f);
-     }
-     for (int k_outer = 0; k_outer < 2; ++k_outer) {
-       __syncthreads();
-       for (int ax0_ax1_outer_fused_outer = 0; ax0_ax1_outer_fused_outer < 8; ++ax0_ax1_outer_fused_outer) {
-         ((__shared__ float4*)(A_shared + (((((ax0_ax1_outer_fused_outer * 1056) + ((((int)threadIdx.y) >> 3) * 264)) + ((((int)threadIdx.y) & 7) * 32)) + (((int)threadIdx.z) * 16)) + (((int)threadIdx.x) * 8))))[0] = (( float4*)(A + ((((((ax0_ax1_outer_fused_outer * 2048) + ((((int)threadIdx.y) >> 3) * 512)) + (k_outer * 256)) + ((((int)threadIdx.y) & 7) * 32)) + (((int)threadIdx.z) * 16)) + (((int)threadIdx.x) * 8))))[0];
-       }
-       for (int ax0_ax1_outer_fused_outer1 = 0; ax0_ax1_outer_fused_outer1 < 8; ++ax0_ax1_outer_fused_outer1) {
-         ((__shared__ float4*)(B_shared + ((((ax0_ax1_outer_fused_outer1 * 1024) + (((int)threadIdx.y) * 32)) + (((int)threadIdx.z) * 16)) + (((int)threadIdx.x) * 8))))[0] = (( float4*)(B + ((((((k_outer * 131072) + (ax0_ax1_outer_fused_outer1 * 16384)) + (((int)threadIdx.y) * 512)) + (((int)blockIdx.x) * 32)) + (((int)threadIdx.z) * 16)) + (((int)threadIdx.x) * 8))))[0];
-       }
-       __syncthreads();
-       for (int k_inner_outer = 0; k_inner_outer < 16; ++k_inner_outer) {
-         for (int ax1 = 0; ax1 < 1; ++ax1) {
-           (void)nvcuda::wmma::load_matrix_sync(A_shared_local[0], &(A_shared[(((((int)threadIdx.y) / 16) * 4224) + (k_inner_outer * 16))]), 264);
-         }
-         for (int ax0 = 0; ax0 < 1; ++ax0) {
-           for (int ax11 = 0; ax11 < 1; ++ax11) {
-             (void)nvcuda::wmma::load_matrix_sync(B_shared_local[0], &(B_shared[((k_inner_outer * 512) + (((int)threadIdx.z) * 16))]), 32);
-           }
-         }
-         for (int k_inner_inner = 0; k_inner_inner < 1; ++k_inner_inner) {
-           for (int j_c = 0; j_c < 1; ++j_c) {
-             (void)nvcuda::wmma::mma_sync(compute_local[0], B_shared_local[0], A_shared_local[0], compute_local[0]);
-           }
-         }
-       }
-     }
-     for (int j_inner_inner_inner = 0; j_inner_inner_inner < 1; ++j_inner_inner_inner) {
-       (void)nvcuda::wmma::store_matrix_sync(&(compute[((((((int)threadIdx.y) / 16) * 8192) + (((int)blockIdx.x) * 32)) + (((int)threadIdx.z) * 16))]), compute_local[0], 512, nvcuda::wmma::mem_col_major);
-     }
-   }
-
-
-   Time cost of this operator: 0.000008
-
-Summary
--------
-This tutorial demonstrates how to use the AutoTensorCoreCodeGen of TVM
-to generate tensorcore kernels.
-
-
-.. _sphx_glr_download_tutorials_optimize_opt_matmul_auto_tensorcore.py:
-
-
-.. only :: html
-
- .. container:: sphx-glr-footer
-    :class: sphx-glr-footer-example
-
-
-
-  .. container:: sphx-glr-download
-
-     :download:`Download Python source code: opt_matmul_auto_tensorcore.py <opt_matmul_auto_tensorcore.py>`
-
-
-
-  .. container:: sphx-glr-download
-
-     :download:`Download Jupyter notebook: opt_matmul_auto_tensorcore.ipynb <opt_matmul_auto_tensorcore.ipynb>`
-
-
-.. only:: html
-
- .. rst-class:: sphx-glr-signature
-
-    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_
diff --git a/docs/_sources/tutorials/optimize/sg_execution_times.rst.txt b/docs/_sources/tutorials/optimize/sg_execution_times.rst.txt
index 3d661fa..e268cbf 100644
--- a/docs/_sources/tutorials/optimize/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorials/optimize/sg_execution_times.rst.txt
@@ -5,9 +5,8 @@
 
 Computation times
 =================
-**00:28.362** total execution time for **tutorials_optimize** files:
+**00:31.767** total execution time for **tutorials_optimize** files:
 
-- **00:25.944**: :ref:`sphx_glr_tutorials_optimize_opt_gemm.py` (``opt_gemm.py``)
-- **00:01.209**: :ref:`sphx_glr_tutorials_optimize_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``)
-- **00:00.975**: :ref:`sphx_glr_tutorials_optimize_opt_conv_cuda.py` (``opt_conv_cuda.py``)
-- **00:00.233**: :ref:`sphx_glr_tutorials_optimize_opt_matmul_auto_tensorcore.py` (``opt_matmul_auto_tensorcore.py``)
+- **00:29.610**: :ref:`sphx_glr_tutorials_optimize_opt_gemm.py` (``opt_gemm.py``)
+- **00:01.150**: :ref:`sphx_glr_tutorials_optimize_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``)
+- **00:01.007**: :ref:`sphx_glr_tutorials_optimize_opt_conv_cuda.py` (``opt_conv_cuda.py``)
diff --git a/docs/_sources/tutorials/topi/intro_topi.rst.txt b/docs/_sources/tutorials/topi/intro_topi.rst.txt
index 8033f4c..2d07c9a 100644
--- a/docs/_sources/tutorials/topi/intro_topi.rst.txt
+++ b/docs/_sources/tutorials/topi/intro_topi.rst.txt
@@ -231,7 +231,7 @@ As you can see, scheduled stages of computation have been accumulated and we can
 
  .. code-block:: none
 
-    [stage(a, placeholder(a, 0x10266f380)), stage(b, placeholder(b, 0x191459950)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range( [...]
+    [stage(a, placeholder(a, 0xa83745f0)), stage(b, placeholder(b, 0x1345fe8f0)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(m [...]
 
 
 
diff --git a/docs/_sources/tutorials/topi/sg_execution_times.rst.txt b/docs/_sources/tutorials/topi/sg_execution_times.rst.txt
index 08f9d40..f46f3da 100644
--- a/docs/_sources/tutorials/topi/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorials/topi/sg_execution_times.rst.txt
@@ -5,6 +5,6 @@
 
 Computation times
 =================
-**00:00.641** total execution time for **tutorials_topi** files:
+**00:00.606** total execution time for **tutorials_topi** files:
 
-- **00:00.641**: :ref:`sphx_glr_tutorials_topi_intro_topi.py` (``intro_topi.py``)
+- **00:00.606**: :ref:`sphx_glr_tutorials_topi_intro_topi.py` (``intro_topi.py``)
diff --git a/docs/_sources/vta/tutorials/autotvm/sg_execution_times.rst.txt b/docs/_sources/vta/tutorials/autotvm/sg_execution_times.rst.txt
index 666a76d..7e392c1 100644
--- a/docs/_sources/vta/tutorials/autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/vta/tutorials/autotvm/sg_execution_times.rst.txt
@@ -5,7 +5,7 @@
 
 Computation times
 =================
-**00:08.272** total execution time for **vta_tutorials_autotvm** files:
+**00:08.081** total execution time for **vta_tutorials_autotvm** files:
 
-- **00:08.045**: :ref:`sphx_glr_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``)
-- **00:00.228**: :ref:`sphx_glr_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``)
+- **00:07.868**: :ref:`sphx_glr_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``)
+- **00:00.213**: :ref:`sphx_glr_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``)
diff --git a/docs/_sources/vta/tutorials/autotvm/tune_relay_vta.rst.txt b/docs/_sources/vta/tutorials/autotvm/tune_relay_vta.rst.txt
index e60fc2f..9b4991b 100644
--- a/docs/_sources/vta/tutorials/autotvm/tune_relay_vta.rst.txt
+++ b/docs/_sources/vta/tutorials/autotvm/tune_relay_vta.rst.txt
@@ -190,7 +190,7 @@ Here we use a Pynq-Z1 board as an example.
 
 
     # Tracker host and port can be set by your environment
-    tracker_host = os.environ.get("TVM_TRACKER_HOST", "0.0.0.0")
+    tracker_host = os.environ.get("TVM_TRACKER_HOST", "127.0.0.1")
     tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))
 
     # Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file
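
    As a usage note, the tracker that `tracker_host`/`tracker_port` point at is queried over
    TVM's RPC infrastructure. A minimal sketch of checking that a Pynq board has registered with
    it (assuming a tracker is already running on these default host/port values):

        import os
        from tvm import rpc

        tracker_host = os.environ.get("TVM_TRACKER_HOST", "127.0.0.1")
        tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))

        tracker = rpc.connect_tracker(tracker_host, tracker_port)
        print(tracker.text_summary())   # should list the registered "pynq" device key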
@@ -499,7 +499,7 @@ Finally, we launch tuning jobs and evaluate the end-to-end performance.
  .. code-block:: none
 
     Extract tasks...
-
   [model parameter download progress elided: ...1% through ...100%, 0.73 MB total]
+
    ...1%, 0.01 MB, 258 KB/s, 0 seconds passed
    ...2%, 0.02 MB, 506 KB/s, 0 seconds passed
    ...3%, 0.02 MB, 737 KB/s, 0 seconds passed
    ...4%, 0.03 MB, 979 KB/s, 0 seconds passed
    ...5%, 0.04 MB, 1191 KB/s, 0 seconds passed
    ...6%, 0.05 MB, 1301 KB/s, 0 seconds passed
    ...7%, 0.05 MB, 1483 KB/s, 0 seconds passed
    ...8%, 0.06 MB, 1689 KB/s, 0 seconds passed
    ...9%, 0.07 MB, 1810 KB/s, 0 seconds passed
    ...10%, 0.08 MB, 2006 KB/s, 0 seconds passed
    ...11%, 0.09 MB, 2201 KB/s, 0 seconds passed
    ...13%, 0.09 MB, 2388 KB/s, 0 seconds passed
    ...14%, 0.10 MB, 2310 KB/s, 0 seconds passed
    ...15%, 0.11 MB, 2481 KB/s, 0 seconds passed
    ...16%, 0.12 MB, 2653 KB/s, 0 seconds passed
    ...17%, 0.12 MB, 2822 KB/s, 0 seconds passed
    ...18%, 0.13 MB, 2960 KB/s, 0 seconds passed
    ...19%, 0.14 MB, 3127 KB/s, 0 seconds passed
    ...20%, 0.15 MB, 3294 KB/s, 0 seconds passed
    ...21%, 0.16 MB, 3460 KB/s, 0 seconds passed
    ...22%, 0.16 MB, 3619 KB/
 s, 0 seconds passed
    ...23%, 0.17 MB, 3783 KB/s, 0 seconds passed
    ...24%, 0.18 MB, 3946 KB/s, 0 seconds passed
    ...26%, 0.19 MB, 4109 KB/s, 0 seconds passed
    ...27%, 0.20 MB, 4196 KB/s, 0 seconds passed
    ...28%, 0.20 MB, 4353 KB/s, 0 seconds passed
    ...29%, 0.21 MB, 4363 KB/s, 0 seconds passed
    ...30%, 0.22 MB, 4376 KB/s, 0 seconds passed
    ...31%, 0.23 MB, 4523 KB/s, 0 seconds passed
    ...32%, 0.23 MB, 4670 KB/s, 0 seconds passed
    ...33%, 0.24 MB, 4802 KB/s, 0 seconds passed
    ...34%, 0.25 MB, 4891 KB/s, 0 seconds passed
    ...35%, 0.26 MB, 5036 KB/s, 0 seconds passed
    ...36%, 0.27 MB, 5181 KB/s, 0 seconds passed
    ...38%, 0.27 MB, 5326 KB/s, 0 seconds passed
    ...39%, 0.28 MB, 5415 KB/s, 0 seconds passed
    ...40%, 0.29 MB, 5558 KB/s, 0 seconds passed
    ...41%, 0.30 MB, 5613 KB/s, 0 seconds passed
    ...42%, 0.30 MB, 5753 KB/s, 0 seconds passed
    ...43%, 0.31 MB, 5877 KB/s, 0 seconds passed
    ...44%, 0.32 MB, 6015 KB/s, 0 seconds pass
 ed
    ...45%, 0.33 MB, 6037 KB/s, 0 seconds passed
    ...46%, 0.34 MB, 6172 KB/s, 0 seconds passed
    ...47%, 0.34 MB, 6139 KB/s, 0 seconds passed
    ...48%, 0.35 MB, 6271 KB/s, 0 seconds passed
    ...49%, 0.36 MB, 6363 KB/s, 0 seconds passed
    ...51%, 0.37 MB, 6492 KB/s, 0 seconds passed
    ...52%, 0.38 MB, 6621 KB/s, 0 seconds passed
    ...53%, 0.38 MB, 6752 KB/s, 0 seconds passed
    ...54%, 0.39 MB, 6815 KB/s, 0 seconds passed
    ...55%, 0.40 MB, 6943 KB/s, 0 seconds passed
    ...56%, 0.41 MB, 6985 KB/s, 0 seconds passed
    ...57%, 0.41 MB, 7111 KB/s, 0 seconds passed
    ...58%, 0.42 MB, 7160 KB/s, 0 seconds passed
    ...59%, 0.43 MB, 7283 KB/s, 0 seconds passed
    ...60%, 0.44 MB, 7405 KB/s, 0 seconds passed
    ...61%, 0.45 MB, 7529 KB/s, 0 seconds passed
    ...63%, 0.45 MB, 7569 KB/s, 0 seconds passed
    ...64%, 0.46 MB, 7691 KB/s, 0 seconds passed
    ...65%, 0.47 MB, 7556 KB/s, 0 seconds passed
    ...66%, 0.48 MB, 7672 KB/s, 0 seconds passed
    ...67%, 0.
 48 MB, 7788 KB/s, 0 seconds passed
    ...68%, 0.49 MB, 7906 KB/s, 0 seconds passed
    ...69%, 0.50 MB, 7997 KB/s, 0 seconds passed
    ...70%, 0.51 MB, 8113 KB/s, 0 seconds passed
    ...71%, 0.52 MB, 8123 KB/s, 0 seconds passed
    ...72%, 0.52 MB, 8237 KB/s, 0 seconds passed
    ...73%, 0.53 MB, 8275 KB/s, 0 seconds passed
    ...74%, 0.54 MB, 8388 KB/s, 0 seconds passed
    ...76%, 0.55 MB, 8500 KB/s, 0 seconds passed
    ...77%, 0.55 MB, 8613 KB/s, 0 seconds passed
    ...78%, 0.56 MB, 8660 KB/s, 0 seconds passed
    ...79%, 0.57 MB, 8771 KB/s, 0 seconds passed
    ...80%, 0.58 MB, 8788 KB/s, 0 seconds passed
    ...81%, 0.59 MB, 8897 KB/s, 0 seconds passed
    ...82%, 0.59 MB, 8974 KB/s, 0 seconds passed
    ...83%, 0.60 MB, 9082 KB/s, 0 seconds passed
    ...84%, 0.61 MB, 8952 KB/s, 0 seconds passed
    ...85%, 0.62 MB, 9058 KB/s, 0 seconds passed
    ...86%, 0.62 MB, 9161 KB/s, 0 seconds passed
    ...87%, 0.63 MB, 9268 KB/s, 0 seconds passed
    ...89%, 0.64 MB, 9342 KB/s,
  0 seconds passed
    ...90%, 0.65 MB, 9447 KB/s, 0 seconds passed
    ...91%, 0.66 MB, 9551 KB/s, 0 seconds passed
    ...92%, 0.66 MB, 9551 KB/s, 0 seconds passed
    ...93%, 0.67 MB, 9654 KB/s, 0 seconds passed
    ...94%, 0.68 MB, 9690 KB/s, 0 seconds passed
    ...95%, 0.69 MB, 9792 KB/s, 0 seconds passed
    ...96%, 0.70 MB, 9886 KB/s, 0 seconds passed
    ...97%, 0.70 MB, 9986 KB/s, 0 seconds passed
    ...98%, 0.71 MB, 10087 KB/s, 0 seconds passed
    ...99%, 0.72 MB, 10189 KB/s, 0 seconds passed
    ...100%, 0.73 MB, 10277 KB/s, 0 seconds passed
     Extracted 10 conv2d tasks:
     (1, 14, 14, 256, 512, 1, 1, 0, 0, 2, 2)
     (1, 28, 28, 128, 256, 1, 1, 0, 0, 2, 2)
diff --git a/docs/_sources/vta/tutorials/frontend/deploy_classification.rst.txt b/docs/_sources/vta/tutorials/frontend/deploy_classification.rst.txt
index 02c04aa..9b04adb 100644
--- a/docs/_sources/vta/tutorials/frontend/deploy_classification.rst.txt
+++ b/docs/_sources/vta/tutorials/frontend/deploy_classification.rst.txt
@@ -259,8 +259,8 @@ The compilation steps are:
 
  .. code-block:: none
 
-    ...12%, 0.01 MB, 47 KB/s, 0 seconds passed ... 100%, 0.06 MB, 363 KB/s, 0 seconds passed
-    resnet18_v1 inference graph built in 10.54s!
+    ...12%, 0.01 MB, 50 KB/s, 0 seconds passed ... 100%, 0.06 MB, 383 KB/s, 0 seconds passed
+    resnet18_v1 inference graph built in 10.41s!
     /workspace/docs/../python/tvm/contrib/graph_runtime.py:26: UserWarning: This function has been moved to tvm.contrib.graph_executor and will be removed in the next TVM release
       "This function has been moved to tvm.contrib.graph_executor and will be removed "
 
@@ -358,8 +358,8 @@ and an input test image.
 
  .. code-block:: none
 
-    File synset.txt exists, skip.
-    File cat.png exists, skip.
+    File /workspace/vta/tutorials/frontend/synset.txt exists, skip.
+    File /workspace/vta/tutorials/frontend/cat.png exists, skip.
 
     Execution statistics:
             inp_load_nbytes :          5549568
diff --git a/docs/_sources/vta/tutorials/frontend/sg_execution_times.rst.txt b/docs/_sources/vta/tutorials/frontend/sg_execution_times.rst.txt
index 37cd9fa..32d63de 100644
--- a/docs/_sources/vta/tutorials/frontend/sg_execution_times.rst.txt
+++ b/docs/_sources/vta/tutorials/frontend/sg_execution_times.rst.txt
@@ -5,6 +5,6 @@
 
 Computation times
 =================
-**00:29.774** total execution time for **vta_tutorials_frontend** files:
+**00:29.521** total execution time for **vta_tutorials_frontend** files:
 
-- **00:29.774**: :ref:`sphx_glr_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``)
+- **00:29.521**: :ref:`sphx_glr_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``)
diff --git a/docs/_sources/vta/tutorials/optimize/convolution_opt.rst.txt b/docs/_sources/vta/tutorials/optimize/convolution_opt.rst.txt
index 144a05e..0ca92de 100644
--- a/docs/_sources/vta/tutorials/optimize/convolution_opt.rst.txt
+++ b/docs/_sources/vta/tutorials/optimize/convolution_opt.rst.txt
@@ -252,8 +252,8 @@ Those include:
 
     primfn(data_1: handle, kernel_1: handle, res_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {kernel: Buffer(kernel_2: Pointer(int8), int8, [16, 16, 3, 3, 16, 16], []),
-                 res: Buffer(res_2: Pointer(int8), int8, [1, 16, 14, 14, 1, 16], []),
+      buffers = {res: Buffer(res_2: Pointer(int8), int8, [1, 16, 14, 14, 1, 16], []),
+                 kernel: Buffer(kernel_2: Pointer(int8), int8, [16, 16, 3, 3, 16, 16], []),
                  data: Buffer(data_2: Pointer(int8), int8, [1, 16, 14, 14, 1, 16], [])}
       buffer_map = {data_1: data, kernel_1: kernel, res_1: res} {
       attr [data_buf: Pointer(int8)] "storage_scope" = "global";
@@ -631,8 +631,8 @@ and mapping the shift, and clipping computation to the vector ALU.
 
     primfn(data_1: handle, kernel_1: handle, res_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {kernel: Buffer(kernel_2: Pointer(int8), int8, [16, 16, 3, 3, 16, 16], []),
-                 res: Buffer(res_2: Pointer(int8), int8, [1, 16, 14, 14, 1, 16], []),
+      buffers = {res: Buffer(res_2: Pointer(int8), int8, [1, 16, 14, 14, 1, 16], []),
+                 kernel: Buffer(kernel_2: Pointer(int8), int8, [16, 16, 3, 3, 16, 16], []),
                  data: Buffer(data_2: Pointer(int8), int8, [1, 16, 14, 14, 1, 16], [])}
       buffer_map = {data_1: data, kernel_1: kernel, res_1: res} {
       attr [res_conv: Pointer(int32)] "storage_scope" = "local.acc_buffer";
diff --git a/docs/_sources/vta/tutorials/optimize/matrix_multiply_opt.rst.txt b/docs/_sources/vta/tutorials/optimize/matrix_multiply_opt.rst.txt
index a27cf37..cc82bc9 100644
--- a/docs/_sources/vta/tutorials/optimize/matrix_multiply_opt.rst.txt
+++ b/docs/_sources/vta/tutorials/optimize/matrix_multiply_opt.rst.txt
@@ -189,8 +189,8 @@ Those include:
 
     primfn(data_1: handle, weight_1: handle, res_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {weight: Buffer(weight_2: Pointer(int8), int8, [64, 64, 16, 16], []),
-                 res: Buffer(res_2: Pointer(int8), int8, [1, 64, 1, 16], []),
+      buffers = {res: Buffer(res_2: Pointer(int8), int8, [1, 64, 1, 16], []),
+                 weight: Buffer(weight_2: Pointer(int8), int8, [64, 64, 16, 16], []),
                  data: Buffer(data_2: Pointer(int8), int8, [1, 64, 1, 16], [])}
       buffer_map = {data_1: data, weight_1: weight, res_1: res} {
       attr [data_buf: Pointer(int8)] "storage_scope" = "global";
@@ -351,8 +351,8 @@ below:
 
     primfn(data_1: handle, weight_1: handle, res_1: handle) -> ()
       attr = {"global_symbol": "main", "tir.noalias": True}
-      buffers = {weight: Buffer(weight_2: Pointer(int8), int8, [64, 64, 16, 16], []),
-                 res: Buffer(res_2: Pointer(int8), int8, [1, 64, 1, 16], []),
+      buffers = {res: Buffer(res_2: Pointer(int8), int8, [1, 64, 1, 16], []),
+                 weight: Buffer(weight_2: Pointer(int8), int8, [64, 64, 16, 16], []),
                  data: Buffer(data_2: Pointer(int8), int8, [1, 64, 1, 16], [])}
       buffer_map = {data_1: data, weight_1: weight, res_1: res} {
       attr [data_buf: Pointer(int8)] "storage_scope" = "global";
diff --git a/docs/_sources/vta/tutorials/optimize/sg_execution_times.rst.txt b/docs/_sources/vta/tutorials/optimize/sg_execution_times.rst.txt
index cdac377..12c7393 100644
--- a/docs/_sources/vta/tutorials/optimize/sg_execution_times.rst.txt
+++ b/docs/_sources/vta/tutorials/optimize/sg_execution_times.rst.txt
@@ -5,7 +5,7 @@
 
 Computation times
 =================
-**00:03.794** total execution time for **vta_tutorials_optimize** files:
+**00:03.642** total execution time for **vta_tutorials_optimize** files:
 
-- **00:03.262**: :ref:`sphx_glr_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``)
-- **00:00.532**: :ref:`sphx_glr_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``)
+- **00:03.139**: :ref:`sphx_glr_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``)
+- **00:00.503**: :ref:`sphx_glr_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``)
diff --git a/docs/_sources/vta/tutorials/sg_execution_times.rst.txt b/docs/_sources/vta/tutorials/sg_execution_times.rst.txt
index d9afdd1..d2fff84 100644
--- a/docs/_sources/vta/tutorials/sg_execution_times.rst.txt
+++ b/docs/_sources/vta/tutorials/sg_execution_times.rst.txt
@@ -5,7 +5,7 @@
 
 Computation times
 =================
-**00:00.971** total execution time for **vta_tutorials** files:
+**00:00.917** total execution time for **vta_tutorials** files:
 
-- **00:00.495**: :ref:`sphx_glr_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``)
-- **00:00.476**: :ref:`sphx_glr_vta_tutorials_vta_get_started.py` (``vta_get_started.py``)
+- **00:00.474**: :ref:`sphx_glr_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``)
+- **00:00.443**: :ref:`sphx_glr_vta_tutorials_vta_get_started.py` (``vta_get_started.py``)
diff --git a/docs/_static/css/tlcpack_theme.css b/docs/_static/css/tlcpack_theme.css
index 929d317..2dca101 100644
--- a/docs/_static/css/tlcpack_theme.css
+++ b/docs/_static/css/tlcpack_theme.css
@@ -681,6 +681,10 @@ footer .btn.float-right:after {
     color: #303030;
 }
 
+.wy-side-nav-search a:hover {
+    text-decoration: none;
+}
+
 @media only screen and (max-width : 991px) {
   .wy-nav-side .wy-side-nav-search {
     width: 100%;
diff --git a/docs/api/doxygen/_2workspace_2include_2tvm_2tir_2transform_8h-example.html b/docs/api/doxygen/_2workspace_2include_2tvm_2tir_2transform_8h-example.html
new file mode 100644
index 0000000..6ce74a4
--- /dev/null
+++ b/docs/api/doxygen/_2workspace_2include_2tvm_2tir_2transform_8h-example.html
@@ -0,0 +1,94 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.11"/>
+<title>tvm: /workspace/include/tvm/tir/transform.h</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectalign" style="padding-left: 0.5em;">
+   <div id="projectname">tvm
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.11 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+  <div id="navrow1" class="tabs">
+    <ul class="tablist">
+      <li><a href="index.html"><span>Main&#160;Page</span></a></li>
+      <li><a href="namespaces.html"><span>Namespaces</span></a></li>
+      <li><a href="annotated.html"><span>Classes</span></a></li>
+      <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
+      <li>
+        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+      </li>
+    </ul>
+  </div>
+</div><!-- top -->
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">/workspace/include/tvm/tir/transform.h</div>  </div>
+</div><!--header-->
+<div class="contents">
+<p>Compact the buffer access region by removing the buffer regions that are not accessed, i.e. narrowing the buffer shape and adjust the access region if necessary.</p>
+<p>Before narrowing, <code>B</code> is a <code>[16, 16]</code> buffer, but only a skinny vector <code>B[i, 0:16]</code> is accessed. </p><div class="fragment"><div class="line"><span class="keywordflow">for</span> i in range(0, 16):</div><div class="line">    with tir.block([]):</div><div class="line">        B = tir.alloc_buffer(16, 16)</div><div class="line">        for j in range(0, 16):</div><div class="line">            B[i, j] = A[i, j] + 1</div><div class="line">        for j in r [...]
+<div class="fragment"><div class="line"><span class="keywordflow">for</span> i in range(0, 16):</div><div class="line">    with tir.block([]):</div><div class="line">        B = tir.alloc_buffer(1, 16)</div><div class="line">        for j in range(0, 16):</div><div class="line">            B[0, j] = A[i, j] + 1</div><div class="line">        for j in range(0, 16):</div><div class="line">            C[i, j] = B[0, j] + 1</div></div><!-- fragment --><dl class="section return"><dt>Returns</ [...]
+<div class="fragment"><div class="line"><span class="comment">/*</span></div><div class="line"><span class="comment"> * Licensed to the Apache Software Foundation (ASF) under one</span></div><div class="line"><span class="comment"> * or more contributor license agreements.  See the NOTICE file</span></div><div class="line"><span class="comment"> * distributed with this work for additional information</span></div><div class="line"><span class="comment"> * regarding copyright ownership.  T [...]
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by &#160;<a href="http://www.doxygen.org/index.html">
+<img class="footer" src="doxygen.png" alt="doxygen"/>
+</a> 1.8.11
+</small></address>
+</body>
+</html>
diff --git a/docs/api/doxygen/algorithm_8h.html b/docs/api/doxygen/algorithm_8h.html
index e5c7f67..9595282 100644
--- a/docs/api/doxygen/algorithm_8h.html
+++ b/docs/api/doxygen/algorithm_8h.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/algorithm_8h_source.html b/docs/api/doxygen/algorithm_8h_source.html
index df6b99e..050df2f 100644
--- a/docs/api/doxygen/algorithm_8h_source.html
+++ b/docs/api/doxygen/algorithm_8h_source.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/analyzer_8h.html b/docs/api/doxygen/analyzer_8h.html
index aa79b57..bcb2783 100644
--- a/docs/api/doxygen/analyzer_8h.html
+++ b/docs/api/doxygen/analyzer_8h.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/analyzer_8h_source.html b/docs/api/doxygen/analyzer_8h_source.html
index cd10dcd..bcabe90 100644
--- a/docs/api/doxygen/analyzer_8h_source.html
+++ b/docs/api/doxygen/analyzer_8h_source.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/annotated.html b/docs/api/doxygen/annotated.html
index b56bb1c..f53fab1 100644
--- a/docs/api/doxygen/annotated.html
+++ b/docs/api/doxygen/annotated.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/annotation_8h.html b/docs/api/doxygen/annotation_8h.html
index bdc6aa6..540768d 100644
--- a/docs/api/doxygen/annotation_8h.html
+++ b/docs/api/doxygen/annotation_8h.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/annotation_8h_source.html b/docs/api/doxygen/annotation_8h_source.html
index feec11d..32fda38 100644
--- a/docs/api/doxygen/annotation_8h_source.html
+++ b/docs/api/doxygen/annotation_8h_source.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/array__utils_8h.html b/docs/api/doxygen/array__utils_8h.html
index 8beda7c..56df12f 100644
--- a/docs/api/doxygen/array__utils_8h.html
+++ b/docs/api/doxygen/array__utils_8h.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/array__utils_8h_source.html b/docs/api/doxygen/array__utils_8h_source.html
index 2707ea6..8a7bc02 100644
--- a/docs/api/doxygen/array__utils_8h_source.html
+++ b/docs/api/doxygen/array__utils_8h_source.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/attr__registry__map_8h.html b/docs/api/doxygen/attr__registry__map_8h.html
index 02c402b..86dd767 100644
--- a/docs/api/doxygen/attr__registry__map_8h.html
+++ b/docs/api/doxygen/attr__registry__map_8h.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/attr__registry__map_8h_source.html b/docs/api/doxygen/attr__registry__map_8h_source.html
index a3b6323..f59ba8b 100644
--- a/docs/api/doxygen/attr__registry__map_8h_source.html
+++ b/docs/api/doxygen/attr__registry__map_8h_source.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/auto__schedule_8h.html b/docs/api/doxygen/auto__schedule_8h.html
index 12f034e..9aed234 100644
--- a/docs/api/doxygen/auto__schedule_8h.html
+++ b/docs/api/doxygen/auto__schedule_8h.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/auto__schedule_8h_source.html b/docs/api/doxygen/auto__schedule_8h_source.html
index a797138..d449f43 100644
--- a/docs/api/doxygen/auto__schedule_8h_source.html
+++ b/docs/api/doxygen/auto__schedule_8h_source.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/auto__scheduler_2feature_8h.html b/docs/api/doxygen/auto__scheduler_2feature_8h.html
index 149ce40..3fba80e 100644
--- a/docs/api/doxygen/auto__scheduler_2feature_8h.html
+++ b/docs/api/doxygen/auto__scheduler_2feature_8h.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/auto__scheduler_2feature_8h_source.html b/docs/api/doxygen/auto__scheduler_2feature_8h_source.html
index 5ebaedf..8611087 100644
--- a/docs/api/doxygen/auto__scheduler_2feature_8h_source.html
+++ b/docs/api/doxygen/auto__scheduler_2feature_8h_source.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/autodiff_8h.html b/docs/api/doxygen/autodiff_8h.html
index 7066aba..99ddcdb 100644
--- a/docs/api/doxygen/autodiff_8h.html
+++ b/docs/api/doxygen/autodiff_8h.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/autodiff_8h_source.html b/docs/api/doxygen/autodiff_8h_source.html
index ba9291a..d40053c 100644
--- a/docs/api/doxygen/autodiff_8h_source.html
+++ b/docs/api/doxygen/autodiff_8h_source.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/base_8h.html b/docs/api/doxygen/base_8h.html
index b199ed9..042d9b7 100644
--- a/docs/api/doxygen/base_8h.html
+++ b/docs/api/doxygen/base_8h.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/base_8h_source.html b/docs/api/doxygen/base_8h_source.html
index 1186334..bea4b2c 100644
--- a/docs/api/doxygen/base_8h_source.html
+++ b/docs/api/doxygen/base_8h_source.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/bias__add_8h.html b/docs/api/doxygen/bias__add_8h.html
index f551c98..50deab3 100644
--- a/docs/api/doxygen/bias__add_8h.html
+++ b/docs/api/doxygen/bias__add_8h.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/bias__add_8h_source.html b/docs/api/doxygen/bias__add_8h_source.html
index 1dffd0e..5cd753d 100644
--- a/docs/api/doxygen/bias__add_8h_source.html
+++ b/docs/api/doxygen/bias__add_8h_source.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/bitserial_8h.html b/docs/api/doxygen/bitserial_8h.html
index 8db8dcc..aa8dc84 100644
--- a/docs/api/doxygen/bitserial_8h.html
+++ b/docs/api/doxygen/bitserial_8h.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/bitserial_8h_source.html b/docs/api/doxygen/bitserial_8h_source.html
index 5287dce..46311b9 100644
--- a/docs/api/doxygen/bitserial_8h_source.html
+++ b/docs/api/doxygen/bitserial_8h_source.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/block__scope_8h.html b/docs/api/doxygen/block__scope_8h.html
index 24b13c0..f6b406b 100644
--- a/docs/api/doxygen/block__scope_8h.html
+++ b/docs/api/doxygen/block__scope_8h.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/block__scope_8h_source.html b/docs/api/doxygen/block__scope_8h_source.html
index 8bf3a32..01f9d41 100644
--- a/docs/api/doxygen/block__scope_8h_source.html
+++ b/docs/api/doxygen/block__scope_8h_source.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/bound_8h.html b/docs/api/doxygen/bound_8h.html
index da8efa0..5fd10a9 100644
--- a/docs/api/doxygen/bound_8h.html
+++ b/docs/api/doxygen/bound_8h.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/bound_8h_source.html b/docs/api/doxygen/bound_8h_source.html
index 12d1498..5ee77e4 100644
--- a/docs/api/doxygen/bound_8h_source.html
+++ b/docs/api/doxygen/bound_8h_source.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/broadcast_8h.html b/docs/api/doxygen/broadcast_8h.html
index 341e718..06405d3 100644
--- a/docs/api/doxygen/broadcast_8h.html
+++ b/docs/api/doxygen/broadcast_8h.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/broadcast_8h_source.html b/docs/api/doxygen/broadcast_8h_source.html
index 3abdbd8..3d378e1 100644
--- a/docs/api/doxygen/broadcast_8h_source.html
+++ b/docs/api/doxygen/broadcast_8h_source.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/buffer_8h.html b/docs/api/doxygen/buffer_8h.html
index cc805dd..c39b342 100644
--- a/docs/api/doxygen/buffer_8h.html
+++ b/docs/api/doxygen/buffer_8h.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/buffer_8h_source.html b/docs/api/doxygen/buffer_8h_source.html
index 3a57bbe..3d101e5 100644
--- a/docs/api/doxygen/buffer_8h_source.html
+++ b/docs/api/doxygen/buffer_8h_source.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/builtin_8h.html b/docs/api/doxygen/builtin_8h.html
index 6e22a63..880830e 100644
--- a/docs/api/doxygen/builtin_8h.html
+++ b/docs/api/doxygen/builtin_8h.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/builtin_8h_source.html b/docs/api/doxygen/builtin_8h_source.html
index 4ff6dbf..0ed3dc6 100644
--- a/docs/api/doxygen/builtin_8h_source.html
+++ b/docs/api/doxygen/builtin_8h_source.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
@@ -89,16 +90,16 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <div class="title">builtin.h</div>  </div>
 </div><!--header-->
 <div class="contents">
-<a href="builtin_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a name="l00001"></a><span class="lineno">    1</span>&#160;<span class="comment">/*</span></div><div class="line"><a name="l00002"></a><span class="lineno">    2</span>&#160;<span class="comment"> * Licensed to the Apache Software Foundation (ASF) under one</span></div><div class="line"><a name="l00003"></a><span class="lineno">    3</span>&#160;<span class="comment"> * or more con [...]
-<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1cad28cfc7b69fd8745e12a4f0024d6942a"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1cad28cfc7b69fd8745e12a4f0024d6942a">tvm::tir::builtin::kArrNDim</a></div><div class="ttdef"><b>Definition:</b> builtin.h:568</div></div>
-<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1ca0b8af30aa268164148d5bfe1d8c2ba54"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1ca0b8af30aa268164148d5bfe1d8c2ba54">tvm::tir::builtin::kArrAddr</a></div><div class="ttdef"><b>Definition:</b> builtin.h:564</div></div>
+<a href="builtin_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a name="l00001"></a><span class="lineno">    1</span>&#160;<span class="comment">/*</span></div><div class="line"><a name="l00002"></a><span class="lineno">    2</span>&#160;<span class="comment"> * Licensed to the Apache Software Foundation (ASF) under one</span></div><div class="line"><a name="l00003"></a><span class="lineno">    3</span>&#160;<span class="comment"> * or more con [...]
+<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1cad28cfc7b69fd8745e12a4f0024d6942a"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1cad28cfc7b69fd8745e12a4f0024d6942a">tvm::tir::builtin::kArrNDim</a></div><div class="ttdef"><b>Definition:</b> builtin.h:576</div></div>
+<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1ca0b8af30aa268164148d5bfe1d8c2ba54"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1ca0b8af30aa268164148d5bfe1d8c2ba54">tvm::tir::builtin::kArrAddr</a></div><div class="ttdef"><b>Definition:</b> builtin.h:572</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a925a45e5bb05e0cbf2daf2ffdbdcf53a"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a925a45e5bb05e0cbf2daf2ffdbdcf53a">tvm::tir::builtin::tvm_storage_sync</a></div><div class="ttdeci">const Op &amp; tvm_storage_sync()</div><div class="ttdoc">See pseudo code. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a15c5e0e0478e0ebff91690f60992cf3f"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a15c5e0e0478e0ebff91690f60992cf3f">tvm::tir::builtin::tvm_stack_alloca</a></div><div class="ttdeci">const Op &amp; tvm_stack_alloca()</div><div class="ttdoc">See pesudo code. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_aa1d19e758595200998a4e1ea39767b6b"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#aa1d19e758595200998a4e1ea39767b6b">tvm::tir::builtin::tvm_thread_allreduce</a></div><div class="ttdeci">const Op &amp; tvm_thread_allreduce()</div><div class="ttdoc">See pesudo code. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_aca44a85c87273dfab1731421f4edd2bf"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#aca44a85c87273dfab1731421f4edd2bf">tvm::tir::builtin::tvm_warp_shuffle</a></div><div class="ttdeci">const Op &amp; tvm_warp_shuffle()</div><div class="ttdoc">See pseudo code. </div></div>
 <div class="ttc" id="namespacetvm_html"><div class="ttname"><a href="namespacetvm.html">tvm</a></div><div class="ttdef"><b>Definition:</b> analyzer.h:36</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a83892dca19e44a96752625c65c38d645"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a83892dca19e44a96752625c65c38d645">tvm::tir::builtin::call_llvm_intrin</a></div><div class="ttdeci">const Op &amp; call_llvm_intrin()</div><div class="ttdoc">Call an LLVM intrinsic with a given intrinsic id and signature from the types of args in the runtime ...</div></div>
-<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1cabf798b873c868b7d77ced30c9907037d"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1cabf798b873c868b7d77ced30c9907037d">tvm::tir::builtin::kArrDeviceType</a></div><div class="ttdef"><b>Definition:</b> builtin.h:574</div></div>
+<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1cabf798b873c868b7d77ced30c9907037d"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1cabf798b873c868b7d77ced30c9907037d">tvm::tir::builtin::kArrDeviceType</a></div><div class="ttdef"><b>Definition:</b> builtin.h:582</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a1e15b04fe89f7899e09e528946aa5bb4"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a1e15b04fe89f7899e09e528946aa5bb4">tvm::tir::builtin::fma</a></div><div class="ttdeci">const Op &amp; fma()</div><div class="ttdoc">Fused multiply add. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a8e3504415c78f3f8fd719a21e5280b38"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a8e3504415c78f3f8fd719a21e5280b38">tvm::tir::builtin::call_llvm_pure_intrin</a></div><div class="ttdeci">const Op &amp; call_llvm_pure_intrin()</div><div class="ttdoc">Call an LLVM pure intrinsic with a given intrinsic id and signature from the types of args in the run...</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ab4a648f6e7451af295688f243a215cd7"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ab4a648f6e7451af295688f243a215cd7">tvm::tir::builtin::atomic_add</a></div><div class="ttdeci">const Op &amp; atomic_add()</div><div class="ttdoc">atomic add instruction, corresponding e.g. to atomicAdd in CUDA </div></div>
@@ -107,27 +108,27 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ae0470bd69bb03047aae4cb52e1e6e337"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ae0470bd69bb03047aae4cb52e1e6e337">tvm::tir::builtin::tvm_warp_shuffle_up</a></div><div class="ttdeci">const Op &amp; tvm_warp_shuffle_up()</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ac54288cc9f1fee8c26db9bd87ac320ee"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ac54288cc9f1fee8c26db9bd87ac320ee">tvm::tir::builtin::tvm_call_trace_packed</a></div><div class="ttdeci">const Op &amp; tvm_call_trace_packed()</div><div class="ttdoc">See pesudo code. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_afc81da8cbcd7f34ec5e1e80d837ca265"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#afc81da8cbcd7f34ec5e1e80d837ca265">tvm::tir::builtin::tvm_store_matrix_sync</a></div><div class="ttdeci">const Op &amp; tvm_store_matrix_sync()</div><div class="ttdoc">tvm intrinsic for tensor core store operators. </div></div>
-<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1ca96e7b6492b5b174219cf60e19af0857c"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1ca96e7b6492b5b174219cf60e19af0857c">tvm::tir::builtin::kArrStrides</a></div><div class="ttdef"><b>Definition:</b> builtin.h:567</div></div>
+<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1ca96e7b6492b5b174219cf60e19af0857c"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1ca96e7b6492b5b174219cf60e19af0857c">tvm::tir::builtin::kArrStrides</a></div><div class="ttdef"><b>Definition:</b> builtin.h:575</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ac4887bd93ad67619ad290a33e2bdd340"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ac4887bd93ad67619ad290a33e2bdd340">tvm::tir::builtin::call_spirv_pure_glsl450</a></div><div class="ttdeci">const Op &amp; call_spirv_pure_glsl450()</div><div class="ttdoc">Call an SPIRV pure GLSL450 intrinsic. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a6be181be34fba13d129aadc6c9a23f73"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a6be181be34fba13d129aadc6c9a23f73">tvm::tir::builtin::tvm_thread_context</a></div><div class="ttdeci">const Op &amp; tvm_thread_context()</div><div class="ttdoc">See pesudo code Mark the content as thread local context, can get optimized by only call the call onc...</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a700b7018f2c1f1fba8b4e28f264d8bbb"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a700b7018f2c1f1fba8b4e28f264d8bbb">tvm::tir::builtin::address_of</a></div><div class="ttdeci">const Op &amp; address_of()</div><div class="ttdoc">See pesudo code. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a091ef99dc63f6945588dbb81c968ca15"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a091ef99dc63f6945588dbb81c968ca15">tvm::tir::builtin::bitwise_not</a></div><div class="ttdeci">const Op &amp; bitwise_not()</div><div class="ttdoc">Bitwise not operator. </div></div>
-<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1cafdb925cdf50f17a2b96c7ac4faefa1fb"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1cafdb925cdf50f17a2b96c7ac4faefa1fb">tvm::tir::builtin::kArrByteOffset</a></div><div class="ttdef"><b>Definition:</b> builtin.h:572</div></div>
+<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1cafdb925cdf50f17a2b96c7ac4faefa1fb"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1cafdb925cdf50f17a2b96c7ac4faefa1fb">tvm::tir::builtin::kArrByteOffset</a></div><div class="ttdef"><b>Definition:</b> builtin.h:580</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a0c2ebdcec34d7c79dc8480e5dab8547a"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a0c2ebdcec34d7c79dc8480e5dab8547a">tvm::tir::builtin::q_multiply_shift</a></div><div class="ttdeci">const Op &amp; q_multiply_shift()</div><div class="ttdoc">Execute a multiplication between two Q-numbers x and y followed by a right shift s The default roundi...</div></div>
 <div class="ttc" id="ir_2op_8h_html"><div class="ttname"><a href="ir_2op_8h.html">op.h</a></div><div class="ttdoc">Primitive operators(builtin intrinsics) and registry for them. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a7ed64a9fb0a7f575fc63e1e0395e96a6"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a7ed64a9fb0a7f575fc63e1e0395e96a6">tvm::tir::builtin::vectorlow</a></div><div class="ttdeci">const Op &amp; vectorlow()</div><div class="ttdoc">Get the low-level half of the vector. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a0cbd267877168afd5bbea35f0e5d70fe"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a0cbd267877168afd5bbea35f0e5d70fe">tvm::tir::builtin::tvm_mma_sync</a></div><div class="ttdeci">const Op &amp; tvm_mma_sync()</div><div class="ttdoc">tvm intrinsic for tensor core mma_sync operators. </div></div>
-<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1ca344dc1f419339b81024d4d3628083a1e"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1ca344dc1f419339b81024d4d3628083a1e">tvm::tir::builtin::kArrTypeBits</a></div><div class="ttdef"><b>Definition:</b> builtin.h:570</div></div>
+<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1ca344dc1f419339b81024d4d3628083a1e"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1ca344dc1f419339b81024d4d3628083a1e">tvm::tir::builtin::kArrTypeBits</a></div><div class="ttdef"><b>Definition:</b> builtin.h:578</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a7b555bc5cca2f5e7b26c1037bc0001ce"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a7b555bc5cca2f5e7b26c1037bc0001ce">tvm::tir::builtin::reinterpret</a></div><div class="ttdeci">const Op &amp; reinterpret()</div><div class="ttdoc">Reinterpret the value using the target type. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ae2add6e324d391782d367360a68ccf51"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ae2add6e324d391782d367360a68ccf51">tvm::tir::builtin::call_pure_extern</a></div><div class="ttdeci">const Op &amp; call_pure_extern()</div><div class="ttdoc">Call an pure extern C function with given name and signature from the types of args in the runtime en...</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a2172690dd21d7fd50a4fd4d696ea7bb2"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a2172690dd21d7fd50a4fd4d696ea7bb2">tvm::tir::builtin::popcount</a></div><div class="ttdeci">const Op &amp; popcount()</div><div class="ttdoc">Popcount. </div></div>
-<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1ca0c960782c20a4f16cfe203c516760b00"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1ca0c960782c20a4f16cfe203c516760b00">tvm::tir::builtin::kArrTypeLanes</a></div><div class="ttdef"><b>Definition:</b> builtin.h:571</div></div>
+<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1ca0c960782c20a4f16cfe203c516760b00"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1ca0c960782c20a4f16cfe203c516760b00">tvm::tir::builtin::kArrTypeLanes</a></div><div class="ttdef"><b>Definition:</b> builtin.h:579</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a2c13c6e4b2f92e17f357665f9f11736c"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a2c13c6e4b2f92e17f357665f9f11736c">tvm::tir::builtin::tvm_call_packed</a></div><div class="ttdeci">const Op &amp; tvm_call_packed()</div><div class="ttdoc">See pesudo code. </div></div>
 <div class="ttc" id="tir_2expr_8h_html"><div class="ttname"><a href="tir_2expr_8h.html">expr.h</a></div><div class="ttdoc">TIR expressions. </div></div>
-<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1ca4d4a5d54434514fd8b0ce57160059c92"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1ca4d4a5d54434514fd8b0ce57160059c92">tvm::tir::builtin::kArrKindBound_</a></div><div class="ttdef"><b>Definition:</b> builtin.h:575</div></div>
+<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1ca4d4a5d54434514fd8b0ce57160059c92"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1ca4d4a5d54434514fd8b0ce57160059c92">tvm::tir::builtin::kArrKindBound_</a></div><div class="ttdef"><b>Definition:</b> builtin.h:583</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a21d1f0395dca5af4a90cdb42c1d1d154"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a21d1f0395dca5af4a90cdb42c1d1d154">tvm::tir::builtin::likely</a></div><div class="ttdeci">const Op &amp; likely()</div><div class="ttdoc">Marks a condition is likely going to happen. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a23003bd9331efaa58d8420529ea96c0b"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a23003bd9331efaa58d8420529ea96c0b">tvm::tir::builtin::tvm_struct_get</a></div><div class="ttdeci">const Op &amp; tvm_struct_get()</div><div class="ttdoc">See pesudo code. </div></div>
-<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1ca779c07403e11f671e936ec2813ce2304"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1ca779c07403e11f671e936ec2813ce2304">tvm::tir::builtin::kTVMValueContent</a></div><div class="ttdef"><b>Definition:</b> builtin.h:577</div></div>
+<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1ca779c07403e11f671e936ec2813ce2304"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1ca779c07403e11f671e936ec2813ce2304">tvm::tir::builtin::kTVMValueContent</a></div><div class="ttdef"><b>Definition:</b> builtin.h:585</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a28f99e6dd767482765b854ee9fc71f2c"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a28f99e6dd767482765b854ee9fc71f2c">tvm::tir::builtin::tvm_stack_make_array</a></div><div class="ttdeci">const Op &amp; tvm_stack_make_array()</div><div class="ttdoc">Allocate a NDArray(DLTensor) on stack, return the handle. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a543f1fc334d2bc830add972895a03f17"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a543f1fc334d2bc830add972895a03f17">tvm::tir::builtin::prefetch</a></div><div class="ttdeci">const Op &amp; prefetch()</div><div class="ttdoc">Prefetch a cacheline. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a92624d2aa5c435cd7a0ea8efb698a115"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a92624d2aa5c435cd7a0ea8efb698a115">tvm::tir::builtin::tvm_throw_last_error</a></div><div class="ttdeci">const Op &amp; tvm_throw_last_error()</div><div class="ttdoc">See pesudo code. </div></div>
@@ -135,9 +136,9 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a3e84c73dbbcf7f97008ac84c169feae9"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a3e84c73dbbcf7f97008ac84c169feae9">tvm::tir::builtin::tvm_access_ptr</a></div><div class="ttdeci">const Op &amp; tvm_access_ptr()</div><div class="ttdoc">Get head access address with memory access pattern info. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_abd540cb73407771ecfb4f78722ce5a1b"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#abd540cb73407771ecfb4f78722ce5a1b">tvm::tir::builtin::tvm_stack_make_shape</a></div><div class="ttdeci">const Op &amp; tvm_stack_make_shape()</div><div class="ttdoc">Allocate a shape tuple on stack, return the handle. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ae741e67259cd4b844a8934f2e2704adc"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ae741e67259cd4b844a8934f2e2704adc">tvm::tir::builtin::if_then_else</a></div><div class="ttdeci">const Op &amp; if_then_else()</div><div class="ttdoc">Same as select, used for unsafe memory access. </div></div>
-<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1caa73457ed97931251f1762cb319adc858"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1caa73457ed97931251f1762cb319adc858">tvm::tir::builtin::kTVMValueKindBound_</a></div><div class="ttdef"><b>Definition:</b> builtin.h:578</div></div>
+<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1caa73457ed97931251f1762cb319adc858"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1caa73457ed97931251f1762cb319adc858">tvm::tir::builtin::kTVMValueKindBound_</a></div><div class="ttdef"><b>Definition:</b> builtin.h:586</div></div>
 <div class="ttc" id="classtvm_1_1Op_html"><div class="ttname"><a href="classtvm_1_1Op.html">tvm::Op</a></div><div class="ttdoc">Managed reference class to OpNode. </div><div class="ttdef"><b>Definition:</b> op.h:165</div></div>
-<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1ca57f69fd3d141caaa7e2e72fda7d6a1da"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1ca57f69fd3d141caaa7e2e72fda7d6a1da">tvm::tir::builtin::kArrShape</a></div><div class="ttdef"><b>Definition:</b> builtin.h:566</div></div>
+<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1ca57f69fd3d141caaa7e2e72fda7d6a1da"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1ca57f69fd3d141caaa7e2e72fda7d6a1da">tvm::tir::builtin::kArrShape</a></div><div class="ttdef"><b>Definition:</b> builtin.h:574</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_afc4086a245ded9076de226ae802ced32"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#afc4086a245ded9076de226ae802ced32">tvm::tir::builtin::tvm_warp_activemask</a></div><div class="ttdeci">const Op &amp; tvm_warp_activemask()</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a668eaad07b6c46238f2bf758e61b58a5"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a668eaad07b6c46238f2bf758e61b58a5">tvm::tir::builtin::call_extern</a></div><div class="ttdeci">const Op &amp; call_extern()</div><div class="ttdoc">Call an extern C function with given name and signature from the types of args in the runtime environ...</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_af103ae0715d4ebcbaccd49d2b6a12afe"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#af103ae0715d4ebcbaccd49d2b6a12afe">tvm::tir::builtin::shift_right</a></div><div class="ttdeci">const Op &amp; shift_right()</div><div class="ttdoc">Right shift. </div></div>
@@ -146,7 +147,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a8d5e173f1a16740172a9ad6f2aa85a08"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a8d5e173f1a16740172a9ad6f2aa85a08">tvm::tir::builtin::tvm_bmma_sync</a></div><div class="ttdeci">const Op &amp; tvm_bmma_sync()</div><div class="ttdoc">tvm intrinsic for tensor core bmma_sync operators. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a0cd2ac37b80c498ded412572146ecc67"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a0cd2ac37b80c498ded412572146ecc67">tvm::tir::builtin::bitwise_xor</a></div><div class="ttdeci">const Op &amp; bitwise_xor()</div><div class="ttdoc">Bitwise xor operator. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a6aeb24a28d19cdc60e4e1fa7b420d7fd"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a6aeb24a28d19cdc60e4e1fa7b420d7fd">tvm::tir::builtin::tvm_static_handle</a></div><div class="ttdeci">const Op &amp; tvm_static_handle()</div><div class="ttdoc">Create a function local static handle that iniitalizes to nullptr. can be used to cache function loca...</div></div>
-<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1c"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1c">tvm::tir::builtin::TVMStructFieldKind</a></div><div class="ttdeci">TVMStructFieldKind</div><div class="ttdoc">The kind of structure field info used in intrinsic. </div><div class="ttdef"><b>Definition:</b> builtin.h:562</div></div>
+<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1c"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1c">tvm::tir::builtin::TVMStructFieldKind</a></div><div class="ttdeci">TVMStructFieldKind</div><div class="ttdoc">The kind of structure field info used in intrinsic. </div><div class="ttdef"><b>Definition:</b> builtin.h:570</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a0e633f53c50e14d7e2fc07636a223309"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a0e633f53c50e14d7e2fc07636a223309">tvm::tir::builtin::bitwise_and</a></div><div class="ttdeci">const Op &amp; bitwise_and()</div><div class="ttdoc">Bitwise and operator. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a6f53be295396c301082696ca0c113501"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a6f53be295396c301082696ca0c113501">tvm::tir::builtin::isnan</a></div><div class="ttdeci">const Op &amp; isnan()</div><div class="ttdoc">Check if value is nan. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_af6d1c48570e10287683d58f22e4de98f"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#af6d1c48570e10287683d58f22e4de98f">tvm::tir::builtin::tvm_warp_shuffle_down</a></div><div class="ttdeci">const Op &amp; tvm_warp_shuffle_down()</div></div>
@@ -157,9 +158,9 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a0117a4a76af962576a6a3bbf32f97b36"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a0117a4a76af962576a6a3bbf32f97b36">tvm::tir::builtin::tvm_call_packed_lowered</a></div><div class="ttdeci">const Op &amp; tvm_call_packed_lowered()</div><div class="ttdoc">Lowered version of call packed, the space of value and type codes are explicitly allocated. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a322ae63444ed4e5fcf7247aa93f8bb7c"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a322ae63444ed4e5fcf7247aa93f8bb7c">tvm::tir::builtin::large_uint_imm</a></div><div class="ttdeci">const Op &amp; large_uint_imm()</div><div class="ttdoc">See pesudo code. </div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a21c2ad8b095dcbefa786394981ea0b71"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a21c2ad8b095dcbefa786394981ea0b71">tvm::tir::builtin::tvm_context_id</a></div><div class="ttdeci">const Op &amp; tvm_context_id()</div><div class="ttdoc">Return a unique context id, used for hint of workspace separation. Different context id ganrantees no...</div></div>
-<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1ca9076fb1a58386bac2e0f1fdae9cab844"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1ca9076fb1a58386bac2e0f1fdae9cab844">tvm::tir::builtin::kArrData</a></div><div class="ttdef"><b>Definition:</b> builtin.h:565</div></div>
+<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1ca9076fb1a58386bac2e0f1fdae9cab844"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1ca9076fb1a58386bac2e0f1fdae9cab844">tvm::tir::builtin::kArrData</a></div><div class="ttdef"><b>Definition:</b> builtin.h:573</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_a26472adf05d821f1929cfbc02bc3c231"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#a26472adf05d821f1929cfbc02bc3c231">tvm::tir::builtin::shift_left</a></div><div class="ttdeci">const Op &amp; shift_left()</div><div class="ttdoc">Left shift. </div></div>
-<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1ca5ce842cabb26975681dd561c5132af1b"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1ca5ce842cabb26975681dd561c5132af1b">tvm::tir::builtin::kArrTypeCode</a></div><div class="ttdef"><b>Definition:</b> builtin.h:569</div></div>
+<div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_ad3b90c881b67ebe8e8fe68f14143bb1ca5ce842cabb26975681dd561c5132af1b"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#ad3b90c881b67ebe8e8fe68f14143bb1ca5ce842cabb26975681dd561c5132af1b">tvm::tir::builtin::kArrTypeCode</a></div><div class="ttdef"><b>Definition:</b> builtin.h:577</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_aa6e23eac98abb8378b9837011a5c04b5"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#aa6e23eac98abb8378b9837011a5c04b5">tvm::tir::builtin::tvm_call_trace_packed_lowered</a></div><div class="ttdeci">const Op &amp; tvm_call_trace_packed_lowered()</div><div class="ttdoc">Lowered version of trace intrinsic, the space of value and type codes are explicitly allocated...</div></div>
 <div class="ttc" id="namespacetvm_1_1tir_1_1builtin_html_aa5b0e90771b35d78b6c07c0054abe023"><div class="ttname"><a href="namespacetvm_1_1tir_1_1builtin.html#aa5b0e90771b35d78b6c07c0054abe023">tvm::tir::builtin::isnullptr</a></div><div class="ttdeci">const Op &amp; isnullptr()</div><div class="ttdoc">See pesudo code. </div></div>
 </div><!-- fragment --></div><!-- contents -->
diff --git a/docs/api/doxygen/bytecode_8h.html b/docs/api/doxygen/bytecode_8h.html
index 3d19049..00dd87e 100644
--- a/docs/api/doxygen/bytecode_8h.html
+++ b/docs/api/doxygen/bytecode_8h.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/bytecode_8h_source.html b/docs/api/doxygen/bytecode_8h_source.html
index 20320af..49802c8 100644
--- a/docs/api/doxygen/bytecode_8h_source.html
+++ b/docs/api/doxygen/bytecode_8h_source.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
@@ -93,7 +94,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <div class="ttc" id="structtvm_1_1runtime_1_1vm_1_1Instruction_html_adbf563aa259f209ad20619c25921cdc1"><div class="ttname"><a href="structtvm_1_1runtime_1_1vm_1_1Instruction.html#adbf563aa259f209ad20619c25921cdc1">tvm::runtime::vm::Instruction::num_freevar</a></div><div class="ttdeci">Index num_freevar</div><div class="ttdoc">The number of free variables to capture. </div><div class="ttdef"><b>Definition:</b> bytecode.h:190</div></div>
 <div class="ttc" id="structtvm_1_1runtime_1_1vm_1_1Instruction_html_aaea1385d3562231a2e242faa0b51c616"><div class="ttname"><a href="structtvm_1_1runtime_1_1vm_1_1Instruction.html#aaea1385d3562231a2e242faa0b51c616">tvm::runtime::vm::Instruction::clo_index</a></div><div class="ttdeci">Index clo_index</div><div class="ttdoc">The index into the function table. </div><div class="ttdef"><b>Definition:</b> bytecode.h:188</div></div>
 <div class="ttc" id="structtvm_1_1runtime_1_1vm_1_1Instruction_html_ac60e60c96d7bb22d8ac06169d5969232"><div class="ttname"><a href="structtvm_1_1runtime_1_1vm_1_1Instruction.html#ac60e60c96d7bb22d8ac06169d5969232">tvm::runtime::vm::Instruction::packed_index</a></div><div class="ttdeci">Index packed_index</div><div class="ttdoc">The index into the packed function table. </div><div class="ttdef"><b>Definition:</b> bytecode.h:130</div></div>
-<div class="ttc" id="namespacetvm_1_1tir_1_1attr_html_a7e4e7cd47471a9089022214d63d24206"><div class="ttname"><a href="namespacetvm_1_1tir_1_1attr.html#a7e4e7cd47471a9089022214d63d24206">tvm::tir::attr::device_type</a></div><div class="ttdeci">constexpr const char * device_type</div><div class="ttdoc">The device type. </div><div class="ttdef"><b>Definition:</b> stmt.h:1237</div></div>
+<div class="ttc" id="namespacetvm_1_1tir_1_1attr_html_a7e4e7cd47471a9089022214d63d24206"><div class="ttname"><a href="namespacetvm_1_1tir_1_1attr.html#a7e4e7cd47471a9089022214d63d24206">tvm::tir::attr::device_type</a></div><div class="ttdeci">constexpr const char * device_type</div><div class="ttdoc">The device type. </div><div class="ttdef"><b>Definition:</b> stmt.h:1247</div></div>
 <div class="ttc" id="namespacetvm_1_1runtime_1_1vm_html_a8d8d95ce8d629c7213f2f595917870ecac8086b44868c71384cfec25bf1f1a6e6"><div class="ttname"><a href="namespacetvm_1_1runtime_1_1vm.html#a8d8d95ce8d629c7213f2f595917870ecac8086b44868c71384cfec25bf1f1a6e6">tvm::runtime::vm::Opcode::AllocTensorReg</a></div></div>
 <div class="ttc" id="namespacetvm_1_1runtime_1_1vm_html_a8d8d95ce8d629c7213f2f595917870eca9558cee150d88040130f7b86d8d5dc58"><div class="ttname"><a href="namespacetvm_1_1runtime_1_1vm.html#a8d8d95ce8d629c7213f2f595917870eca9558cee150d88040130f7b86d8d5dc58">tvm::runtime::vm::Opcode::LoadConsti</a></div></div>
 <div class="ttc" id="structtvm_1_1runtime_1_1vm_1_1Instruction_html_ae0d33229af059c727db2abd3616660e0"><div class="ttname"><a href="structtvm_1_1runtime_1_1vm_1_1Instruction.html#ae0d33229af059c727db2abd3616660e0">tvm::runtime::vm::Instruction::result</a></div><div class="ttdeci">RegName result</div><div class="ttdoc">The register to return. </div><div class="ttdef"><b>Definition:</b> bytecode.h:122</div></div>
@@ -131,7 +132,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <div class="ttc" id="structtvm_1_1runtime_1_1vm_1_1Instruction_html_a46879dbe84105fb621a6167f8d73b223"><div class="ttname"><a href="structtvm_1_1runtime_1_1vm_1_1Instruction.html#a46879dbe84105fb621a6167f8d73b223">tvm::runtime::vm::Instruction::target</a></div><div class="ttdeci">RegName target</div><div class="ttdoc">The register containing the target value. </div><div class="ttdef"><b>Definition:</b> bytecode.h:142</div></div>
 <div class="ttc" id="structtvm_1_1runtime_1_1vm_1_1Instruction_html_a360b264ed892e620935b648e5a91a5ea"><div class="ttname"><a href="structtvm_1_1runtime_1_1vm_1_1Instruction.html#a360b264ed892e620935b648e5a91a5ea">tvm::runtime::vm::Instruction::arity</a></div><div class="ttdeci">Index arity</div><div class="ttdoc">The arity of the packed function. </div><div class="ttdef"><b>Definition:</b> bytecode.h:132</div></div>
 <div class="ttc" id="structtvm_1_1runtime_1_1vm_1_1Instruction_html_a3a175836bc0893d99935f32911e45bfd"><div class="ttname"><a href="structtvm_1_1runtime_1_1vm_1_1Instruction.html#a3a175836bc0893d99935f32911e45bfd">tvm::runtime::vm::Instruction::closure</a></div><div class="ttdeci">RegName closure</div><div class="ttdoc">The register containing the closure. </div><div class="ttdef"><b>Definition:</b> bytecode.h:114</div></div>
-<div class="ttc" id="namespacetvm_1_1topi_html_af30c02f3a3f37c7963b3af60fb9c72a1"><div class="ttname"><a href="namespacetvm_1_1topi.html#af30c02f3a3f37c7963b3af60fb9c72a1">tvm::topi::shape</a></div><div class="ttdeci">Tensor shape(const Tensor &amp;src, DataType dtype, const std::string name=&quot;T_shape&quot;, const std::string tag=kInjective)</div><div class="ttdoc">Get the shape of input tensor. </div><div class="ttdef"><b>Definition:</b> transform.h:1508</div></div>
+<div class="ttc" id="namespacetvm_1_1topi_html_af30c02f3a3f37c7963b3af60fb9c72a1"><div class="ttname"><a href="namespacetvm_1_1topi.html#af30c02f3a3f37c7963b3af60fb9c72a1">tvm::topi::shape</a></div><div class="ttdeci">Tensor shape(const Tensor &amp;src, DataType dtype, const std::string name=&quot;T_shape&quot;, const std::string tag=kInjective)</div><div class="ttdoc">Get the shape of input tensor. </div><div class="ttdef"><b>Definition:</b> transform.h:1564</div></div>
 <div class="ttc" id="structtvm_1_1runtime_1_1vm_1_1Instruction_html_a3412cabd3b4f42f106f56fc22257f6ca"><div class="ttname"><a href="structtvm_1_1runtime_1_1vm_1_1Instruction.html#a3412cabd3b4f42f106f56fc22257f6ca">tvm::runtime::vm::Instruction::storage</a></div><div class="ttdeci">RegName storage</div><div class="ttdoc">The storage to allocate from. </div><div class="ttdef"><b>Definition:</b> bytecode.h:92</div></div>
 <div class="ttc" id="namespacetvm_1_1runtime_1_1vm_html_a3597867d2db714bf760876a23d6b7d3d"><div class="ttname"><a href="namespacetvm_1_1runtime_1_1vm.html#a3597867d2db714bf760876a23d6b7d3d">tvm::runtime::vm::Index</a></div><div class="ttdeci">int64_t Index</div><div class="ttdoc">An alias for the integer type used ubiquitously in the VM. </div><div class="ttdef"><b>Definition:</b> bytecode.h:43</div></div>
 <div class="ttc" id="structtvm_1_1runtime_1_1vm_1_1Instruction_html_a342c8c66b1886eb1fa0ca8a5b23f92aa"><div class="ttname"><a href="structtvm_1_1runtime_1_1vm_1_1Instruction.html#a342c8c66b1886eb1fa0ca8a5b23f92aa">tvm::runtime::vm::Instruction::device_type</a></div><div class="ttdeci">Index device_type</div><div class="ttdoc">The device type of the allocation. </div><div class="ttdef"><b>Definition:</b> bytecode.h:202</div></div>
diff --git a/docs/api/doxygen/c__backend__api_8h.html b/docs/api/doxygen/c__backend__api_8h.html
index b673a24..c79ca20 100644
--- a/docs/api/doxygen/c__backend__api_8h.html
+++ b/docs/api/doxygen/c__backend__api_8h.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/c__backend__api_8h_source.html b/docs/api/doxygen/c__backend__api_8h_source.html
index 6bbc1c6..414c357 100644
--- a/docs/api/doxygen/c__backend__api_8h_source.html
+++ b/docs/api/doxygen/c__backend__api_8h_source.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
@@ -89,13 +90,13 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <div class="title">c_backend_api.h</div>  </div>
 </div><!--header-->
 <div class="contents">
-<a href="c__backend__api_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a name="l00001"></a><span class="lineno">    1</span>&#160;<span class="comment">/*</span></div><div class="line"><a name="l00002"></a><span class="lineno">    2</span>&#160;<span class="comment"> * Licensed to the Apache Software Foundation (ASF) under one</span></div><div class="line"><a name="l00003"></a><span class="lineno">    3</span>&#160;<span class="comment"> * or  [...]
+<a href="c__backend__api_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a name="l00001"></a><span class="lineno">    1</span>&#160;<span class="comment">/*</span></div><div class="line"><a name="l00002"></a><span class="lineno">    2</span>&#160;<span class="comment"> * Licensed to the Apache Software Foundation (ASF) under one</span></div><div class="line"><a name="l00003"></a><span class="lineno">    3</span>&#160;<span class="comment"> * or  [...]
 <div class="ttc" id="c__backend__api_8h_html_a56a654a9aeba2f1ccf3e10918dd88ec5"><div class="ttname"><a href="c__backend__api_8h.html#a56a654a9aeba2f1ccf3e10918dd88ec5">TVMBackendGetFuncFromEnv</a></div><div class="ttdeci">int TVMBackendGetFuncFromEnv(void *mod_node, const char *func_name, TVMFunctionHandle *out)</div><div class="ttdoc">Backend function for modules to get function from its environment mod_node (its imports and global fu...</div></div>
 <div class="ttc" id="c__backend__api_8h_html_ae70bd3ee026eb55b438ada05b08f0ce8"><div class="ttname"><a href="c__backend__api_8h.html#ae70bd3ee026eb55b438ada05b08f0ce8">TVMBackendRunOnce</a></div><div class="ttdeci">int TVMBackendRunOnce(void **handle, int(*f)(void *), void *cdata, int nbytes)</div><div class="ttdoc">Simple static initialization function. Run f once and set handle to be not null. This function is mai...</div></div>
 <div class="ttc" id="c__backend__api_8h_html_a07eaf7d1b748d99aa7715c7adbdea231"><div class="ttname"><a href="c__backend__api_8h.html#a07eaf7d1b748d99aa7715c7adbdea231">TVMBackendAllocWorkspace</a></div><div class="ttdeci">void * TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t nbytes, int dtype_code_hint, int dtype_bits_hint)</div><div class="ttdoc">Backend function to allocate temporal workspace. </div></div>
 <div class="ttc" id="unionTVMValue_html"><div class="ttname"><a href="unionTVMValue.html">TVMValue</a></div><div class="ttdoc">Union type of values being passed through API and function calls. </div><div class="ttdef"><b>Definition:</b> c_runtime_api.h:139</div></div>
 <div class="ttc" id="structTVMParallelGroupEnv_html"><div class="ttname"><a href="structTVMParallelGroupEnv.html">TVMParallelGroupEnv</a></div><div class="ttdoc">Environment for TVM parallel task. </div><div class="ttdef"><b>Definition:</b> c_backend_api.h:105</div></div>
-<div class="ttc" id="namespacetvm_1_1tir_1_1attr_html_a61b1ef1047fb722a4e5ec2167c9963d7"><div class="ttname"><a href="namespacetvm_1_1tir_1_1attr.html#a61b1ef1047fb722a4e5ec2167c9963d7">tvm::tir::attr::device_id</a></div><div class="ttdeci">constexpr const char * device_id</div><div class="ttdoc">The allocation device for global malloc in host. </div><div class="ttdef"><b>Definition:</b> stmt.h:1235</div></div>
+<div class="ttc" id="namespacetvm_1_1tir_1_1attr_html_a61b1ef1047fb722a4e5ec2167c9963d7"><div class="ttname"><a href="namespacetvm_1_1tir_1_1attr.html#a61b1ef1047fb722a4e5ec2167c9963d7">tvm::tir::attr::device_id</a></div><div class="ttdeci">constexpr const char * device_id</div><div class="ttdoc">The allocation device for global malloc in host. </div><div class="ttdef"><b>Definition:</b> stmt.h:1245</div></div>
 <div class="ttc" id="c__runtime__api_8h_html_acf57d257a6e0841d84ebbd2a339d183e"><div class="ttname"><a href="c__runtime__api_8h.html#acf57d257a6e0841d84ebbd2a339d183e">TVMFunctionHandle</a></div><div class="ttdeci">void * TVMFunctionHandle</div><div class="ttdoc">Handle to packed function handle. </div><div class="ttdef"><b>Definition:</b> c_runtime_api.h:160</div></div>
 <div class="ttc" id="c__backend__api_8h_html_a6ff8662943d0f003d55d9046cd24daf8"><div class="ttname"><a href="c__backend__api_8h.html#a6ff8662943d0f003d55d9046cd24daf8">TVMBackendParallelLaunch</a></div><div class="ttdeci">int TVMBackendParallelLaunch(FTVMParallelLambda flambda, void *cdata, int num_task)</div><div class="ttdoc">Backend function for running parallel jobs. </div></div>
 <div class="ttc" id="c__backend__api_8h_html_a0c57deb5acb9338ec778d91bd6e42191"><div class="ttname"><a href="c__backend__api_8h.html#a0c57deb5acb9338ec778d91bd6e42191">TVMBackendRegisterSystemLibSymbol</a></div><div class="ttdeci">int TVMBackendRegisterSystemLibSymbol(const char *name, void *ptr)</div><div class="ttdoc">Backend function to register system-wide library symbol. </div></div>
diff --git a/docs/api/doxygen/c__runtime__api_8h.html b/docs/api/doxygen/c__runtime__api_8h.html
index 2dab0ef..4754832 100644
--- a/docs/api/doxygen/c__runtime__api_8h.html
+++ b/docs/api/doxygen/c__runtime__api_8h.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/c__runtime__api_8h_source.html b/docs/api/doxygen/c__runtime__api_8h_source.html
index 61f5899..b91cf79 100644
--- a/docs/api/doxygen/c__runtime__api_8h_source.html
+++ b/docs/api/doxygen/c__runtime__api_8h_source.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li><a href="annotated.html"><span>Classes</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
@@ -93,7 +94,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <div class="ttc" id="unionTVMValue_html_aa1c40fa9e74fbf97541fd9735062c4cc"><div class="ttname"><a href="unionTVMValue.html#aa1c40fa9e74fbf97541fd9735062c4cc">TVMValue::v_int64</a></div><div class="ttdeci">int64_t v_int64</div><div class="ttdef"><b>Definition:</b> c_runtime_api.h:140</div></div>
 <div class="ttc" id="c__runtime__api_8h_html_a07954203342f2787acf988c4c351d9c3"><div class="ttname"><a href="c__runtime__api_8h.html#a07954203342f2787acf988c4c351d9c3">TVMPackedCFunc</a></div><div class="ttdeci">int(* TVMPackedCFunc)(TVMValue *args, int *type_codes, int num_args, TVMRetValueHandle ret, void *resource_handle)</div><div class="ttdoc">C type of packed function. </div><div class="ttdef"><b>Definition:</b> c_runtime_api.h:300</div></div>
 <div class="ttc" id="c__runtime__api_8h_html_a477111f9accd70633dc5f2e7139b6cf4"><div class="ttname"><a href="c__runtime__api_8h.html#a477111f9accd70633dc5f2e7139b6cf4">TVMAPISetLastError</a></div><div class="ttdeci">void TVMAPISetLastError(const char *msg)</div><div class="ttdoc">Used for implementing C API function. Set last error message before return. </div></div>
-<div class="ttc" id="namespacetvm_1_1tir_1_1attr_html_a7e4e7cd47471a9089022214d63d24206"><div class="ttname"><a href="namespacetvm_1_1tir_1_1attr.html#a7e4e7cd47471a9089022214d63d24206">tvm::tir::attr::device_type</a></div><div class="ttdeci">constexpr const char * device_type</div><div class="ttdoc">The device type. </div><div class="ttdef"><b>Definition:</b> stmt.h:1237</div></div>
+<div class="ttc" id="namespacetvm_1_1tir_1_1attr_html_a7e4e7cd47471a9089022214d63d24206"><div class="ttname"><a href="namespacetvm_1_1tir_1_1attr.html#a7e4e7cd47471a9089022214d63d24206">tvm::tir::attr::device_type</a></div><div class="ttdeci">constexpr const char * device_type</div><div class="ttdoc">The device type. </div><div class="ttdef"><b>Definition:</b> stmt.h:1247</div></div>
 <div class="ttc" id="c__runtime__api_8h_html_af420f0d74004615c03bb40270ad5d489"><div class="ttname"><a href="c__runtime__api_8h.html#af420f0d74004615c03bb40270ad5d489">TVMFuncFree</a></div><div class="ttdeci">int TVMFuncFree(TVMFunctionHandle func)</div><div class="ttdoc">Free the function when it is no longer needed. </div></div>
 <div class="ttc" id="c__runtime__api_8h_html_a591e48e52098965e235657dab56bc096"><div class="ttname"><a href="c__runtime__api_8h.html#a591e48e52098965e235657dab56bc096">TVMArrayFree</a></div><div class="ttdeci">int TVMArrayFree(TVMArrayHandle handle)</div><div class="ttdoc">Free the TVM Array. </div></div>
 <div class="ttc" id="c__runtime__api_8h_html_a57cbccb14c35a0e62dbc1b911188fcefacdc33f5efa9ddabe89e886c28d1ff65b"><div class="ttname"><a href="c__runtime__api_8h.html#a57cbccb14c35a0e62dbc1b911188fcefacdc33f5efa9ddabe89e886c28d1ff65b">kDLSDAccel</a></div><div class="ttdef"><b>Definition:</b> c_runtime_api.h:81</div></div>
@@ -125,7 +126,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <div class="ttc" id="c__runtime__api_8h_html_a8a01e20ffd5784465df0759c950154b5"><div class="ttname"><a href="c__runtime__api_8h.html#a8a01e20ffd5784465df0759c950154b5">TVMFuncCall</a></div><div class="ttdeci">int TVMFuncCall(TVMFunctionHandle func, TVMValue *arg_values, int *type_codes, int num_args, TVMValue *ret_val, int *ret_type_code)</div><div class="ttdoc">Call a Packed TVM Function. </div></div>
 <div class="ttc" id="c__runtime__api_8h_html_aa91f776ed41a36790409f78725f81419"><div class="ttname"><a href="c__runtime__api_8h.html#aa91f776ed41a36790409f78725f81419">TVMStreamStreamSynchronize</a></div><div class="ttdeci">int TVMStreamStreamSynchronize(int device_type, int device_id, TVMStreamHandle src, TVMStreamHandle dst)</div><div class="ttdoc">Synchronize two streams of execution. </div></div>
 <div class="ttc" id="c__runtime__api_8h_html_a6fee438ab81fbf4235375d9f17f36d6f"><div class="ttname"><a href="c__runtime__api_8h.html#a6fee438ab81fbf4235375d9f17f36d6f">TVMCbArgToReturn</a></div><div class="ttdeci">int TVMCbArgToReturn(TVMValue *value, int *code)</div><div class="ttdoc">Inplace translate callback argument value to return value. This is only needed for non-POD arguments...</div></div>
-<div class="ttc" id="namespacetvm_1_1tir_1_1attr_html_a61b1ef1047fb722a4e5ec2167c9963d7"><div class="ttname"><a href="namespacetvm_1_1tir_1_1attr.html#a61b1ef1047fb722a4e5ec2167c9963d7">tvm::tir::attr::device_id</a></div><div class="ttdeci">constexpr const char * device_id</div><div class="ttdoc">The allocation device for global malloc in host. </div><div class="ttdef"><b>Definition:</b> stmt.h:1235</div></div>
+<div class="ttc" id="namespacetvm_1_1tir_1_1attr_html_a61b1ef1047fb722a4e5ec2167c9963d7"><div class="ttname"><a href="namespacetvm_1_1tir_1_1attr.html#a61b1ef1047fb722a4e5ec2167c9963d7">tvm::tir::attr::device_id</a></div><div class="ttdeci">constexpr const char * device_id</div><div class="ttdoc">The allocation device for global malloc in host. </div><div class="ttdef"><b>Definition:</b> stmt.h:1245</div></div>
 <div class="ttc" id="c__runtime__api_8h_html_ab98d6b66089da8b33b662ccdb243b26b"><div class="ttname"><a href="c__runtime__api_8h.html#ab98d6b66089da8b33b662ccdb243b26b">TVMFuncRemoveGlobal</a></div><div class="ttdeci">int TVMFuncRemoveGlobal(const char *name)</div><div class="ttdoc">Remove a global function. </div></div>
 <div class="ttc" id="c__runtime__api_8h_html_a57cbccb14c35a0e62dbc1b911188fcefaad7f2a6159af2f349840446d850f1402"><div class="ttname"><a href="c__runtime__api_8h.html#a57cbccb14c35a0e62dbc1b911188fcefaad7f2a6159af2f349840446d850f1402">kDLHexagon</a></div><div class="ttdef"><b>Definition:</b> c_runtime_api.h:84</div></div>
 <div class="ttc" id="c__runtime__api_8h_html_ad3bd42da244a0e32ac82d7428e01a010"><div class="ttname"><a href="c__runtime__api_8h.html#ad3bd42da244a0e32ac82d7428e01a010">TVMFuncGetGlobal</a></div><div class="ttdeci">int TVMFuncGetGlobal(const char *name, TVMFunctionHandle *out)</div><div class="ttdoc">Get a global function. </div></div>
@@ -149,7 +150,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
 <div class="ttc" id="c__runtime__api_8h_html_ae690840d1af9c7b0fe5b9b457456f60d"><div class="ttname"><a href="c__runtime__api_8h.html#ae690840d1af9c7b0fe5b9b457456f60d">TVMArrayToDLPack</a></div><div class="ttdeci">int TVMArrayToDLPack(TVMArrayHandle from, DLManagedTensor **out)</div><div class="ttdoc">Produce a DLMangedTensor from the array that shares data memory with the array. </div></div>
 <div class="ttc" id="c__runtime__api_8h_html_a6cd1076476117e74454f67931c2da1d4"><div class="ttname"><a href="c__runtime__api_8h.html#a6cd1076476117e74454f67931c2da1d4">TVMRetValueHandle</a></div><div class="ttdeci">void * TVMRetValueHandle</div><div class="ttdoc">Handle to hold return value. </div><div class="ttdef"><b>Definition:</b> c_runtime_api.h:162</div></div>
 <div class="ttc" id="c__runtime__api_8h_html_acf57d257a6e0841d84ebbd2a339d183e"><div class="ttname"><a href="c__runtime__api_8h.html#acf57d257a6e0841d84ebbd2a339d183e">TVMFunctionHandle</a></div><div class="ttdeci">void * TVMFunctionHandle</div><div class="ttdoc">Handle to packed function handle. </div><div class="ttdef"><b>Definition:</b> c_runtime_api.h:160</div></div>
-<div class="ttc" id="namespacetvm_1_1topi_html_af30c02f3a3f37c7963b3af60fb9c72a1"><div class="ttname"><a href="namespacetvm_1_1topi.html#af30c02f3a3f37c7963b3af60fb9c72a1">tvm::topi::shape</a></div><div class="ttdeci">Tensor shape(const Tensor &amp;src, DataType dtype, const std::string name=&quot;T_shape&quot;, const std::string tag=kInjective)</div><div class="ttdoc">Get the shape of input tensor. </div><div class="ttdef"><b>Definition:</b> transform.h:1508</div></div>
+<div class="ttc" id="namespacetvm_1_1topi_html_af30c02f3a3f37c7963b3af60fb9c72a1"><div class="ttname"><a href="namespacetvm_1_1topi.html#af30c02f3a3f37c7963b3af60fb9c72a1">tvm::topi::shape</a></div><div class="ttdeci">Tensor shape(const Tensor &amp;src, DataType dtype, const std::string name=&quot;T_shape&quot;, const std::string tag=kInjective)</div><div class="ttdoc">Get the shape of input tensor. </div><div class="ttdef"><b>Definition:</b> transform.h:1564</div></div>
 <div class="ttc" id="c__runtime__api_8h_html_ace8007daffd9f2c6d954c24d870bfcc4"><div class="ttname"><a href="c__runtime__api_8h.html#ace8007daffd9f2c6d954c24d870bfcc4">tvm_index_t</a></div><div class="ttdeci">int64_t tvm_index_t</div><div class="ttdoc">type of array index. </div><div class="ttdef"><b>Definition:</b> c_runtime_api.h:76</div></div>
 <div class="ttc" id="c__runtime__api_8h_html_a190e81769e805cca153514137a66e793a9387f774bc8453afe4aa4cd17789a405"><div class="ttname"><a href="c__runtime__api_8h.html#a190e81769e805cca153514137a66e793a9387f774bc8453afe4aa4cd17789a405">kTVMOpaqueHandle</a></div><div class="ttdef"><b>Definition:</b> c_runtime_api.h:108</div></div>
 <div class="ttc" id="c__runtime__api_8h_html_ae899a6a211b7913e92420a01b804db64"><div class="ttname"><a href="c__runtime__api_8h.html#ae899a6a211b7913e92420a01b804db64">TVMObjectRetain</a></div><div class="ttdeci">int TVMObjectRetain(TVMObjectHandle obj)</div><div class="ttdoc">Increase the reference count of an object. </div></div>
diff --git a/docs/api/doxygen/classes.html b/docs/api/doxygen/classes.html
index 483d128..5554972 100644
--- a/docs/api/doxygen/classes.html
+++ b/docs/api/doxygen/classes.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1AttrFieldInfo-members.html b/docs/api/doxygen/classtvm_1_1AttrFieldInfo-members.html
index c9edef3..38cb6f2 100644
--- a/docs/api/doxygen/classtvm_1_1AttrFieldInfo-members.html
+++ b/docs/api/doxygen/classtvm_1_1AttrFieldInfo-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1AttrFieldInfo.html b/docs/api/doxygen/classtvm_1_1AttrFieldInfo.html
index 090a511..c03c738 100644
--- a/docs/api/doxygen/classtvm_1_1AttrFieldInfo.html
+++ b/docs/api/doxygen/classtvm_1_1AttrFieldInfo.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1AttrFieldInfoNode-members.html b/docs/api/doxygen/classtvm_1_1AttrFieldInfoNode-members.html
index fe76174..cf71d31 100644
--- a/docs/api/doxygen/classtvm_1_1AttrFieldInfoNode-members.html
+++ b/docs/api/doxygen/classtvm_1_1AttrFieldInfoNode-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1AttrFieldInfoNode.html b/docs/api/doxygen/classtvm_1_1AttrFieldInfoNode.html
index f986f42..7f20559 100644
--- a/docs/api/doxygen/classtvm_1_1AttrFieldInfoNode.html
+++ b/docs/api/doxygen/classtvm_1_1AttrFieldInfoNode.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1AttrRegistryMap-members.html b/docs/api/doxygen/classtvm_1_1AttrRegistryMap-members.html
index 2c4c01a..3159303 100644
--- a/docs/api/doxygen/classtvm_1_1AttrRegistryMap-members.html
+++ b/docs/api/doxygen/classtvm_1_1AttrRegistryMap-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1AttrRegistryMap.html b/docs/api/doxygen/classtvm_1_1AttrRegistryMap.html
index c1c9caa..233a751 100644
--- a/docs/api/doxygen/classtvm_1_1AttrRegistryMap.html
+++ b/docs/api/doxygen/classtvm_1_1AttrRegistryMap.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1AttrRegistryMapContainerMap-members.html b/docs/api/doxygen/classtvm_1_1AttrRegistryMapContainerMap-members.html
index b730bc0..541f9ff 100644
--- a/docs/api/doxygen/classtvm_1_1AttrRegistryMapContainerMap-members.html
+++ b/docs/api/doxygen/classtvm_1_1AttrRegistryMapContainerMap-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1AttrRegistryMapContainerMap.html b/docs/api/doxygen/classtvm_1_1AttrRegistryMapContainerMap.html
index 6717b4b..6c0367f 100644
--- a/docs/api/doxygen/classtvm_1_1AttrRegistryMapContainerMap.html
+++ b/docs/api/doxygen/classtvm_1_1AttrRegistryMapContainerMap.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1AttrVisitor.html b/docs/api/doxygen/classtvm_1_1AttrVisitor.html
index 6349c97..036d64a 100644
--- a/docs/api/doxygen/classtvm_1_1AttrVisitor.html
+++ b/docs/api/doxygen/classtvm_1_1AttrVisitor.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1Attrs-members.html b/docs/api/doxygen/classtvm_1_1Attrs-members.html
index 7294c93..7ed4b2e 100644
--- a/docs/api/doxygen/classtvm_1_1Attrs-members.html
+++ b/docs/api/doxygen/classtvm_1_1Attrs-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1Attrs.html b/docs/api/doxygen/classtvm_1_1Attrs.html
index f409ff1..2b97105 100644
--- a/docs/api/doxygen/classtvm_1_1Attrs.html
+++ b/docs/api/doxygen/classtvm_1_1Attrs.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1AttrsNode-members.html b/docs/api/doxygen/classtvm_1_1AttrsNode-members.html
index 9a0759d..bfba2a6 100644
--- a/docs/api/doxygen/classtvm_1_1AttrsNode-members.html
+++ b/docs/api/doxygen/classtvm_1_1AttrsNode-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1AttrsNode.html b/docs/api/doxygen/classtvm_1_1AttrsNode.html
index a10f06e..8dbc2c3 100644
--- a/docs/api/doxygen/classtvm_1_1AttrsNode.html
+++ b/docs/api/doxygen/classtvm_1_1AttrsNode.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1BaseAttrsNode-members.html b/docs/api/doxygen/classtvm_1_1BaseAttrsNode-members.html
index 2e19c0b..08cf1fb 100644
--- a/docs/api/doxygen/classtvm_1_1BaseAttrsNode-members.html
+++ b/docs/api/doxygen/classtvm_1_1BaseAttrsNode-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1BaseAttrsNode.html b/docs/api/doxygen/classtvm_1_1BaseAttrsNode.html
index 3ab3f77..abe73e9 100644
--- a/docs/api/doxygen/classtvm_1_1BaseAttrsNode.html
+++ b/docs/api/doxygen/classtvm_1_1BaseAttrsNode.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1BaseExpr-members.html b/docs/api/doxygen/classtvm_1_1BaseExpr-members.html
index fddeaf5..e8cf695 100644
--- a/docs/api/doxygen/classtvm_1_1BaseExpr-members.html
+++ b/docs/api/doxygen/classtvm_1_1BaseExpr-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1BaseExpr.html b/docs/api/doxygen/classtvm_1_1BaseExpr.html
index bdd522a..24b556d 100644
--- a/docs/api/doxygen/classtvm_1_1BaseExpr.html
+++ b/docs/api/doxygen/classtvm_1_1BaseExpr.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1BaseExprNode-members.html b/docs/api/doxygen/classtvm_1_1BaseExprNode-members.html
index b622f49..6340555 100644
--- a/docs/api/doxygen/classtvm_1_1BaseExprNode-members.html
+++ b/docs/api/doxygen/classtvm_1_1BaseExprNode-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1BaseExprNode.html b/docs/api/doxygen/classtvm_1_1BaseExprNode.html
index 1a4f584..9dea29f 100644
--- a/docs/api/doxygen/classtvm_1_1BaseExprNode.html
+++ b/docs/api/doxygen/classtvm_1_1BaseExprNode.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1BaseExpr__inherit__graph.svg b/docs/api/doxygen/classtvm_1_1BaseExpr__inherit__graph.svg
index 49e81d8..5b0686e 100644
--- a/docs/api/doxygen/classtvm_1_1BaseExpr__inherit__graph.svg
+++ b/docs/api/doxygen/classtvm_1_1BaseExpr__inherit__graph.svg
@@ -207,20 +207,22 @@
 <!-- Node11 -->
 <g id="node11" class="node"><title>Node11</title>
 <g id="a_node11"><a xlink:href="classtvm_1_1tir_1_1BufferLoad.html" target="_top" xlink:title="Managed reference to BufferLoadNode. ">
-<polygon fill="white" stroke="black" points="1190,-17 1190,-96 1340,-96 1340,-17 1190,-17"/>
-<text text-anchor="middle" x="1265" y="-84" font-family="Helvetica,sans-Serif" font-size="10.00">tvm::tir::BufferLoad</text>
-<polyline fill="none" stroke="black" points="1190,-77 1340,-77 "/>
-<text text-anchor="middle" x="1265" y="-65" font-family="Helvetica,sans-Serif" font-size="10.00"> </text>
-<polyline fill="none" stroke="black" points="1190,-58 1340,-58 "/>
-<text text-anchor="start" x="1198" y="-46" font-family="Helvetica,sans-Serif" font-size="10.00">+ BufferLoad()</text>
-<text text-anchor="start" x="1198" y="-35" font-family="Helvetica,sans-Serif" font-size="10.00">+ TVM_DEFINE_OBJECT_REF</text>
-<text text-anchor="start" x="1198" y="-24" font-family="Helvetica,sans-Serif" font-size="10.00">_METHODS()</text>
+<polygon fill="white" stroke="black" points="1190,-6 1190,-107 1340,-107 1340,-6 1190,-6"/>
+<text text-anchor="middle" x="1265" y="-95" font-family="Helvetica,sans-Serif" font-size="10.00">tvm::tir::BufferLoad</text>
+<polyline fill="none" stroke="black" points="1190,-88 1340,-88 "/>
+<text text-anchor="middle" x="1265" y="-76" font-family="Helvetica,sans-Serif" font-size="10.00"> </text>
+<polyline fill="none" stroke="black" points="1190,-69 1340,-69 "/>
+<text text-anchor="start" x="1198" y="-57" font-family="Helvetica,sans-Serif" font-size="10.00">+ BufferLoad()</text>
+<text text-anchor="start" x="1198" y="-46" font-family="Helvetica,sans-Serif" font-size="10.00">+ TVM_DEFINE_OBJECT_REF</text>
+<text text-anchor="start" x="1198" y="-35" font-family="Helvetica,sans-Serif" font-size="10.00">_METHODS()</text>
+<text text-anchor="start" x="1198" y="-24" font-family="Helvetica,sans-Serif" font-size="10.00">+ TVM_DEFINE_OBJECT_REF</text>
+<text text-anchor="start" x="1198" y="-13" font-family="Helvetica,sans-Serif" font-size="10.00">_COW_METHOD()</text>
 </a>
 </g>
 </g>
 <!-- Node3&#45;&gt;Node11 -->
 <g id="edge10" class="edge"><title>Node3&#45;&gt;Node11</title>
-<path fill="none" stroke="midnightblue" d="M2943.64,-197.264C2622.98,-190.382 1500.95,-162.933 1349,-113 1337.04,-109.071 1325.17,-102.962 1314.27,-96.2099"/>
+<path fill="none" stroke="midnightblue" d="M2943.64,-197.264C2622.98,-190.382 1500.95,-162.933 1349,-113 1344.02,-111.363 1339.05,-109.347 1334.16,-107.064"/>
 <polygon fill="none" stroke="midnightblue" points="2943.73,-200.766 2953.8,-197.481 2943.88,-193.768 2943.73,-200.766"/>
 </g>
 <!-- Node12 -->
diff --git a/docs/api/doxygen/classtvm_1_1BaseFunc-members.html b/docs/api/doxygen/classtvm_1_1BaseFunc-members.html
index 36ad9fe..1f478a4 100644
--- a/docs/api/doxygen/classtvm_1_1BaseFunc-members.html
+++ b/docs/api/doxygen/classtvm_1_1BaseFunc-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1BaseFunc.html b/docs/api/doxygen/classtvm_1_1BaseFunc.html
index de9ca89..0e43cd3 100644
--- a/docs/api/doxygen/classtvm_1_1BaseFunc.html
+++ b/docs/api/doxygen/classtvm_1_1BaseFunc.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1BaseFuncNode-members.html b/docs/api/doxygen/classtvm_1_1BaseFuncNode-members.html
index 42f6786..f984222 100644
--- a/docs/api/doxygen/classtvm_1_1BaseFuncNode-members.html
+++ b/docs/api/doxygen/classtvm_1_1BaseFuncNode-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1BaseFuncNode.html b/docs/api/doxygen/classtvm_1_1BaseFuncNode.html
index 44f2dea..34759cd 100644
--- a/docs/api/doxygen/classtvm_1_1BaseFuncNode.html
+++ b/docs/api/doxygen/classtvm_1_1BaseFuncNode.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1BaseTensorType-members.html b/docs/api/doxygen/classtvm_1_1BaseTensorType-members.html
index fad7009..a6ced3e 100644
--- a/docs/api/doxygen/classtvm_1_1BaseTensorType-members.html
+++ b/docs/api/doxygen/classtvm_1_1BaseTensorType-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1BaseTensorType.html b/docs/api/doxygen/classtvm_1_1BaseTensorType.html
index c1fe45b..0ad3320 100644
--- a/docs/api/doxygen/classtvm_1_1BaseTensorType.html
+++ b/docs/api/doxygen/classtvm_1_1BaseTensorType.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1BaseTensorTypeNode-members.html b/docs/api/doxygen/classtvm_1_1BaseTensorTypeNode-members.html
index e964b4b..6b93e42 100644
--- a/docs/api/doxygen/classtvm_1_1BaseTensorTypeNode-members.html
+++ b/docs/api/doxygen/classtvm_1_1BaseTensorTypeNode-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1BaseTensorTypeNode.html b/docs/api/doxygen/classtvm_1_1BaseTensorTypeNode.html
index 37aea85..774863c 100644
--- a/docs/api/doxygen/classtvm_1_1BaseTensorTypeNode.html
+++ b/docs/api/doxygen/classtvm_1_1BaseTensorTypeNode.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1BaseValueEqual-members.html b/docs/api/doxygen/classtvm_1_1BaseValueEqual-members.html
index 97014ac..6394fe9 100644
--- a/docs/api/doxygen/classtvm_1_1BaseValueEqual-members.html
+++ b/docs/api/doxygen/classtvm_1_1BaseValueEqual-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1BaseValueEqual.html b/docs/api/doxygen/classtvm_1_1BaseValueEqual.html
index 669fda3..0c43f50 100644
--- a/docs/api/doxygen/classtvm_1_1BaseValueEqual.html
+++ b/docs/api/doxygen/classtvm_1_1BaseValueEqual.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1BaseValueHash-members.html b/docs/api/doxygen/classtvm_1_1BaseValueHash-members.html
index b3fb7aa..8260b46 100644
--- a/docs/api/doxygen/classtvm_1_1BaseValueHash-members.html
+++ b/docs/api/doxygen/classtvm_1_1BaseValueHash-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1BaseValueHash.html b/docs/api/doxygen/classtvm_1_1BaseValueHash.html
index be933a3..068060e 100644
--- a/docs/api/doxygen/classtvm_1_1BaseValueHash.html
+++ b/docs/api/doxygen/classtvm_1_1BaseValueHash.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1Bool-members.html b/docs/api/doxygen/classtvm_1_1Bool-members.html
index 451a7dc..2c40b79 100644
--- a/docs/api/doxygen/classtvm_1_1Bool-members.html
+++ b/docs/api/doxygen/classtvm_1_1Bool-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1Bool.html b/docs/api/doxygen/classtvm_1_1Bool.html
index 401031e..9c25df7 100644
--- a/docs/api/doxygen/classtvm_1_1Bool.html
+++ b/docs/api/doxygen/classtvm_1_1Bool.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1CompileError-members.html b/docs/api/doxygen/classtvm_1_1CompileError-members.html
index 7d7b418..9b64949 100644
--- a/docs/api/doxygen/classtvm_1_1CompileError-members.html
+++ b/docs/api/doxygen/classtvm_1_1CompileError-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1CompileError.html b/docs/api/doxygen/classtvm_1_1CompileError.html
index a47d48e..3d0c76a 100644
--- a/docs/api/doxygen/classtvm_1_1CompileError.html
+++ b/docs/api/doxygen/classtvm_1_1CompileError.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1Constructor-members.html b/docs/api/doxygen/classtvm_1_1Constructor-members.html
index 578e1a9..1f10306 100644
--- a/docs/api/doxygen/classtvm_1_1Constructor-members.html
+++ b/docs/api/doxygen/classtvm_1_1Constructor-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1Constructor.html b/docs/api/doxygen/classtvm_1_1Constructor.html
index de297e6..5d62487 100644
--- a/docs/api/doxygen/classtvm_1_1Constructor.html
+++ b/docs/api/doxygen/classtvm_1_1Constructor.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1ConstructorNode-members.html b/docs/api/doxygen/classtvm_1_1ConstructorNode-members.html
index 03a0cb6..9f028f2 100644
--- a/docs/api/doxygen/classtvm_1_1ConstructorNode-members.html
+++ b/docs/api/doxygen/classtvm_1_1ConstructorNode-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1ConstructorNode.html b/docs/api/doxygen/classtvm_1_1ConstructorNode.html
index fa7e7f7..6b3ba3a 100644
--- a/docs/api/doxygen/classtvm_1_1ConstructorNode.html
+++ b/docs/api/doxygen/classtvm_1_1ConstructorNode.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1Diagnostic-members.html b/docs/api/doxygen/classtvm_1_1Diagnostic-members.html
index 8800a44..0415ca7 100644
--- a/docs/api/doxygen/classtvm_1_1Diagnostic-members.html
+++ b/docs/api/doxygen/classtvm_1_1Diagnostic-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1Diagnostic.html b/docs/api/doxygen/classtvm_1_1Diagnostic.html
index b474d46..2d93538 100644
--- a/docs/api/doxygen/classtvm_1_1Diagnostic.html
+++ b/docs/api/doxygen/classtvm_1_1Diagnostic.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1DiagnosticBuilder-members.html b/docs/api/doxygen/classtvm_1_1DiagnosticBuilder-members.html
index 3165399..34947f2 100644
--- a/docs/api/doxygen/classtvm_1_1DiagnosticBuilder-members.html
+++ b/docs/api/doxygen/classtvm_1_1DiagnosticBuilder-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1DiagnosticBuilder.html b/docs/api/doxygen/classtvm_1_1DiagnosticBuilder.html
index 67fcbc8..0dc3417 100644
--- a/docs/api/doxygen/classtvm_1_1DiagnosticBuilder.html
+++ b/docs/api/doxygen/classtvm_1_1DiagnosticBuilder.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1DiagnosticContext-members.html b/docs/api/doxygen/classtvm_1_1DiagnosticContext-members.html
index 2ccd099..15dbae6 100644
--- a/docs/api/doxygen/classtvm_1_1DiagnosticContext-members.html
+++ b/docs/api/doxygen/classtvm_1_1DiagnosticContext-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1DiagnosticContext.html b/docs/api/doxygen/classtvm_1_1DiagnosticContext.html
index b9d1b57..d5715a5 100644
--- a/docs/api/doxygen/classtvm_1_1DiagnosticContext.html
+++ b/docs/api/doxygen/classtvm_1_1DiagnosticContext.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1DiagnosticContextNode-members.html b/docs/api/doxygen/classtvm_1_1DiagnosticContextNode-members.html
index e8d4ecd..f5fdfb7 100644
--- a/docs/api/doxygen/classtvm_1_1DiagnosticContextNode-members.html
+++ b/docs/api/doxygen/classtvm_1_1DiagnosticContextNode-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1DiagnosticContextNode.html b/docs/api/doxygen/classtvm_1_1DiagnosticContextNode.html
index 952af5a..34c9e5e 100644
--- a/docs/api/doxygen/classtvm_1_1DiagnosticContextNode.html
+++ b/docs/api/doxygen/classtvm_1_1DiagnosticContextNode.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1DiagnosticNode-members.html b/docs/api/doxygen/classtvm_1_1DiagnosticNode-members.html
index a8ee008..45b31d7 100644
--- a/docs/api/doxygen/classtvm_1_1DiagnosticNode-members.html
+++ b/docs/api/doxygen/classtvm_1_1DiagnosticNode-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1DiagnosticNode.html b/docs/api/doxygen/classtvm_1_1DiagnosticNode.html
index d579796..431b54a 100644
--- a/docs/api/doxygen/classtvm_1_1DiagnosticNode.html
+++ b/docs/api/doxygen/classtvm_1_1DiagnosticNode.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1DiagnosticRenderer-members.html b/docs/api/doxygen/classtvm_1_1DiagnosticRenderer-members.html
index dd251cd..de8689d 100644
--- a/docs/api/doxygen/classtvm_1_1DiagnosticRenderer-members.html
+++ b/docs/api/doxygen/classtvm_1_1DiagnosticRenderer-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1DiagnosticRenderer.html b/docs/api/doxygen/classtvm_1_1DiagnosticRenderer.html
index 1b917f9..5eb6878 100644
--- a/docs/api/doxygen/classtvm_1_1DiagnosticRenderer.html
+++ b/docs/api/doxygen/classtvm_1_1DiagnosticRenderer.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1DiagnosticRendererNode-members.html b/docs/api/doxygen/classtvm_1_1DiagnosticRendererNode-members.html
index 89ec0f5..4aa71a0 100644
--- a/docs/api/doxygen/classtvm_1_1DiagnosticRendererNode-members.html
+++ b/docs/api/doxygen/classtvm_1_1DiagnosticRendererNode-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1DiagnosticRendererNode.html b/docs/api/doxygen/classtvm_1_1DiagnosticRendererNode.html
index a62ff23..ba31230 100644
--- a/docs/api/doxygen/classtvm_1_1DiagnosticRendererNode.html
+++ b/docs/api/doxygen/classtvm_1_1DiagnosticRendererNode.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1DictAttrs-members.html b/docs/api/doxygen/classtvm_1_1DictAttrs-members.html
index b64fc70..56c2eea 100644
--- a/docs/api/doxygen/classtvm_1_1DictAttrs-members.html
+++ b/docs/api/doxygen/classtvm_1_1DictAttrs-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1DictAttrs.html b/docs/api/doxygen/classtvm_1_1DictAttrs.html
index 2874911..fcb1b5b 100644
--- a/docs/api/doxygen/classtvm_1_1DictAttrs.html
+++ b/docs/api/doxygen/classtvm_1_1DictAttrs.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1DictAttrsNode-members.html b/docs/api/doxygen/classtvm_1_1DictAttrsNode-members.html
index c8473c3..7796028 100644
--- a/docs/api/doxygen/classtvm_1_1DictAttrsNode-members.html
+++ b/docs/api/doxygen/classtvm_1_1DictAttrsNode-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1DictAttrsNode.html b/docs/api/doxygen/classtvm_1_1DictAttrsNode.html
index 0611a7b..efb249f 100644
--- a/docs/api/doxygen/classtvm_1_1DictAttrsNode.html
+++ b/docs/api/doxygen/classtvm_1_1DictAttrsNode.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1EnvFunc-members.html b/docs/api/doxygen/classtvm_1_1EnvFunc-members.html
index a2190e1..cb96168 100644
--- a/docs/api/doxygen/classtvm_1_1EnvFunc-members.html
+++ b/docs/api/doxygen/classtvm_1_1EnvFunc-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1EnvFunc.html b/docs/api/doxygen/classtvm_1_1EnvFunc.html
index 6d61c0a..2ddb049 100644
--- a/docs/api/doxygen/classtvm_1_1EnvFunc.html
+++ b/docs/api/doxygen/classtvm_1_1EnvFunc.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1EnvFuncNode-members.html b/docs/api/doxygen/classtvm_1_1EnvFuncNode-members.html
index 3eee013..eccbd1c 100644
--- a/docs/api/doxygen/classtvm_1_1EnvFuncNode-members.html
+++ b/docs/api/doxygen/classtvm_1_1EnvFuncNode-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1EnvFuncNode.html b/docs/api/doxygen/classtvm_1_1EnvFuncNode.html
index 88a5325..a5f2017 100644
--- a/docs/api/doxygen/classtvm_1_1EnvFuncNode.html
+++ b/docs/api/doxygen/classtvm_1_1EnvFuncNode.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1ErrorReporter-members.html b/docs/api/doxygen/classtvm_1_1ErrorReporter-members.html
index 86d00e9..d6e345d 100644
--- a/docs/api/doxygen/classtvm_1_1ErrorReporter-members.html
+++ b/docs/api/doxygen/classtvm_1_1ErrorReporter-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1ErrorReporter.html b/docs/api/doxygen/classtvm_1_1ErrorReporter.html
index 81cfc6a..7fcd6ec 100644
--- a/docs/api/doxygen/classtvm_1_1ErrorReporter.html
+++ b/docs/api/doxygen/classtvm_1_1ErrorReporter.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1FloatImm-members.html b/docs/api/doxygen/classtvm_1_1FloatImm-members.html
index e152ea3..1e01c33 100644
--- a/docs/api/doxygen/classtvm_1_1FloatImm-members.html
+++ b/docs/api/doxygen/classtvm_1_1FloatImm-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1FloatImm.html b/docs/api/doxygen/classtvm_1_1FloatImm.html
index 06bb2a6..5dca536 100644
--- a/docs/api/doxygen/classtvm_1_1FloatImm.html
+++ b/docs/api/doxygen/classtvm_1_1FloatImm.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1FloatImmNode-members.html b/docs/api/doxygen/classtvm_1_1FloatImmNode-members.html
index f76acfd..b3d97a3 100644
--- a/docs/api/doxygen/classtvm_1_1FloatImmNode-members.html
+++ b/docs/api/doxygen/classtvm_1_1FloatImmNode-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1FloatImmNode.html b/docs/api/doxygen/classtvm_1_1FloatImmNode.html
index f601972..11d0f4d 100644
--- a/docs/api/doxygen/classtvm_1_1FloatImmNode.html
+++ b/docs/api/doxygen/classtvm_1_1FloatImmNode.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1FuncType-members.html b/docs/api/doxygen/classtvm_1_1FuncType-members.html
index a7eedaa..93bf894 100644
--- a/docs/api/doxygen/classtvm_1_1FuncType-members.html
+++ b/docs/api/doxygen/classtvm_1_1FuncType-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1FuncType.html b/docs/api/doxygen/classtvm_1_1FuncType.html
index c80029f..8b8b4dd 100644
--- a/docs/api/doxygen/classtvm_1_1FuncType.html
+++ b/docs/api/doxygen/classtvm_1_1FuncType.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1FuncTypeNode-members.html b/docs/api/doxygen/classtvm_1_1FuncTypeNode-members.html
index 4fc51a8..e6e7b8d 100644
--- a/docs/api/doxygen/classtvm_1_1FuncTypeNode-members.html
+++ b/docs/api/doxygen/classtvm_1_1FuncTypeNode-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1FuncTypeNode.html b/docs/api/doxygen/classtvm_1_1FuncTypeNode.html
index f389e87..ea5138f 100644
--- a/docs/api/doxygen/classtvm_1_1FuncTypeNode.html
+++ b/docs/api/doxygen/classtvm_1_1FuncTypeNode.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1GenericFunc-members.html b/docs/api/doxygen/classtvm_1_1GenericFunc-members.html
index bebbf67..150e86c 100644
--- a/docs/api/doxygen/classtvm_1_1GenericFunc-members.html
+++ b/docs/api/doxygen/classtvm_1_1GenericFunc-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1GenericFunc.html b/docs/api/doxygen/classtvm_1_1GenericFunc.html
index 197f5c3..430ed90 100644
--- a/docs/api/doxygen/classtvm_1_1GenericFunc.html
+++ b/docs/api/doxygen/classtvm_1_1GenericFunc.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1GenericFuncNode-members.html b/docs/api/doxygen/classtvm_1_1GenericFuncNode-members.html
index ec893fc..864e010 100644
--- a/docs/api/doxygen/classtvm_1_1GenericFuncNode-members.html
+++ b/docs/api/doxygen/classtvm_1_1GenericFuncNode-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1GenericFuncNode.html b/docs/api/doxygen/classtvm_1_1GenericFuncNode.html
index 1a7cc10..9175be7 100644
--- a/docs/api/doxygen/classtvm_1_1GenericFuncNode.html
+++ b/docs/api/doxygen/classtvm_1_1GenericFuncNode.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1GlobalTypeVar-members.html b/docs/api/doxygen/classtvm_1_1GlobalTypeVar-members.html
index 8282576..c1c2e75 100644
--- a/docs/api/doxygen/classtvm_1_1GlobalTypeVar-members.html
+++ b/docs/api/doxygen/classtvm_1_1GlobalTypeVar-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1GlobalTypeVar.html b/docs/api/doxygen/classtvm_1_1GlobalTypeVar.html
index 83690c2..eaa5d41 100644
--- a/docs/api/doxygen/classtvm_1_1GlobalTypeVar.html
+++ b/docs/api/doxygen/classtvm_1_1GlobalTypeVar.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1GlobalTypeVarNode-members.html b/docs/api/doxygen/classtvm_1_1GlobalTypeVarNode-members.html
index 73f9822..0323704 100644
--- a/docs/api/doxygen/classtvm_1_1GlobalTypeVarNode-members.html
+++ b/docs/api/doxygen/classtvm_1_1GlobalTypeVarNode-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1GlobalTypeVarNode.html b/docs/api/doxygen/classtvm_1_1GlobalTypeVarNode.html
index b0feb39..d6dd7f4 100644
--- a/docs/api/doxygen/classtvm_1_1GlobalTypeVarNode.html
+++ b/docs/api/doxygen/classtvm_1_1GlobalTypeVarNode.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1GlobalVar-members.html b/docs/api/doxygen/classtvm_1_1GlobalVar-members.html
index 6679e99..4de9fbb 100644
--- a/docs/api/doxygen/classtvm_1_1GlobalVar-members.html
+++ b/docs/api/doxygen/classtvm_1_1GlobalVar-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1GlobalVar.html b/docs/api/doxygen/classtvm_1_1GlobalVar.html
index 7a40a3d..daccbf2 100644
--- a/docs/api/doxygen/classtvm_1_1GlobalVar.html
+++ b/docs/api/doxygen/classtvm_1_1GlobalVar.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1GlobalVarNode-members.html b/docs/api/doxygen/classtvm_1_1GlobalVarNode-members.html
index 77d96d3..bbbc70b 100644
--- a/docs/api/doxygen/classtvm_1_1GlobalVarNode-members.html
+++ b/docs/api/doxygen/classtvm_1_1GlobalVarNode-members.html
@@ -41,6 +41,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
       <li><a href="namespaces.html"><span>Namespaces</span></a></li>
       <li class="current"><a href="annotated.html"><span>Classes</span></a></li>
       <li><a href="files.html"><span>Files</span></a></li>
+      <li><a href="examples.html"><span>Examples</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
diff --git a/docs/api/doxygen/classtvm_1_1GlobalVarNode.html b/docs/api/doxygen/classtvm_1_1GlobalVarNode.html
index 5aaea10..dd75404 100644
... 45354 lines suppressed ...