Posted to commits@tvm.apache.org by lm...@apache.org on 2020/09/17 21:55:22 UTC

[incubator-tvm-site] branch asf-site updated: Docs build at Thu Sep 17 14:54:07 PDT 2020

This is an automated email from the ASF dual-hosted git repository.

lmzheng pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/incubator-tvm-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
     new f8c3a2c  Docs build at Thu Sep 17 14:54:07 PDT 2020
f8c3a2c is described below

commit f8c3a2c7e714d715b5140cada12aafcf50567dfb
Author: Lianmin Zheng <li...@gmail.com>
AuthorDate: Thu Sep 17 14:54:08 2020 -0700

    Docs build at Thu Sep 17 14:54:07 PDT 2020
---
 .../tuple_inputs.ipynb                             |    6 +-
 .../from_tflite.py                                 |   37 +-
 .../tune_simple_template.py                        |   56 +-
 .../from_tflite.ipynb                              |   12 +-
 .../tune_simple_template.ipynb                     |   12 +-
 .../opt_matmul_auto_tensorcore.py                  |  407 +-
 .../matrix_multiply.py                             |   42 +-
 .../use_pass_infra.ipynb                           |   12 +-
 .../143c743c62f58570eabd77fd3395ca8c/scan.py       |   14 +-
 .../deploy_prequantized_tflite.ipynb               |   16 +-
 .../tune_conv2d_cuda.ipynb                         |    6 +-
 .../relay_quick_start.ipynb                        |   10 +-
 .../tune_relay_cuda.py                             |  140 +-
 .../tune_relay_mobile_gpu.ipynb                    |    8 +-
 .../tune_conv2d_cuda.py                            |   55 +-
 .../matrix_multiply_opt.ipynb                      |    4 +-
 .../from_keras.ipynb                               |   10 +-
 .../from_coreml.ipynb                              |   10 +-
 .../from_caffe2.ipynb                              |   10 +-
 .../deploy_sparse.py                               |   30 +-
 .../tensor_expr_get_started.py                     |   22 +-
 .../deploy_model_on_android.ipynb                  |   12 +-
 .../deploy_model_on_android.py                     |   96 +-
 .../opt_gemm.ipynb                                 |   14 +-
 .../using_external_lib.py                          |   13 +-
 .../tune_relay_vta.ipynb                           |   10 +-
 .../cross_compilation_and_rpc.ipynb                |   12 +-
 .../4e9540fc014621d8d3bd14869c1ab227/scan.ipynb    |    6 +-
 .../deploy_quantized.ipynb                         |   12 +-
 .../from_tensorflow.py                             |   85 +-
 .../intro_topi.ipynb                               |    8 +-
 .../deploy_quantized.py                            |   26 +-
 .../5bd1bb9c6505ea40407fa19f01579414/reduction.py  |   33 +-
 .../deploy_prequantized_tflite.py                  |   47 +-
 .../intrin_math.ipynb                              |    6 +-
 .../tune_relay_vta.py                              |  158 +-
 .../schedule_primitives.ipynb                      |   24 +-
 .../tensorize.ipynb                                |   12 +-
 .../opt_conv_cuda.ipynb                            |    6 +-
 .../matrix_multiply_opt.py                         |   95 +-
 .../tuple_inputs.py                                |   30 +-
 .../696dd37904ef92773435ca321ff41bfb/from_onnx.py  |   34 +-
 .../relay_quick_start.py                           |   17 +-
 .../using_external_lib.ipynb                       |    6 +-
 .../from_pytorch.ipynb                             |   12 +-
 .../tensor_expr_get_started.ipynb                  |   14 +-
 .../70d345c5409f99cb5de9dc44f147ff6f/build_gcn.py  |  122 +-
 .../from_caffe2.py                                 |   48 +-
 .../tune_relay_cuda.ipynb                          |   10 +-
 .../deploy_prequantized.ipynb                      |    6 +-
 .../7ece74acc230c7d55086182cc8884b09/extern_op.py  |   40 +-
 .../deploy_ssd_gluoncv.ipynb                       |   10 +-
 .../from_darknet.ipynb                             |   10 +-
 .../836dc3852acf09662e9eb37c4c5e1e1b/opt_gemm.py   |  100 +-
 .../deploy_model_on_rasp.py                        |   48 +-
 .../tune_relay_x86.py                              |  107 +-
 .../extern_op.ipynb                                |   10 +-
 .../opt_matmul_auto_tensorcore.ipynb               |    8 +-
 .../deploy_sparse.ipynb                            |    8 +-
 .../deploy_prequantized.py                         |   38 +-
 .../tune_matmul_x86.py                             |  173 +
 .../schedule_primitives.py                         |   58 +-
 .../opt_conv_tensorcore.ipynb                      |   12 +-
 .../9a950897eeef498440fbe2f0afe2601f/tedd.py       |   26 +-
 .../9b0365fd5723f7c4d4e996637ab9a487/intro_topi.py |    8 +-
 .../low_level_custom_pass.py                       |   18 +-
 .../use_pass_infra.py                              |   43 +-
 .../a2f661bf234a167b5458fa28d8fafedc/tedd.ipynb    |   10 +-
 .../from_darknet.py                                |  118 +-
 .../opt_conv_cuda.py                               |   39 +-
 .../tune_relay_x86.ipynb                           |    6 +-
 .../baa4de13ce6d932de43e0eb5c4cb8f16/tensorize.py  |  107 +-
 .../tune_relay_arm.py                              |  135 +-
 .../vta_get_started.py                             |   23 +-
 .../deploy_model_on_rasp.ipynb                     |   12 +-
 .../build_gcn.ipynb                                |   16 +-
 .../deploy_object_detection_pytorch.ipynb          |  162 +
 .../deploy_classification.ipynb                    |    6 +-
 .../convolution_opt.ipynb                          |    6 +-
 .../deploy_ssd_gluoncv.py                          |   38 +-
 .../convolution_opt.py                             |  150 +-
 .../micro_tflite.ipynb                             |   55 +-
 .../cross_compilation_and_rpc.py                   |   26 +-
 .../matrix_multiply.ipynb                          |   10 +-
 .../from_tensorflow.ipynb                          |   18 +-
 .../opt_conv_tensorcore.py                         |  233 +-
 .../tune_relay_mobile_gpu.py                       |  141 +-
 .../from_coreml.py                                 |   41 +-
 .../low_level_custom_pass.ipynb                    |    6 +-
 .../from_onnx.ipynb                                |   10 +-
 .../ea0c81cab71096d16b825a33fd276c58/from_mxnet.py |   55 +-
 .../reduction.ipynb                                |    8 +-
 .../deploy_object_detection_pytorch.py             |  154 +
 .../deploy_classification.py                       |   45 +-
 .../tune_matmul_x86.ipynb                          |  212 +
 .../from_mxnet.ipynb                               |   12 +-
 .../f59fd8b968f7dcde34ed872c8527c192/from_keras.py |   54 +-
 .../vta_get_started.ipynb                          |    8 +-
 .../from_pytorch.py                                |   75 +-
 .../tune_relay_arm.ipynb                           |    8 +-
 .../intrin_math.py                                 |   11 +-
 .../micro_tflite.py                                |  120 +-
 ...x_glr_deploy_object_detection_pytorch_thumb.png |  Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_tune_matmul_x86_thumb.png    |  Bin 0 -> 26786 bytes
 .../{index.rst.txt => auto_scheduler.rst.txt}      |   49 +-
 docs/_sources/api/python/autotvm.rst.txt           |    2 +-
 docs/_sources/api/python/index.rst.txt             |    1 +
 docs/_sources/contribute/code_guide.rst.txt        |    2 +-
 docs/_sources/dev/inferbound.rst.txt               |    8 +-
 docs/_sources/install/docker.rst.txt               |    2 +-
 docs/_sources/install/from_source.rst.txt          |    8 +-
 .../auto_scheduler/sg_execution_times.rst.txt      |   10 +
 .../auto_scheduler/tune_matmul_x86.rst.txt         |  364 ++
 .../tutorials/autotvm/sg_execution_times.rst.txt   |   16 +-
 .../tutorials/autotvm/tune_conv2d_cuda.rst.txt     |   99 +-
 .../tutorials/autotvm/tune_relay_arm.rst.txt       |  135 +-
 .../tutorials/autotvm/tune_relay_cuda.rst.txt      |  140 +-
 .../autotvm/tune_relay_mobile_gpu.rst.txt          |  141 +-
 .../tutorials/autotvm/tune_relay_x86.rst.txt       |  107 +-
 .../tutorials/autotvm/tune_simple_template.rst.txt |   76 +-
 .../tutorials/dev/low_level_custom_pass.rst.txt    |   18 +-
 .../tutorials/dev/sg_execution_times.rst.txt       |    6 +-
 docs/_sources/tutorials/dev/use_pass_infra.rst.txt |  125 +-
 docs/_sources/tutorials/frontend/build_gcn.rst.txt |  122 +-
 .../frontend/deploy_model_on_android.rst.txt       |   98 +-
 .../frontend/deploy_model_on_rasp.rst.txt          |   48 +-
 .../deploy_object_detection_pytorch.rst.txt        |  268 ++
 .../tutorials/frontend/deploy_prequantized.rst.txt |   40 +-
 .../frontend/deploy_prequantized_tflite.rst.txt    |   51 +-
 .../tutorials/frontend/deploy_quantized.rst.txt    |   26 +-
 .../tutorials/frontend/deploy_sparse.rst.txt       |   28 +-
 .../tutorials/frontend/deploy_ssd_gluoncv.rst.txt  |   40 +-
 .../tutorials/frontend/from_caffe2.rst.txt         |   48 +-
 .../tutorials/frontend/from_coreml.rst.txt         |   41 +-
 .../tutorials/frontend/from_darknet.rst.txt        |  118 +-
 .../_sources/tutorials/frontend/from_keras.rst.txt |   54 +-
 .../_sources/tutorials/frontend/from_mxnet.rst.txt |   55 +-
 docs/_sources/tutorials/frontend/from_onnx.rst.txt |   38 +-
 .../tutorials/frontend/from_pytorch.rst.txt        |   75 +-
 .../tutorials/frontend/from_tensorflow.rst.txt     |   91 +-
 .../tutorials/frontend/from_tflite.rst.txt         |   37 +-
 .../tutorials/frontend/sg_execution_times.rst.txt  |   39 +-
 .../tutorials/frontend/using_external_lib.rst.txt  |   13 +-
 .../get_started/cross_compilation_and_rpc.rst.txt  |   28 +-
 .../get_started/relay_quick_start.rst.txt          |   21 +-
 .../get_started/sg_execution_times.rst.txt         |    8 +-
 .../get_started/tensor_expr_get_started.rst.txt    |   22 +-
 docs/_sources/tutorials/index.rst.txt              |   74 +-
 docs/_sources/tutorials/language/extern_op.rst.txt |   40 +-
 .../tutorials/language/intrin_math.rst.txt         |   11 +-
 docs/_sources/tutorials/language/reduction.rst.txt |   33 +-
 docs/_sources/tutorials/language/scan.rst.txt      |   26 +-
 .../tutorials/language/schedule_primitives.rst.txt |   80 +-
 .../tutorials/language/sg_execution_times.rst.txt  |   18 +-
 docs/_sources/tutorials/language/tedd.rst.txt      |   26 +-
 docs/_sources/tutorials/language/tensorize.rst.txt |  111 +-
 .../tutorials/language/tuple_inputs.rst.txt        |   46 +-
 docs/_sources/tutorials/micro/micro_tflite.rst.txt |  122 +-
 .../tutorials/micro/sg_execution_times.rst.txt     |    4 +-
 .../tutorials/optimize/opt_conv_cuda.rst.txt       |   41 +-
 .../tutorials/optimize/opt_conv_tensorcore.rst.txt |  235 +-
 docs/_sources/tutorials/optimize/opt_gemm.rst.txt  |  120 +-
 .../optimize/opt_matmul_auto_tensorcore.rst.txt    |  407 +-
 .../tutorials/optimize/sg_execution_times.rst.txt  |   10 +-
 docs/_sources/tutorials/topi/intro_topi.rst.txt    |   10 +-
 .../tutorials/topi/sg_execution_times.rst.txt      |    4 +-
 docs/_sources/vta/install.rst.txt                  |   24 +-
 .../tutorials/autotvm/sg_execution_times.rst.txt   |    4 +-
 .../vta/tutorials/autotvm/tune_relay_vta.rst.txt   |  160 +-
 .../frontend/deploy_classification.rst.txt         |   49 +-
 .../tutorials/frontend/sg_execution_times.rst.txt  |    4 +-
 .../_sources/vta/tutorials/matrix_multiply.rst.txt |   46 +-
 .../vta/tutorials/optimize/convolution_opt.rst.txt |  158 +-
 .../tutorials/optimize/matrix_multiply_opt.rst.txt |   99 +-
 .../tutorials/optimize/sg_execution_times.rst.txt  |    6 +-
 .../vta/tutorials/sg_execution_times.rst.txt       |    6 +-
 .../_sources/vta/tutorials/vta_get_started.rst.txt |   31 +-
 docs/api/doxygen/analyzer_8h.html                  |    2 +-
 docs/api/doxygen/analyzer_8h__dep__incl.svg        |  931 ++---
 docs/api/doxygen/analyzer_8h_source.html           |    4 +-
 docs/api/doxygen/annotated.html                    |  199 +-
 docs/api/doxygen/attr__registry__map_8h.html       |    2 +-
 .../doxygen/attr__registry__map_8h__dep__incl.svg  |  738 ++--
 .../api/doxygen/attr__registry__map_8h_source.html |    2 +-
 docs/api/doxygen/auto__schedule_8h.html            |    2 +-
 docs/api/doxygen/auto__schedule_8h__incl.svg       | 1526 ++++---
 docs/api/doxygen/auto__schedule_8h_source.html     |    4 +-
 docs/api/doxygen/auto__scheduler_2feature_8h.html  |    2 +-
 .../doxygen/auto__scheduler_2feature_8h__incl.svg  | 1724 ++++----
 docs/api/doxygen/base_8h_source.html               |   17 +-
 docs/api/doxygen/bias__add_8h.html                 |    2 +-
 docs/api/doxygen/bias__add_8h__incl.svg            | 1061 ++---
 docs/api/doxygen/bias__add_8h_source.html          |    2 +-
 docs/api/doxygen/broadcast_8h.html                 |    2 +-
 docs/api/doxygen/broadcast_8h__dep__incl.svg       |   92 +-
 docs/api/doxygen/buffer_8h_source.html             |    6 +-
 docs/api/doxygen/bytecode_8h.html                  |    3 +-
 docs/api/doxygen/bytecode_8h_source.html           |   91 +-
 docs/api/doxygen/c__runtime__api_8h.html           |    2 +-
 docs/api/doxygen/c__runtime__api_8h__dep__incl.svg | 1554 +++----
 docs/api/doxygen/c__runtime__api_8h_source.html    |    2 +-
 docs/api/doxygen/classes.html                      |  324 +-
 docs/api/doxygen/classtvm_1_1Target-members.html   |   17 +-
 docs/api/doxygen/classtvm_1_1Target.html           |  190 +-
 .../doxygen/classtvm_1_1TargetKind-members.html    |   13 +-
 docs/api/doxygen/classtvm_1_1TargetKind.html       |   43 +-
 .../classtvm_1_1TargetKindNode-members.html        |    9 +-
 docs/api/doxygen/classtvm_1_1TargetKindNode.html   |   79 +-
 .../classtvm_1_1TargetKindNode__coll__graph.svg    |  199 +-
 .../classtvm_1_1TargetKindNode__inherit__graph.svg |   37 +-
 .../classtvm_1_1TargetKindRegEntry-members.html    |    9 +-
 .../doxygen/classtvm_1_1TargetKindRegEntry.html    |   48 +-
 ...classtvm_1_1TargetKindRegEntry__coll__graph.svg |   25 +-
 .../doxygen/classtvm_1_1TargetNode-members.html    |    2 +-
 docs/api/doxygen/classtvm_1_1TargetNode.html       |   13 +-
 ...ers.html => classtvm_1_1TargetTag-members.html} |   15 +-
 ...1TargetKind.html => classtvm_1_1TargetTag.html} |  178 +-
 ...html => classtvm_1_1TargetTagNode-members.html} |   21 +-
 ...getKind.html => classtvm_1_1TargetTagNode.html} |  209 +-
 .../classtvm_1_1TargetTagNode__coll__graph.svg     |  176 +
 ... classtvm_1_1TargetTagNode__inherit__graph.svg} |   39 +-
 ... => classtvm_1_1TargetTagRegEntry-members.html} |   15 +-
 .../api/doxygen/classtvm_1_1TargetTagRegEntry.html |  276 ++
 .../classtvm_1_1TargetTagRegEntry__coll__graph.svg |   24 +
 .../doxygen/classtvm_1_1TargetTag__coll__graph.svg |   40 +
 .../classtvm_1_1TargetTag__inherit__graph.svg      |   40 +
 .../doxygen/classtvm_1_1Target__coll__graph.svg    |   43 +-
 .../doxygen/classtvm_1_1Target__inherit__graph.svg |   43 +-
 ...sstvm_1_1auto__scheduler_1_1AccessAnalyzer.html |    4 +-
 ...m_1_1auto__scheduler_1_1AccessAnalyzerNode.html |   12 +-
 ...m_1_1auto__scheduler_1_1ComputeDAG-members.html |    8 +-
 .../classtvm_1_1auto__scheduler_1_1ComputeDAG.html |   77 +-
 ...1auto__scheduler_1_1ComputeDAG__coll__graph.svg |   39 +-
 ...to__scheduler_1_1ComputeDAG__inherit__graph.svg |   39 +-
 ...m_1_1auto__scheduler_1_1HardwareParamsNode.html |    4 +-
 ...tvm_1_1auto__scheduler_1_1Iterator-members.html |    2 +-
 .../classtvm_1_1auto__scheduler_1_1Iterator.html   |   17 +-
 ...1_1auto__scheduler_1_1IteratorNode-members.html |    7 +-
 ...lasstvm_1_1auto__scheduler_1_1IteratorNode.html |   19 +-
 ...uto__scheduler_1_1IteratorNode__coll__graph.svg |  193 +-
 ...__scheduler_1_1IteratorNode__inherit__graph.svg |   41 +-
 ..._1_1auto__scheduler_1_1LocalRunner-members.html |    2 +-
 ...classtvm_1_1auto__scheduler_1_1LocalRunner.html |   19 +-
 ...auto__scheduler_1_1LocalRunnerNode-members.html |   15 +-
 ...stvm_1_1auto__scheduler_1_1LocalRunnerNode.html |    7 +-
 ...__scheduler_1_1LocalRunnerNode__coll__graph.svg |   47 +-
 ...cheduler_1_1LocalRunnerNode__inherit__graph.svg |   47 +-
 ...lasstvm_1_1auto__scheduler_1_1MeasureInput.html |    2 +-
 ...auto__scheduler_1_1ProgramMeasurer-members.html |    2 +-
 ...stvm_1_1auto__scheduler_1_1ProgramMeasurer.html |   12 +-
 ...__scheduler_1_1ProgramMeasurerNode-members.html |    4 +-
 ..._1_1auto__scheduler_1_1ProgramMeasurerNode.html |   20 +-
 ...heduler_1_1ProgramMeasurerNode__coll__graph.svg |    4 +-
 ...uler_1_1ProgramMeasurerNode__inherit__graph.svg |    4 +-
 ...to__scheduler_1_1ProgramRunnerNode-members.html |   13 +-
 ...vm_1_1auto__scheduler_1_1ProgramRunnerNode.html |   21 +-
 ...scheduler_1_1ProgramRunnerNode__coll__graph.svg |   43 +-
 ...eduler_1_1ProgramRunnerNode__inherit__graph.svg |   51 +-
 ...vm_1_1auto__scheduler_1_1RPCRunner-members.html |    2 +-
 .../classtvm_1_1auto__scheduler_1_1RPCRunner.html  |   19 +-
 ..._1auto__scheduler_1_1RPCRunnerNode-members.html |   25 +-
 ...asstvm_1_1auto__scheduler_1_1RPCRunnerNode.html |    5 +-
 ...to__scheduler_1_1RPCRunnerNode__coll__graph.svg |   33 +-
 ..._scheduler_1_1RPCRunnerNode__inherit__graph.svg |   47 +-
 ...sstvm_1_1auto__scheduler_1_1SearchTaskNode.html |    2 +-
 ...o__scheduler_1_1SearchTaskNode__coll__graph.svg |  264 +-
 .../classtvm_1_1relay_1_1IdNode-members.html       |    4 +
 docs/api/doxygen/classtvm_1_1relay_1_1IdNode.html  |  110 +-
 .../classtvm_1_1relay_1_1IdNode__coll__graph.svg   |  180 +-
 ...classtvm_1_1relay_1_1IdNode__inherit__graph.svg |   44 +-
 ...classtvm_1_1runtime_1_1IterAdapter-members.html |   15 +-
 .../classtvm_1_1runtime_1_1IterAdapter.html        |   30 +-
 ...stvm_1_1runtime_1_1IterAdapter__coll__graph.svg |   33 +-
 ...vm_1_1runtime_1_1ModuleNode__inherit__graph.svg |   43 +-
 ...asstvm_1_1runtime_1_1Object__inherit__graph.svg |   43 +-
 ...time_1_1micro__rpc_1_1FrameBuffer-members.html} |   20 +-
 ...vm_1_1runtime_1_1micro__rpc_1_1FrameBuffer.html |  309 ++
 ...e_1_1micro__rpc_1_1FrameBuffer__coll__graph.svg |   29 +
 ..._1runtime_1_1micro__rpc_1_1Framer-members.html} |   20 +-
 ...lasstvm_1_1runtime_1_1micro__rpc_1_1Framer.html |  329 ++
 ...untime_1_1micro__rpc_1_1Framer__coll__graph.svg |   28 +
 ...icro__rpc_1_1PacketFieldSizeBytes-members.html} |   10 +-
 ...ime_1_1micro__rpc_1_1PacketFieldSizeBytes.html} |   79 +-
 ...o__rpc_1_1PacketFieldSizeBytes__coll__graph.svg |   24 +
 ...1runtime_1_1micro__rpc_1_1Session-members.html} |   35 +-
 ...asstvm_1_1runtime_1_1micro__rpc_1_1Session.html |  567 +++
 ...ntime_1_1micro__rpc_1_1Session__coll__graph.svg |   34 +
 ...runtime_1_1micro__rpc_1_1Unframer-members.html} |   14 +-
 ...sstvm_1_1runtime_1_1micro__rpc_1_1Unframer.html |  241 ++
 ...time_1_1micro__rpc_1_1Unframer__coll__graph.svg |   26 +
 ...time_1_1micro__rpc_1_1WriteStream-members.html} |   14 +-
 ...m_1_1runtime_1_1micro__rpc_1_1WriteStream.html} |  139 +-
 ...e_1_1micro__rpc_1_1WriteStream__coll__graph.svg |   26 +
 ...tvm_1_1runtime_1_1vm_1_1Executable-members.html |   93 +-
 .../classtvm_1_1runtime_1_1vm_1_1Executable.html   |   21 +-
 ...1_1runtime_1_1vm_1_1Executable__coll__graph.svg |  237 +-
 ...runtime_1_1vm_1_1Executable__inherit__graph.svg |  135 +-
 ...1_1runtime_1_1vm_1_1VirtualMachine-members.html |   12 +-
 ...lasstvm_1_1runtime_1_1vm_1_1VirtualMachine.html |   69 +-
 ...untime_1_1vm_1_1VirtualMachine__coll__graph.svg |   53 +-
 ...ime_1_1vm_1_1VirtualMachine__inherit__graph.svg |    2 +-
 .../classtvm_1_1tir_1_1BijectiveLayout.html        |    4 +-
 docs/api/doxygen/codegen_8h.html                   |    2 +-
 docs/api/doxygen/codegen_8h__incl.svg              | 1592 ++++----
 docs/api/doxygen/codegen_8h_source.html            |    2 +-
 docs/api/doxygen/compute__dag_8h.html              |    2 +-
 docs/api/doxygen/compute__dag_8h__incl.svg         |  955 ++---
 docs/api/doxygen/compute__dag_8h_source.html       |   14 +-
 docs/api/doxygen/constant__utils_8h.html           |    2 +-
 docs/api/doxygen/constant__utils_8h__dep__incl.svg |  213 +-
 docs/api/doxygen/cost__model_8h.html               |    2 +-
 docs/api/doxygen/cost__model_8h__incl.svg          |  979 +++--
 docs/api/doxygen/cost__model_8h_source.html        |    8 +-
 docs/api/doxygen/crt_2packed__func_8h.html         |    2 +-
 docs/api/doxygen/crt_2packed__func_8h__incl.svg    |   59 +-
 docs/api/doxygen/crt_2packed__func_8h_source.html  |    4 +-
 docs/api/doxygen/crt_8h.html                       |   41 +-
 docs/api/doxygen/crt_8h__incl.svg                  |   38 +-
 docs/api/doxygen/crt_8h_source.html                |    6 +-
 docs/api/doxygen/cublas_8h_source.html             |    2 +-
 docs/api/doxygen/cuda_2dense_8h.html               |    2 +-
 docs/api/doxygen/cuda_2dense_8h__incl.svg          | 1596 ++++----
 docs/api/doxygen/cuda_2dense_8h_source.html        |   12 +-
 docs/api/doxygen/cuda_2injective_8h.html           |    2 +-
 docs/api/doxygen/cuda_2injective_8h__incl.svg      | 1988 +++++----
 docs/api/doxygen/cuda_2injective_8h_source.html    |    9 +-
 docs/api/doxygen/cuda_2normalization_8h.html       |    2 +-
 docs/api/doxygen/cuda_2normalization_8h__incl.svg  | 1976 +++++----
 .../api/doxygen/cuda_2normalization_8h_source.html |    7 +-
 docs/api/doxygen/cuda_2pooling_8h.html             |    2 +-
 docs/api/doxygen/cuda_2pooling_8h__incl.svg        | 2000 +++++----
 docs/api/doxygen/cuda_2pooling_8h_source.html      |   10 +-
 docs/api/doxygen/cuda_2reduction_8h.html           |    2 +-
 docs/api/doxygen/cuda_2reduction_8h__incl.svg      | 1988 +++++----
 docs/api/doxygen/cuda_2reduction_8h_source.html    |   14 +-
 docs/api/doxygen/cuda_2softmax_8h.html             |    2 +-
 docs/api/doxygen/cuda_2softmax_8h__incl.svg        | 1988 +++++----
 docs/api/doxygen/cuda_2softmax_8h_source.html      |    9 +-
 docs/api/doxygen/data__layout_8h.html              |    2 +-
 docs/api/doxygen/data__layout_8h_source.html       |   12 +-
 docs/api/doxygen/data__type_8h.html                |    2 +-
 docs/api/doxygen/data__type_8h__dep__incl.svg      | 1057 +++--
 docs/api/doxygen/dataflow__matcher_8h_source.html  |    4 +-
 docs/api/doxygen/dataflow__pattern_8h_source.html  |    6 +-
 docs/api/doxygen/detail_2broadcast_8h.html         |    2 +-
 .../doxygen/detail_2broadcast_8h__dep__incl.svg    |  102 +-
 docs/api/doxygen/detail_2broadcast_8h_source.html  |    5 +-
 docs/api/doxygen/detail_2extern_8h_source.html     |    4 +-
 docs/api/doxygen/dilate_8h_source.html             |    6 +-
 ...r_000002_000018.html => dir_000002_000019.html} |    0
 docs/api/doxygen/dir_000003_000019.html            |   90 -
 docs/api/doxygen/dir_000003_000020.html            |    4 +-
 ...r_000003_000020.html => dir_000003_000021.html} |    0
 docs/api/doxygen/dir_000004_000019.html            |   90 -
 docs/api/doxygen/dir_000004_000020.html            |    4 +-
 ...r_000004_000020.html => dir_000004_000021.html} |    0
 docs/api/doxygen/dir_000005_000018.html            |   90 -
 docs/api/doxygen/dir_000005_000019.html            |    4 +-
 docs/api/doxygen/dir_000005_000020.html            |    4 +-
 ...r_000005_000020.html => dir_000005_000021.html} |    0
 ...r_000006_000018.html => dir_000006_000019.html} |    0
 docs/api/doxygen/dir_000012_000019.html            |   90 -
 docs/api/doxygen/dir_000012_000020.html            |    4 +-
 ...r_000012_000020.html => dir_000012_000021.html} |    0
 docs/api/doxygen/dir_000018_000009.html            |   90 -
 docs/api/doxygen/dir_000019_000006.html            |   90 -
 docs/api/doxygen/dir_000019_000009.html            |    6 +-
 docs/api/doxygen/dir_000019_000013.html            |   90 -
 docs/api/doxygen/dir_000020_000002.html            |   90 -
 ...r_000019_000008.html => dir_000020_000006.html} |    4 +-
 docs/api/doxygen/dir_000020_000008.html            |    6 +-
 docs/api/doxygen/dir_000020_000009.html            |    6 +-
 docs/api/doxygen/dir_000020_000013.html            |    6 +-
 ...r_000019_000018.html => dir_000020_000019.html} |    2 +-
 docs/api/doxygen/dir_000021_000002.html            |    6 +-
 docs/api/doxygen/dir_000021_000008.html            |    6 +-
 ...r_000020_000009.html => dir_000021_000009.html} |    0
 ...r_000020_000013.html => dir_000021_000013.html} |    0
 ...r_000020_000018.html => dir_000021_000019.html} |    0
 docs/api/doxygen/dir_000021_000020.html            |   90 -
 ...r_000021_000002.html => dir_000022_000002.html} |    0
 ...r_000021_000008.html => dir_000022_000008.html} |    0
 docs/api/doxygen/dir_000022_000020.html            |   90 -
 docs/api/doxygen/dir_000022_000021.html            |    6 +-
 docs/api/doxygen/dir_000023_000019.html            |   90 -
 docs/api/doxygen/dir_000023_000020.html            |   90 -
 docs/api/doxygen/dir_000023_000021.html            |    6 +-
 docs/api/doxygen/dir_000023_000022.html            |    6 +-
 docs/api/doxygen/dir_000024_000020.html            |    6 +-
 docs/api/doxygen/dir_000024_000021.html            |    6 +-
 ...r_000023_000021.html => dir_000024_000022.html} |    0
 ...r_000023_000022.html => dir_000024_000023.html} |    0
 ...r_000023_000024.html => dir_000024_000025.html} |    0
 ...r_000023_000026.html => dir_000024_000027.html} |    0
 ...r_000024_000002.html => dir_000025_000002.html} |    0
 docs/api/doxygen/dir_000025_000019.html            |   90 -
 docs/api/doxygen/dir_000025_000020.html            |   90 -
 docs/api/doxygen/dir_000025_000021.html            |    6 +-
 docs/api/doxygen/dir_000025_000022.html            |    6 +-
 docs/api/doxygen/dir_000026_000019.html            |   90 -
 docs/api/doxygen/dir_000026_000020.html            |    6 +-
 docs/api/doxygen/dir_000026_000021.html            |    6 +-
 ...r_000025_000021.html => dir_000026_000022.html} |    0
 ...r_000025_000022.html => dir_000026_000023.html} |    0
 ...r_000025_000023.html => dir_000026_000024.html} |    0
 ...r_000025_000024.html => dir_000026_000025.html} |    0
 ...r_000025_000026.html => dir_000026_000027.html} |    0
 docs/api/doxygen/dir_000027_000019.html            |   90 -
 docs/api/doxygen/dir_000027_000020.html            |    6 +-
 docs/api/doxygen/dir_000027_000021.html            |    6 +-
 ...r_000026_000021.html => dir_000027_000022.html} |    0
 docs/api/doxygen/dir_000028_000020.html            |    6 +-
 docs/api/doxygen/dir_000028_000021.html            |    6 +-
 ...r_000027_000021.html => dir_000028_000022.html} |    0
 ...r_000028_000020.html => dir_000029_000021.html} |    0
 ...r_000028_000021.html => dir_000029_000022.html} |    0
 .../dir_02be2c9d68e402f80df60bd528724ee5_dep.svg   |   26 +-
 .../dir_194ecda214f05a38134392ac6a69b970_dep.svg   |    4 +-
 .../dir_1f1b12d204a071c9e67e47fcbb552b86_dep.svg   |   10 +-
 .../dir_2b0ef9f1c86b565a92e96353e1195b2c_dep.svg   |    8 +-
 .../dir_3a038e7bfa2370c6aee2a5aecd5d3ef1_dep.svg   |   12 +-
 .../dir_404558507ed35459f0d7a6d81d8c508d_dep.svg   |    2 +-
 .../dir_54983dd6d74c59f67ee9e8e5a50aafc4_dep.svg   |   42 +-
 .../dir_5da96592f3a7c442b838b075c58254c2.html      |    3 +
 .../dir_5da96592f3a7c442b838b075c58254c2_dep.svg   |   22 +-
 .../dir_63946bee875c6d52bce55e72a67a86ad_dep.svg   |   24 +-
 .../dir_8395ded0a3205c0748976a0d4487d38d_dep.svg   |    8 +-
 .../dir_8e4e25e66b8623d88c5b5dd2040bca97_dep.svg   |   74 +-
 .../dir_a2900df4deca8dd2bcded616f0fe650a.html      |   10 +-
 .../dir_a2900df4deca8dd2bcded616f0fe650a_dep.svg   |   28 +-
 .../dir_a98464176f1216e334ac3bbacd433085_dep.svg   |   16 +-
 .../dir_ac57496531ccbad72f774fa62e6de987_dep.svg   |   28 +-
 .../dir_b4c7d8e826c599ba55146c099a14beb5_dep.svg   |   52 +-
 .../dir_d331277d4303e21ded95616eb56c1a9e_dep.svg   |    6 +-
 .../dir_d3953cf7eb33eca56fc6850c0e98447d_dep.svg   |    6 +-
 .../dir_dc867ff9a37cad1764f1670dc7eba6c1_dep.svg   |    4 +-
 ...l => dir_f13f0b82f2bd345d2d28ad76dc90e0ea.html} |   28 +-
 .../dir_f13f0b82f2bd345d2d28ad76dc90e0ea_dep.svg   |   28 +
 .../dir_f97d855a3173728370e632aa77170e34_dep.svg   |   14 +-
 .../dir_fb1b1bc11a768ab8cf63a96a73170118_dep.svg   |    2 +-
 docs/api/doxygen/driver__api_8h.html               |    2 +-
 docs/api/doxygen/driver__api_8h__incl.svg          | 1563 ++++----
 docs/api/doxygen/elemwise_8h_source.html           |    6 +-
 docs/api/doxygen/env__func_8h_source.html          |    2 +-
 docs/api/doxygen/error_8h.html                     |    2 +-
 docs/api/doxygen/error_8h__dep__incl.svg           |  877 +---
 docs/api/doxygen/error__codes_8h.html              |   63 +-
 docs/api/doxygen/error__codes_8h__dep__incl.svg    |  141 +-
 docs/api/doxygen/error__codes_8h_source.html       |   35 +-
 docs/api/doxygen/executable_8h_source.html         |    5 +-
 docs/api/doxygen/files.html                        |   29 +-
 docs/api/doxygen/flatten_8h_source.html            |    2 +-
 ...registry__map_8h.html => frame__buffer_8h.html} |   33 +-
 docs/api/doxygen/frame__buffer_8h__dep__incl.svg   |   34 +
 docs/api/doxygen/frame__buffer_8h__incl.svg        |   40 +
 docs/api/doxygen/frame__buffer_8h_source.html      |  109 +
 .../{runtime_2memory_8h.html => framing_8h.html}   |   55 +-
 docs/api/doxygen/framing_8h__dep__incl.svg         |   33 +
 docs/api/doxygen/framing_8h__incl.svg              |   92 +
 docs/api/doxygen/framing_8h_source.html            |  115 +
 docs/api/doxygen/func__registry_8h_source.html     |    2 +-
 docs/api/doxygen/functions.html                    |    3 +
 docs/api/doxygen/functions_0x7e.html               |    3 +
 docs/api/doxygen/functions_a.html                  |   19 +-
 docs/api/doxygen/functions_b.html                  |    3 +
 docs/api/doxygen/functions_c.html                  |   29 +-
 docs/api/doxygen/functions_d.html                  |   11 +-
 docs/api/doxygen/functions_e.html                  |    3 +
 docs/api/doxygen/functions_f.html                  |   15 +-
 docs/api/doxygen/functions_func_0x7e.html          |    3 +
 docs/api/doxygen/functions_func_a.html             |   15 +-
 docs/api/doxygen/functions_func_b.html             |    3 +
 docs/api/doxygen/functions_func_c.html             |   20 +-
 docs/api/doxygen/functions_func_d.html             |    3 +
 docs/api/doxygen/functions_func_f.html             |   17 +-
 docs/api/doxygen/functions_func_g.html             |   11 +-
 docs/api/doxygen/functions_func_i.html             |   16 +-
 docs/api/doxygen/functions_func_l.html             |    7 +-
 docs/api/doxygen/functions_func_o.html             |   13 +-
 docs/api/doxygen/functions_func_p.html             |   10 +-
 docs/api/doxygen/functions_func_r.html             |   19 +-
 docs/api/doxygen/functions_func_s.html             |   44 +-
 docs/api/doxygen/functions_func_t.html             |   16 +-
 docs/api/doxygen/functions_func_u.html             |    5 +-
 docs/api/doxygen/functions_func_v.html             |   25 +-
 docs/api/doxygen/functions_func_w.html             |   10 +
 docs/api/doxygen/functions_g.html                  |   11 +-
 docs/api/doxygen/functions_i.html                  |   19 +-
 docs/api/doxygen/functions_k.html                  |   12 +
 docs/api/doxygen/functions_l.html                  |    8 +-
 docs/api/doxygen/functions_m.html                  |   10 +-
 docs/api/doxygen/functions_n.html                  |    1 +
 docs/api/doxygen/functions_o.html                  |   20 +-
 docs/api/doxygen/functions_p.html                  |   16 +-
 docs/api/doxygen/functions_r.html                  |   19 +-
 docs/api/doxygen/functions_rela.html               |   27 +-
 docs/api/doxygen/functions_s.html                  |   56 +-
 docs/api/doxygen/functions_t.html                  |   34 +-
 docs/api/doxygen/functions_type.html               |   14 +-
 docs/api/doxygen/functions_u.html                  |    3 +
 docs/api/doxygen/functions_v.html                  |   29 +-
 docs/api/doxygen/functions_vars.html               |    3 +
 docs/api/doxygen/functions_vars_a.html             |    2 +-
 docs/api/doxygen/functions_vars_c.html             |    6 +-
 docs/api/doxygen/functions_vars_d.html             |    8 +-
 docs/api/doxygen/functions_vars_e.html             |    3 +
 docs/api/doxygen/functions_vars_k.html             |   12 +
 docs/api/doxygen/functions_vars_l.html             |    3 +
 docs/api/doxygen/functions_vars_m.html             |    7 +-
 docs/api/doxygen/functions_vars_n.html             |    1 +
 docs/api/doxygen/functions_vars_o.html             |    3 +
 docs/api/doxygen/functions_vars_p.html             |    6 +
 docs/api/doxygen/functions_vars_s.html             |    7 +
 docs/api/doxygen/functions_w.html                  |   13 +
 docs/api/doxygen/functor_8h.html                   |    2 +-
 docs/api/doxygen/functor_8h__dep__incl.svg         | 1139 +++---
 docs/api/doxygen/generic_2default_8h.html          |    2 +-
 docs/api/doxygen/generic_2default_8h__incl.svg     | 1988 +++++----
 docs/api/doxygen/generic_2default_8h_source.html   |    6 +-
 docs/api/doxygen/generic_2extern_8h.html           |    2 +-
 docs/api/doxygen/generic_2extern_8h__incl.svg      | 2016 +++++-----
 docs/api/doxygen/generic_2extern_8h_source.html    |    6 +-
 docs/api/doxygen/generic_2injective_8h.html        |    2 +-
 docs/api/doxygen/generic_2injective_8h__incl.svg   | 1988 +++++----
 docs/api/doxygen/generic_2injective_8h_source.html |    6 +-
 docs/api/doxygen/generic__func_8h.html             |    2 +-
 docs/api/doxygen/generic__func_8h__incl.svg        | 1342 +++----
 docs/api/doxygen/generic__func_8h_source.html      |    2 +-
 docs/api/doxygen/globals_defs.html                 |    8 +-
 docs/api/doxygen/globals_eval.html                 |   51 +
 docs/api/doxygen/globals_func.html                 |   13 +-
 docs/api/doxygen/globals_k.html                    |   51 +
 docs/api/doxygen/globals_t.html                    |   12 +-
 docs/api/doxygen/globals_type.html                 |    6 +
 docs/api/doxygen/globals_u.html                    |   15 +
 docs/api/doxygen/graph__runtime_8h.html            |    2 +-
 docs/api/doxygen/graph__runtime_8h__incl.svg       |   67 +-
 docs/api/doxygen/hierarchy.html                    | 1648 ++++----
 docs/api/doxygen/image_8h_source.html              |    2 +-
 docs/api/doxygen/inherit_graph_100.svg             |   15 +-
 docs/api/doxygen/inherit_graph_101.svg             |   15 +-
 docs/api/doxygen/inherit_graph_102.svg             |   15 +-
 docs/api/doxygen/inherit_graph_103.svg             |   16 +-
 docs/api/doxygen/inherit_graph_104.svg             |   15 +-
 docs/api/doxygen/inherit_graph_105.svg             |  146 +-
 docs/api/doxygen/inherit_graph_106.svg             |   14 +-
 docs/api/doxygen/inherit_graph_107.svg             |   14 +-
 docs/api/doxygen/inherit_graph_108.svg             |   17 +-
 docs/api/doxygen/inherit_graph_109.svg             |   12 +-
 docs/api/doxygen/inherit_graph_110.svg             |   17 +-
 docs/api/doxygen/inherit_graph_111.svg             |   17 +-
 docs/api/doxygen/inherit_graph_112.svg             |   14 +-
 docs/api/doxygen/inherit_graph_113.svg             |   14 +-
 docs/api/doxygen/inherit_graph_114.svg             |   15 +-
 docs/api/doxygen/inherit_graph_115.svg             |   16 +-
 docs/api/doxygen/inherit_graph_116.svg             |   54 +-
 docs/api/doxygen/inherit_graph_117.svg             |   19 +-
 docs/api/doxygen/inherit_graph_118.svg             |   17 +-
 docs/api/doxygen/inherit_graph_119.svg             |   14 +-
 docs/api/doxygen/inherit_graph_120.svg             |   19 +-
 docs/api/doxygen/inherit_graph_121.svg             |   17 +-
 docs/api/doxygen/inherit_graph_122.svg             |   12 +-
 docs/api/doxygen/inherit_graph_123.svg             |   54 +-
 docs/api/doxygen/inherit_graph_124.svg             |   18 +-
 docs/api/doxygen/inherit_graph_125.svg             |   15 +-
 docs/api/doxygen/inherit_graph_126.svg             |   14 +-
 docs/api/doxygen/inherit_graph_127.svg             |   18 +-
 docs/api/doxygen/inherit_graph_128.svg             |   17 +-
 docs/api/doxygen/inherit_graph_129.svg             |   15 +-
 docs/api/doxygen/inherit_graph_130.svg             |   12 +-
 docs/api/doxygen/inherit_graph_131.svg             |   15 +-
 docs/api/doxygen/inherit_graph_132.svg             |   12 +-
 docs/api/doxygen/inherit_graph_133.svg             |   15 +-
 docs/api/doxygen/inherit_graph_134.svg             |   12 +-
 docs/api/doxygen/inherit_graph_135.svg             |   12 +-
 docs/api/doxygen/inherit_graph_136.svg             |    6 +-
 docs/api/doxygen/inherit_graph_137.svg             |   16 +-
 docs/api/doxygen/inherit_graph_138.svg             |   15 +-
 docs/api/doxygen/inherit_graph_139.svg             |   15 +-
 docs/api/doxygen/inherit_graph_140.svg             |   15 +-
 docs/api/doxygen/inherit_graph_141.svg             |   15 +-
 docs/api/doxygen/inherit_graph_142.svg             |   68 +-
 docs/api/doxygen/inherit_graph_143.svg             |   68 +-
 docs/api/doxygen/inherit_graph_144.svg             |   19 +-
 docs/api/doxygen/inherit_graph_145.svg             |   17 +-
 docs/api/doxygen/inherit_graph_146.svg             |   12 +-
 docs/api/doxygen/inherit_graph_147.svg             |   14 +-
 docs/api/doxygen/inherit_graph_148.svg             |   27 +-
 docs/api/doxygen/inherit_graph_149.svg             |   27 +-
 docs/api/doxygen/inherit_graph_150.svg             |   68 +-
 docs/api/doxygen/inherit_graph_151.svg             |   68 +-
 docs/api/doxygen/inherit_graph_152.svg             |   18 +-
 docs/api/doxygen/inherit_graph_153.svg             |   15 +-
 docs/api/doxygen/inherit_graph_154.svg             |   12 +-
 docs/api/doxygen/inherit_graph_155.svg             |   15 +-
 docs/api/doxygen/inherit_graph_156.svg             |   28 +-
 docs/api/doxygen/inherit_graph_157.svg             |   28 +-
 docs/api/doxygen/inherit_graph_158.svg             |   12 +-
 docs/api/doxygen/inherit_graph_159.svg             |   12 +-
 docs/api/doxygen/inherit_graph_160.svg             |   12 +-
 ...inherit_graph_153.svg => inherit_graph_161.svg} |    0
 ...inherit_graph_154.svg => inherit_graph_162.svg} |    0
 ...inherit_graph_155.svg => inherit_graph_163.svg} |    0
 ...inherit_graph_156.svg => inherit_graph_164.svg} |    0
 ...inherit_graph_157.svg => inherit_graph_165.svg} |    0
 ...inherit_graph_158.svg => inherit_graph_166.svg} |    0
 ...inherit_graph_159.svg => inherit_graph_167.svg} |    0
 ...inherit_graph_160.svg => inherit_graph_168.svg} |    0
 docs/api/doxygen/inherit_graph_60.svg              | 2041 +++++-----
 docs/api/doxygen/inherit_graph_86.svg              | 4233 ++++++++++----------
 docs/api/doxygen/inherit_graph_88.svg              |   15 +-
 docs/api/doxygen/inherit_graph_89.svg              |   14 +-
 docs/api/doxygen/inherit_graph_90.svg              |   27 +-
 docs/api/doxygen/inherit_graph_91.svg              |  216 +-
 docs/api/doxygen/inherit_graph_92.svg              |   15 +-
 docs/api/doxygen/inherit_graph_93.svg              |   15 +-
 docs/api/doxygen/inherit_graph_94.svg              |   15 +-
 docs/api/doxygen/inherit_graph_95.svg              |   15 +-
 docs/api/doxygen/inherit_graph_96.svg              |   15 +-
 docs/api/doxygen/inherit_graph_97.svg              |   28 +-
 docs/api/doxygen/inherit_graph_98.svg              |  236 +-
 docs/api/doxygen/inherit_graph_99.svg              |   15 +-
 docs/api/doxygen/inherits.html                     |  164 +-
 docs/api/doxygen/int__set_8h.html                  |    2 +-
 docs/api/doxygen/int__set_8h__dep__incl.svg        |  967 ++---
 docs/api/doxygen/int__set_8h_source.html           |    4 +-
 docs/api/doxygen/int__solver_8h_source.html        |    4 +-
 docs/api/doxygen/interpreter_8h.html               |    2 +-
 docs/api/doxygen/interpreter_8h__incl.svg          | 1949 +++++----
 docs/api/doxygen/interpreter_8h_source.html        |   10 +-
 docs/api/doxygen/ir_2adt_8h.html                   |    2 +-
 docs/api/doxygen/ir_2adt_8h__dep__incl.svg         | 1326 +++---
 docs/api/doxygen/ir_2adt_8h_source.html            |    4 +-
 docs/api/doxygen/ir_2attrs_8h.html                 |    2 +-
 docs/api/doxygen/ir_2attrs_8h__dep__incl.svg       |    2 +-
 docs/api/doxygen/ir_2attrs_8h_source.html          |   12 +-
 docs/api/doxygen/ir_2expr_8h.html                  |    2 +-
 docs/api/doxygen/ir_2expr_8h__dep__incl.svg        |  973 +++--
 docs/api/doxygen/ir_2expr_8h_source.html           |   12 +-
 docs/api/doxygen/ir_2function_8h.html              |    2 +-
 docs/api/doxygen/ir_2function_8h__dep__incl.svg    | 1315 +++---
 docs/api/doxygen/ir_2function_8h_source.html       |    6 +-
 docs/api/doxygen/ir_2module_8h.html                |    2 +-
 docs/api/doxygen/ir_2module_8h__dep__incl.svg      | 1334 +++---
 docs/api/doxygen/ir_2module_8h_source.html         |    6 +-
 docs/api/doxygen/ir_2op_8h.html                    |    2 +-
 docs/api/doxygen/ir_2op_8h_source.html             |    2 +-
 docs/api/doxygen/ir_2transform_8h.html             |    4 +-
 docs/api/doxygen/ir_2transform_8h__dep__incl.svg   |  883 +---
 docs/api/doxygen/ir_2transform_8h_source.html      |   12 +-
 docs/api/doxygen/ir_2type_8h.html                  |    2 +-
 docs/api/doxygen/ir_2type_8h__dep__incl.svg        | 1575 ++++----
 docs/api/doxygen/ir_2type_8h_source.html           |    6 +-
 .../doxygen/local__response__norm_8h_source.html   |    4 +-
 docs/api/doxygen/loop__state_8h.html               |    2 +-
 docs/api/doxygen/loop__state_8h__incl.svg          |  949 ++---
 docs/api/doxygen/loop__state_8h_source.html        |   14 +-
 docs/api/doxygen/measure_8h.html                   |    4 +-
 docs/api/doxygen/measure_8h__incl.svg              | 1607 ++++----
 docs/api/doxygen/measure_8h_source.html            |   65 +-
 docs/api/doxygen/measure__record_8h.html           |    2 +-
 docs/api/doxygen/measure__record_8h__incl.svg      | 1563 ++++----
 docs/api/doxygen/measure__record_8h_source.html    |    4 +-
 docs/api/doxygen/memory__manager_8h_source.html    |    8 +-
 docs/api/doxygen/namespacemembers.html             |    3 +
 docs/api/doxygen/namespacemembers_c.html           |    9 +-
 docs/api/doxygen/namespacemembers_e.html           |   10 +-
 docs/api/doxygen/namespacemembers_enum.html        |    6 +
 docs/api/doxygen/namespacemembers_func.html        |    3 +
 docs/api/doxygen/namespacemembers_func_c.html      |    9 +-
 docs/api/doxygen/namespacemembers_func_e.html      |    5 +-
 docs/api/doxygen/namespacemembers_func_h.html      |    3 -
 docs/api/doxygen/namespacemembers_func_i.html      |    3 -
 docs/api/doxygen/namespacemembers_func_l.html      |    5 +-
 docs/api/doxygen/namespacemembers_func_m.html      |   12 +-
 docs/api/doxygen/namespacemembers_func_o.html      |   47 +-
 docs/api/doxygen/namespacemembers_func_r.html      |    6 -
 docs/api/doxygen/namespacemembers_func_s.html      |    7 +-
 docs/api/doxygen/namespacemembers_h.html           |    3 -
 docs/api/doxygen/namespacemembers_i.html           |    3 -
 docs/api/doxygen/namespacemembers_l.html           |    7 +-
 docs/api/doxygen/namespacemembers_m.html           |   13 +-
 docs/api/doxygen/namespacemembers_o.html           |   39 +-
 docs/api/doxygen/namespacemembers_r.html           |    8 +-
 docs/api/doxygen/namespacemembers_s.html           |    6 +-
 docs/api/doxygen/namespacemembers_type.html        |    3 +
 docs/api/doxygen/namespaces.html                   |   38 +-
 docs/api/doxygen/namespacetvm.html                 |   11 +-
 .../doxygen/namespacetvm_1_1auto__scheduler.html   |    8 +-
 docs/api/doxygen/namespacetvm_1_1relay.html        |   39 +
 docs/api/doxygen/namespacetvm_1_1runtime.html      |    2 +
 .../namespacetvm_1_1runtime_1_1micro__rpc.html     |  259 ++
 .../api/doxygen/namespacetvm_1_1runtime_1_1vm.html |    5 +-
 docs/api/doxygen/namespacetvm_1_1target.html       |  321 --
 docs/api/doxygen/namespacetvm_1_1tir.html          |    2 +-
 docs/api/doxygen/namespacetvm_1_1topi.html         |  191 +-
 docs/api/doxygen/ndarray_8h.html                   |    2 +-
 docs/api/doxygen/ndarray_8h__dep__incl.svg         | 1437 ++++---
 docs/api/doxygen/ndarray_8h_source.html            |   18 +-
 docs/api/doxygen/nn_2bnn_8h_source.html            |    6 +-
 docs/api/doxygen/nn_2dense_8h_source.html          |    2 +-
 docs/api/doxygen/nn_2pooling_8h.html               |    2 +-
 docs/api/doxygen/nn_2pooling_8h__incl.svg          | 1107 ++---
 docs/api/doxygen/nn_2pooling_8h_source.html        |   12 +-
 docs/api/doxygen/nn_2softmax_8h.html               |    2 +-
 docs/api/doxygen/nn_2softmax_8h__incl.svg          | 1083 ++---
 docs/api/doxygen/nn_2softmax_8h_source.html        |    8 +-
 docs/api/doxygen/node_2container_8h.html           |    2 +-
 docs/api/doxygen/node_2container_8h__dep__incl.svg | 1570 ++++----
 docs/api/doxygen/node_2container_8h_source.html    |  124 +-
 docs/api/doxygen/node_8h.html                      |    2 +-
 docs/api/doxygen/node_8h__dep__incl.svg            | 1505 ++++---
 docs/api/doxygen/node_8h_source.html               |    4 +-
 docs/api/doxygen/object_8h.html                    |   16 +-
 docs/api/doxygen/object_8h__dep__incl.svg          | 1349 ++++---
 docs/api/doxygen/object_8h_source.html             |   94 +-
 docs/api/doxygen/op__strategy_8h.html              |    2 +-
 docs/api/doxygen/op__strategy_8h__incl.svg         | 2235 +++++------
 docs/api/doxygen/op__strategy_8h_source.html       |    8 +-
 docs/api/doxygen/operation_8h.html                 |    2 +-
 docs/api/doxygen/operation_8h__dep__incl.svg       |  841 ++--
 docs/api/doxygen/operation_8h_source.html          |   12 +-
 docs/api/doxygen/packed__func_8h.html              |    2 +-
 docs/api/doxygen/packed__func_8h__dep__incl.svg    | 1428 ++++---
 docs/api/doxygen/packed__func_8h_source.html       |   16 +-
 docs/api/doxygen/pattern__functor_8h_source.html   |    2 +-
 docs/api/doxygen/platform_8h.html                  |   19 +-
 .../{crt_8h__incl.svg => platform_8h__incl.svg}    |    6 +-
 docs/api/doxygen/platform_8h_source.html           |    4 +-
 docs/api/doxygen/ravel__unravel_8h_source.html     |    2 +-
 docs/api/doxygen/reduction_8h.html                 |    2 +-
 docs/api/doxygen/reduction_8h__incl.svg            | 1007 ++---
 docs/api/doxygen/reduction_8h_source.html          |   10 +-
 docs/api/doxygen/reflection_8h.html                |    6 +-
 docs/api/doxygen/reflection_8h__dep__incl.svg      | 1289 +++---
 docs/api/doxygen/reflection_8h_source.html         |    6 +-
 docs/api/doxygen/relay_2adt_8h_source.html         |    6 +-
 docs/api/doxygen/relay_2analysis_8h.html           |    3 +
 docs/api/doxygen/relay_2analysis_8h_source.html    |    4 +-
 docs/api/doxygen/relay_2attrs_2nn_8h_source.html   |    2 +-
 docs/api/doxygen/relay_2expr_8h_source.html        |  124 +-
 .../doxygen/relay_2expr__functor_8h_source.html    |   14 +-
 docs/api/doxygen/relay_2function_8h_source.html    |    4 +-
 docs/api/doxygen/relay_2op__attr__types_8h.html    |    2 +-
 .../doxygen/relay_2op__attr__types_8h__incl.svg    | 2244 +++++------
 .../doxygen/relay_2op__attr__types_8h_source.html  |    4 +-
 docs/api/doxygen/relay_2qnn_2transform_8h.html     |    2 +-
 .../api/doxygen/relay_2qnn_2transform_8h__incl.svg | 1013 +++--
 docs/api/doxygen/relay_2transform_8h.html          |    2 +-
 docs/api/doxygen/relay_2transform_8h__incl.svg     | 1132 +++---
 docs/api/doxygen/relay_2transform_8h_source.html   |    6 +-
 docs/api/doxygen/reorg_8h.html                     |    2 +-
 docs/api/doxygen/reorg_8h__incl.svg                | 1041 ++---
 docs/api/doxygen/reorg_8h_source.html              |    4 +-
 docs/api/doxygen/repr__printer_8h.html             |    2 +-
 docs/api/doxygen/repr__printer_8h__dep__incl.svg   | 1506 ++++---
 docs/api/doxygen/repr__printer_8h_source.html      |    2 +-
 docs/api/doxygen/rocblas_8h_source.html            |    2 +-
 docs/api/doxygen/rocm_2dense_8h.html               |    2 +-
 docs/api/doxygen/rocm_2dense_8h__incl.svg          | 1742 ++++----
 docs/api/doxygen/rocm_2dense_8h_source.html        |   10 +-
 docs/api/doxygen/rocm_2injective_8h.html           |    2 +-
 docs/api/doxygen/rocm_2injective_8h__incl.svg      | 1980 +++++----
 docs/api/doxygen/rocm_2injective_8h_source.html    |    7 +-
 docs/api/doxygen/rocm_2normalization_8h.html       |    2 +-
 docs/api/doxygen/rocm_2normalization_8h__incl.svg  | 1728 ++++----
 .../api/doxygen/rocm_2normalization_8h_source.html |    5 +-
 docs/api/doxygen/rocm_2pooling_8h.html             |    2 +-
 docs/api/doxygen/rocm_2pooling_8h__incl.svg        | 1998 +++++----
 docs/api/doxygen/rocm_2pooling_8h_source.html      |    7 +-
 docs/api/doxygen/rocm_2reduction_8h.html           |    2 +-
 docs/api/doxygen/rocm_2reduction_8h__incl.svg      | 1980 +++++----
 docs/api/doxygen/rocm_2reduction_8h_source.html    |    7 +-
 docs/api/doxygen/rocm_2softmax_8h.html             |    2 +-
 docs/api/doxygen/rocm_2softmax_8h__incl.svg        | 1980 +++++----
 docs/api/doxygen/rocm_2softmax_8h_source.html      |    7 +-
 docs/api/doxygen/runtime_2container_8h.html        |    2 +-
 .../doxygen/runtime_2container_8h__dep__incl.svg   | 1338 +++----
 docs/api/doxygen/runtime_2container_8h_source.html |  279 +-
 docs/api/doxygen/runtime_2memory_8h.html           |    2 +-
 docs/api/doxygen/runtime_2memory_8h__dep__incl.svg | 1434 ++++---
 docs/api/doxygen/runtime_2memory_8h_source.html    |    4 +-
 docs/api/doxygen/runtime_2module_8h.html           |    2 +-
 docs/api/doxygen/runtime_2module_8h__dep__incl.svg | 1438 ++++---
 docs/api/doxygen/runtime_2module_8h_source.html    |    8 +-
 docs/api/doxygen/runtime_2vm_2vm_8h_source.html    |   65 +-
 docs/api/doxygen/schedule_8h_source.html           |    8 +-
 docs/api/doxygen/search/all_0.js                   |    6 +-
 docs/api/doxygen/search/all_1.js                   |   30 +-
 docs/api/doxygen/search/all_10.js                  |   27 +-
 docs/api/doxygen/search/all_12.js                  |   25 +-
 docs/api/doxygen/search/all_13.js                  |   51 +-
 docs/api/doxygen/search/all_14.js                  |   84 +-
 docs/api/doxygen/search/all_15.js                  |    8 +
 docs/api/doxygen/search/all_16.js                  |    6 +-
 docs/api/doxygen/search/all_17.js                  |    9 +-
 docs/api/doxygen/search/all_19.js                  |    3 +-
 docs/api/doxygen/search/all_2.js                   |   11 +-
 docs/api/doxygen/search/all_3.js                   |   20 +-
 docs/api/doxygen/search/all_4.js                   |   10 +-
 docs/api/doxygen/search/all_5.js                   |   13 +-
 docs/api/doxygen/search/all_6.js                   |   19 +-
 docs/api/doxygen/search/all_7.js                   |    6 +-
 docs/api/doxygen/search/all_8.js                   |    3 +-
 docs/api/doxygen/search/all_9.js                   |   19 +-
 docs/api/doxygen/search/all_b.js                   |   31 +-
 docs/api/doxygen/search/all_c.js                   |    8 +-
 docs/api/doxygen/search/all_d.js                   |   13 +-
 docs/api/doxygen/search/all_e.js                   |    2 +-
 docs/api/doxygen/search/all_f.js                   |    8 +-
 docs/api/doxygen/search/classes_10.js              |    6 +-
 docs/api/doxygen/search/classes_11.js              |    3 +
 docs/api/doxygen/search/classes_12.js              |    1 +
 docs/api/doxygen/search/classes_14.js              |    3 +-
 docs/api/doxygen/search/classes_5.js               |    2 +
 docs/api/doxygen/search/classes_d.js               |    1 +
 docs/api/doxygen/search/classes_f.js               |    2 +-
 docs/api/doxygen/search/defines_7.js               |    4 +-
 docs/api/doxygen/search/enums_4.js                 |    3 +-
 docs/api/doxygen/search/enums_5.js                 |    5 +-
 docs/api/doxygen/search/enums_6.js                 |    4 +-
 docs/api/doxygen/search/enums_7.js                 |    4 +-
 docs/api/doxygen/search/enums_8.js                 |    3 +-
 docs/api/doxygen/search/enums_9.js                 |    3 +-
 docs/api/doxygen/search/enums_a.js                 |    9 +-
 docs/api/doxygen/search/enums_b.html               |   26 +
 docs/api/doxygen/search/{enums_a.js => enums_b.js} |    0
 docs/api/doxygen/search/enumvalues_1.js            |   19 +-
 docs/api/doxygen/search/enumvalues_2.js            |   21 +-
 docs/api/doxygen/search/enumvalues_3.js            |    7 +-
 docs/api/doxygen/search/enumvalues_4.js            |  155 +-
 docs/api/doxygen/search/enumvalues_5.js            |  178 +-
 docs/api/doxygen/search/enumvalues_6.js            |    3 +-
 docs/api/doxygen/search/enumvalues_7.js            |    2 +-
 docs/api/doxygen/search/enumvalues_8.js            |    2 +-
 docs/api/doxygen/search/enumvalues_9.js            |    3 +-
 docs/api/doxygen/search/enumvalues_a.js            |    4 +-
 docs/api/doxygen/search/enumvalues_b.js            |    3 +-
 docs/api/doxygen/search/enumvalues_c.js            |    2 +-
 docs/api/doxygen/search/enumvalues_d.html          |   26 +
 .../search/{enumvalues_c.js => enumvalues_d.js}    |    0
 docs/api/doxygen/search/files_10.js                |    1 +
 docs/api/doxygen/search/files_12.js                |    3 +-
 docs/api/doxygen/search/files_5.js                 |    2 +
 docs/api/doxygen/search/files_e.js                 |    1 +
 docs/api/doxygen/search/files_f.js                 |    1 +
 docs/api/doxygen/search/functions_1.js             |    6 +-
 docs/api/doxygen/search/functions_10.js            |    6 +-
 docs/api/doxygen/search/functions_12.js            |   13 +-
 docs/api/doxygen/search/functions_13.js            |   19 +-
 docs/api/doxygen/search/functions_14.js            |   11 +-
 docs/api/doxygen/search/functions_15.js            |    4 +
 docs/api/doxygen/search/functions_16.js            |    4 +-
 docs/api/doxygen/search/functions_17.js            |    4 +-
 docs/api/doxygen/search/functions_19.js            |    3 +-
 docs/api/doxygen/search/functions_2.js             |    3 +-
 docs/api/doxygen/search/functions_3.js             |    9 +-
 docs/api/doxygen/search/functions_4.js             |    1 +
 docs/api/doxygen/search/functions_5.js             |    3 +-
 docs/api/doxygen/search/functions_6.js             |    7 +-
 docs/api/doxygen/search/functions_7.js             |    4 +-
 docs/api/doxygen/search/functions_8.js             |    1 -
 docs/api/doxygen/search/functions_9.js             |    5 +-
 docs/api/doxygen/search/functions_c.js             |    4 +-
 docs/api/doxygen/search/functions_d.js             |    4 +-
 docs/api/doxygen/search/functions_f.js             |    5 +-
 docs/api/doxygen/search/namespaces_2.js            |    4 +-
 docs/api/doxygen/search/related_0.js               |    4 +-
 docs/api/doxygen/search/related_5.js               |    2 +-
 docs/api/doxygen/search/related_b.js               |    7 +-
 docs/api/doxygen/search/searchdata.js              |    6 +-
 docs/api/doxygen/search/typedefs_10.html           |   26 +
 .../search/{typedefs_f.js => typedefs_10.js}       |    0
 docs/api/doxygen/search/typedefs_11.html           |   26 +
 docs/api/doxygen/search/typedefs_11.js             |    4 +
 docs/api/doxygen/search/typedefs_2.js              |    2 +-
 docs/api/doxygen/search/typedefs_9.js              |    3 +-
 docs/api/doxygen/search/typedefs_d.js              |    1 +
 docs/api/doxygen/search/typedefs_f.js              |    3 +-
 docs/api/doxygen/search/variables_0.js             |    6 +-
 docs/api/doxygen/search/variables_1.js             |    2 +-
 docs/api/doxygen/search/variables_11.js            |    4 +-
 docs/api/doxygen/search/variables_3.js             |    3 +-
 docs/api/doxygen/search/variables_4.js             |    5 +-
 docs/api/doxygen/search/variables_5.js             |    1 +
 docs/api/doxygen/search/variables_a.js             |    6 +-
 docs/api/doxygen/search/variables_b.js             |    1 +
 docs/api/doxygen/search/variables_c.js             |    3 +-
 docs/api/doxygen/search/variables_d.js             |    2 +-
 docs/api/doxygen/search/variables_e.js             |    1 +
 docs/api/doxygen/search/variables_f.js             |    2 +
 docs/api/doxygen/search__policy_8h.html            |    2 +-
 docs/api/doxygen/search__policy_8h__incl.svg       | 1579 ++++----
 docs/api/doxygen/search__policy_8h_source.html     |    8 +-
 docs/api/doxygen/search__task_8h.html              |    2 +-
 docs/api/doxygen/search__task_8h__incl.svg         | 1725 ++++----
 docs/api/doxygen/search__task_8h_source.html       |    8 +-
 docs/api/doxygen/serializer_8h.html                |    2 +-
 docs/api/doxygen/serializer_8h__dep__incl.svg      | 1435 ++++---
 .../{runtime_2memory_8h.html => session_8h.html}   |   72 +-
 docs/api/doxygen/session_8h__incl.svg              |  150 +
 docs/api/doxygen/session_8h_source.html            |  122 +
 docs/api/doxygen/source__map_8h_source.html        |    6 +-
 docs/api/doxygen/span_8h.html                      |    2 +-
 docs/api/doxygen/span_8h__dep__incl.svg            | 1424 ++++---
 docs/api/doxygen/span_8h_source.html               |    6 +-
 docs/api/doxygen/stmt_8h.html                      |    2 +-
 docs/api/doxygen/stmt_8h__dep__incl.svg            |  929 ++---
 docs/api/doxygen/stmt_8h_source.html               |    8 +-
 docs/api/doxygen/stmt__functor_8h_source.html      |    8 +-
 ...me_1_1micro__rpc_1_1SessionHeader-members.html} |   10 +-
 ...1_1runtime_1_1micro__rpc_1_1SessionHeader.html} |   63 +-
 ...1_1micro__rpc_1_1SessionHeader__coll__graph.svg |   24 +
 ...vm_1_1runtime_1_1vm_1_1Instruction-members.html |  115 +-
 .../structtvm_1_1runtime_1_1vm_1_1Instruction.html |  146 +-
 ..._1runtime_1_1vm_1_1Instruction__coll__graph.svg |    4 +-
 ...vm_1_1runtime_1_1vm_1_1VMFrame__coll__graph.svg |    4 +-
 ...tvm_1_1runtime_1_1vm_1_1VMFunction-members.html |    7 +-
 .../structtvm_1_1runtime_1_1vm_1_1VMFunction.html  |   33 +-
 ...1_1runtime_1_1vm_1_1VMFunction__coll__graph.svg |   23 +-
 docs/api/doxygen/structural__equal_8h.html         |    2 +-
 .../doxygen/structural__equal_8h__dep__incl.svg    | 1307 +++---
 docs/api/doxygen/structural__hash_8h.html          |    2 +-
 .../api/doxygen/structural__hash_8h__dep__incl.svg | 1307 +++---
 .../doxygen/{generic__func_8h.html => tag_8h.html} |   83 +-
 docs/api/doxygen/tag_8h__incl.svg                  |  954 +++++
 docs/api/doxygen/tag_8h_source.html                |  122 +
 docs/api/doxygen/tags_8h.html                      |    2 +-
 docs/api/doxygen/tags_8h__dep__incl.svg            |  615 +--
 docs/api/doxygen/target_8h.html                    |   39 +-
 docs/api/doxygen/target_8h__dep__incl.svg          |  130 +-
 docs/api/doxygen/target_8h__incl.svg               | 1787 ++++-----
 docs/api/doxygen/target_8h_source.html             |   53 +-
 docs/api/doxygen/target__info_8h_source.html       |    2 +-
 docs/api/doxygen/target__kind_8h.html              |   42 +-
 docs/api/doxygen/target__kind_8h__dep__incl.svg    |  140 +-
 docs/api/doxygen/target__kind_8h__incl.svg         | 1772 ++++----
 docs/api/doxygen/target__kind_8h_source.html       |   55 +-
 docs/api/doxygen/tensor_8h_source.html             |    8 +-
 docs/api/doxygen/tensor__intrin_8h_source.html     |    2 +-
 docs/api/doxygen/tensor__type_8h_source.html       |    6 +-
 docs/api/doxygen/tir_2analysis_8h.html             |    2 +-
 docs/api/doxygen/tir_2analysis_8h__dep__incl.svg   |  221 +-
 docs/api/doxygen/tir_2analysis_8h_source.html      |    2 +-
 docs/api/doxygen/tir_2expr_8h_source.html          |   10 +-
 docs/api/doxygen/tir_2function_8h.html             |    2 +-
 docs/api/doxygen/tir_2function_8h__dep__incl.svg   |  423 +-
 docs/api/doxygen/tir_2function_8h_source.html      |    6 +-
 docs/api/doxygen/tir_2op__attr__types_8h.html      |    2 +-
 .../doxygen/tir_2op__attr__types_8h__dep__incl.svg |  231 +-
 .../doxygen/tir_2op__attr__types_8h_source.html    |    2 +-
 docs/api/doxygen/topi_2nn_8h_source.html           |    4 +-
 docs/api/doxygen/topi_2transform_8h.html           |    6 +-
 docs/api/doxygen/topi_2transform_8h__incl.svg      | 2110 +++++-----
 docs/api/doxygen/topi_2transform_8h_source.html    |   79 +-
 docs/api/doxygen/transform__step_8h.html           |    3 +-
 docs/api/doxygen/transform__step_8h__incl.svg      |  935 ++---
 docs/api/doxygen/transform__step_8h_source.html    |  149 +-
 docs/api/doxygen/type__functor_8h_source.html      |    2 +-
 docs/api/doxygen/type__relation_8h_source.html     |    2 +-
 docs/api/doxygen/util_8h_source.html               |    4 +-
 docs/api/doxygen/utvm__rpc__server_8h.html         |  298 ++
 docs/api/doxygen/utvm__rpc__server_8h__incl.svg    |   39 +
 docs/api/doxygen/utvm__rpc__server_8h_source.html  |  105 +
 docs/api/doxygen/var_8h_source.html                |    6 +-
 docs/api/doxygen/with_8h.html                      |    2 +-
 docs/api/doxygen/with_8h__dep__incl.svg            | 1089 +++--
 ...registry__map_8h.html => write__stream_8h.html} |   38 +-
 docs/api/doxygen/write__stream_8h__dep__incl.svg   |   53 +
 docs/api/doxygen/write__stream_8h__incl.svg        |   64 +
 docs/api/doxygen/write__stream_8h_source.html      |  108 +
 docs/api/doxygen/x86_2bnn_8h.html                  |    2 +-
 docs/api/doxygen/x86_2bnn_8h__incl.svg             | 1738 ++++----
 docs/api/doxygen/x86_2bnn_8h_source.html           |    6 +-
 docs/api/doxygen/x86_2default_8h.html              |    2 +-
 docs/api/doxygen/x86_2default_8h__incl.svg         | 1988 +++++----
 docs/api/doxygen/x86_2default_8h_source.html       |    6 +-
 docs/api/doxygen/x86_2injective_8h.html            |    2 +-
 docs/api/doxygen/x86_2injective_8h__incl.svg       | 1738 ++++----
 docs/api/doxygen/x86_2injective_8h_source.html     |    6 +-
 docs/api/links.html                                |    3 +-
 docs/api/python/auto_scheduler.html                |  398 ++
 docs/api/python/autotvm.html                       |   22 +-
 docs/api/python/contrib.html                       |    4 +-
 docs/api/python/driver.html                        |    4 +-
 docs/api/python/error.html                         |    4 +-
 docs/api/python/graph_runtime.html                 |   28 +-
 docs/api/python/index.html                         |    8 +-
 docs/api/python/ir.html                            |   22 +-
 docs/api/python/micro.html                         |  722 +++-
 docs/api/python/ndarray.html                       |    4 +-
 docs/api/python/relay/analysis.html                |   44 +-
 docs/api/python/relay/backend.html                 |    4 +-
 docs/api/python/relay/dataflow_pattern.html        |    4 +-
 docs/api/python/relay/frontend.html                |   13 +-
 docs/api/python/relay/image.html                   |    4 +-
 docs/api/python/relay/index.html                   |  290 +-
 docs/api/python/relay/nn.html                      |    4 +-
 docs/api/python/relay/testing.html                 |    4 +-
 docs/api/python/relay/transform.html               |  100 +-
 docs/api/python/relay/vision.html                  |    4 +-
 docs/api/python/rpc.html                           |   66 +-
 docs/api/python/runtime.html                       |    4 +-
 docs/api/python/target.html                        |   92 +-
 docs/api/python/te.html                            |    4 +-
 docs/api/python/tir.html                           |    4 +-
 docs/api/python/topi.html                          |  273 +-
 docs/api/python/vta/index.html                     |    4 +-
 docs/api/typedoc/classes/bytestreamreader.html     |   12 +-
 docs/api/typedoc/classes/cachedcallstack.html      |   34 +-
 docs/api/typedoc/classes/dlcontext.html            |   10 +-
 docs/api/typedoc/classes/dldatatype.html           |   12 +-
 docs/api/typedoc/classes/environment.html          |   12 +-
 docs/api/typedoc/classes/ffilibrary.html           |   20 +-
 docs/api/typedoc/classes/graphruntime.html         |   16 +-
 docs/api/typedoc/classes/instance.html             |   40 +-
 docs/api/typedoc/classes/memory.html               |   34 +-
 docs/api/typedoc/classes/module.html               |   10 +-
 docs/api/typedoc/classes/ndarray.html              |   22 +-
 docs/api/typedoc/classes/packedfunccell.html       |    6 +-
 docs/api/typedoc/classes/rpcserver.html            |   14 +-
 docs/api/typedoc/classes/scalar.html               |    6 +-
 docs/api/typedoc/classes/webgpucontext.html        |   12 +-
 docs/api/typedoc/enums/argtypecode.html            |   30 +-
 docs/api/typedoc/enums/aynccallbackcode.html       |    4 +-
 docs/api/typedoc/enums/dldatatypecode.html         |    8 +-
 docs/api/typedoc/enums/rpcserverstate.html         |   12 +-
 docs/api/typedoc/enums/sizeof.html                 |   18 +-
 docs/api/typedoc/index.html                        |  114 +-
 docs/api/typedoc/interfaces/disposable.html        |    2 +-
 docs/api/typedoc/interfaces/functioninfo.html      |    6 +-
 docs/api/typedoc/interfaces/libraryprovider.html   |    4 +-
 docs/contribute/code_guide.html                    |    5 +-
 docs/contribute/code_review.html                   |    3 +-
 docs/contribute/committer_guide.html               |    3 +-
 docs/contribute/community.html                     |    3 +-
 docs/contribute/document.html                      |    3 +-
 docs/contribute/error_handling.html                |    3 +-
 docs/contribute/git_howto.html                     |    3 +-
 docs/contribute/index.html                         |    3 +-
 docs/contribute/pull_request.html                  |    3 +-
 docs/contribute/release_process.html               |    3 +-
 docs/deploy/android.html                           |    3 +-
 docs/deploy/arm_compute_lib.html                   |    3 +-
 docs/deploy/cpp_deploy.html                        |    3 +-
 docs/deploy/hls.html                               |    3 +-
 docs/deploy/index.html                             |    3 +-
 docs/deploy/integrate.html                         |    3 +-
 docs/dev/benchmark.html                            |    3 +-
 docs/dev/codebase_walkthrough.html                 |    3 +-
 docs/dev/convert_layout.html                       |    3 +-
 docs/dev/debugger.html                             |    3 +-
 docs/dev/frontend/tensorflow.html                  |    3 +-
 docs/dev/how_to.html                               |    3 +-
 docs/dev/hybrid_script.html                        |    3 +-
 docs/dev/index.html                                |    3 +-
 docs/dev/inferbound.html                           |   11 +-
 docs/dev/introduction_to_module_serialization.html |    3 +-
 docs/dev/pass_infra.html                           |    3 +-
 docs/dev/relay_add_op.html                         |    3 +-
 docs/dev/relay_add_pass.html                       |    3 +-
 docs/dev/relay_bring_your_own_codegen.html         |    3 +-
 docs/dev/relay_intro.html                          |    3 +-
 docs/dev/relay_op_strategy.html                    |    3 +-
 docs/dev/runtime.html                              |    3 +-
 docs/dev/security.html                             |    3 +-
 docs/dev/virtual_machine.html                      |    3 +-
 docs/faq.html                                      |    5 +-
 docs/genindex.html                                 |  188 +-
 docs/index.html                                    |    3 +-
 docs/install/docker.html                           |    5 +-
 docs/install/from_source.html                      |    9 +-
 docs/install/index.html                            |    3 +-
 docs/install/nnpack.html                           |    3 +-
 docs/langref/hybrid_script.html                    |    3 +-
 docs/langref/index.html                            |    3 +-
 docs/langref/relay_adt.html                        |    3 +-
 docs/langref/relay_expr.html                       |    3 +-
 docs/langref/relay_op.html                         |    3 +-
 docs/langref/relay_pattern.html                    |    3 +-
 docs/langref/relay_type.html                       |    3 +-
 docs/objects.inv                                   |  Bin 15987 -> 16576 bytes
 docs/py-modindex.html                              |   12 +-
 docs/search.html                                   |    3 +-
 docs/searchindex.js                                |    2 +-
 .../sg_execution_times.html                        |   11 +-
 docs/tutorials/auto_scheduler/tune_matmul_x86.html |  497 +++
 docs/tutorials/autotvm/sg_execution_times.html     |   17 +-
 docs/tutorials/autotvm/tune_conv2d_cuda.html       |  101 +-
 docs/tutorials/autotvm/tune_relay_arm.html         |  138 +-
 docs/tutorials/autotvm/tune_relay_cuda.html        |  142 +-
 docs/tutorials/autotvm/tune_relay_mobile_gpu.html  |  148 +-
 docs/tutorials/autotvm/tune_relay_x86.html         |  109 +-
 docs/tutorials/autotvm/tune_simple_template.html   |   77 +-
 docs/tutorials/dev/low_level_custom_pass.html      |   23 +-
 docs/tutorials/dev/sg_execution_times.html         |    9 +-
 docs/tutorials/dev/use_pass_infra.html             |  121 +-
 docs/tutorials/frontend/build_gcn.html             |  125 +-
 .../frontend/deploy_model_on_android.html          |  102 +-
 docs/tutorials/frontend/deploy_model_on_rasp.html  |   52 +-
 .../frontend/deploy_object_detection_pytorch.html  |  454 +++
 docs/tutorials/frontend/deploy_prequantized.html   |   43 +-
 .../frontend/deploy_prequantized_tflite.html       |   54 +-
 docs/tutorials/frontend/deploy_quantized.html      |   33 +-
 docs/tutorials/frontend/deploy_sparse.html         |   32 +-
 docs/tutorials/frontend/deploy_ssd_gluoncv.html    |   42 +-
 docs/tutorials/frontend/from_caffe2.html           |   52 +-
 docs/tutorials/frontend/from_coreml.html           |   45 +-
 docs/tutorials/frontend/from_darknet.html          |  122 +-
 docs/tutorials/frontend/from_keras.html            |   62 +-
 docs/tutorials/frontend/from_mxnet.html            |   59 +-
 docs/tutorials/frontend/from_onnx.html             |   46 +-
 docs/tutorials/frontend/from_pytorch.html          |   79 +-
 docs/tutorials/frontend/from_tensorflow.html       |   94 +-
 docs/tutorials/frontend/from_tflite.html           |   41 +-
 docs/tutorials/frontend/sg_execution_times.html    |   42 +-
 docs/tutorials/frontend/using_external_lib.html    |   17 +-
 .../get_started/cross_compilation_and_rpc.html     |   29 +-
 docs/tutorials/get_started/relay_quick_start.html  |  124 +-
 docs/tutorials/get_started/sg_execution_times.html |   11 +-
 .../get_started/tensor_expr_get_started.html       |   25 +-
 docs/tutorials/index.html                          |  172 +-
 docs/tutorials/language/extern_op.html             |   53 +-
 docs/tutorials/language/intrin_math.html           |   16 +-
 docs/tutorials/language/reduction.html             |   42 +-
 docs/tutorials/language/scan.html                  |   31 +-
 docs/tutorials/language/schedule_primitives.html   |   85 +-
 docs/tutorials/language/sg_execution_times.html    |   21 +-
 docs/tutorials/language/tedd.html                  |   35 +-
 docs/tutorials/language/tensorize.html             |  113 +-
 docs/tutorials/language/tuple_inputs.html          |   59 +-
 docs/tutorials/micro/micro_tflite.html             |  111 +-
 docs/tutorials/micro/sg_execution_times.html       |    7 +-
 docs/tutorials/optimize/opt_conv_cuda.html         |   44 +-
 docs/tutorials/optimize/opt_conv_tensorcore.html   |  236 +-
 docs/tutorials/optimize/opt_gemm.html              |  123 +-
 .../optimize/opt_matmul_auto_tensorcore.html       |  407 +-
 docs/tutorials/optimize/sg_execution_times.html    |   13 +-
 docs/tutorials/topi/intro_topi.html                |   13 +-
 docs/tutorials/topi/sg_execution_times.html        |    7 +-
 docs/vta/dev/config.html                           |    3 +-
 docs/vta/dev/hardware.html                         |    3 +-
 docs/vta/dev/index.html                            |    3 +-
 docs/vta/index.html                                |    3 +-
 docs/vta/install.html                              |   27 +-
 docs/vta/tutorials/autotvm/sg_execution_times.html |    7 +-
 docs/vta/tutorials/autotvm/tune_relay_vta.html     |  345 +-
 .../tutorials/frontend/deploy_classification.html  |   66 +-
 .../vta/tutorials/frontend/sg_execution_times.html |    7 +-
 docs/vta/tutorials/index.html                      |    3 +-
 docs/vta/tutorials/matrix_multiply.html            |   49 +-
 docs/vta/tutorials/optimize/convolution_opt.html   |  161 +-
 .../tutorials/optimize/matrix_multiply_opt.html    |  102 +-
 .../vta/tutorials/optimize/sg_execution_times.html |    9 +-
 docs/vta/tutorials/sg_execution_times.html         |    9 +-
 docs/vta/tutorials/vta_get_started.html            |   34 +-
 1156 files changed, 81207 insertions(+), 75800 deletions(-)

diff --git a/docs/_downloads/00a1355fcb7c30e9e70fc8fefc708f98/tuple_inputs.ipynb b/docs/_downloads/00a1355fcb7c30e9e70fc8fefc708f98/tuple_inputs.ipynb
index 54dd8a5..dedcacd 100644
--- a/docs/_downloads/00a1355fcb7c30e9e70fc8fefc708f98/tuple_inputs.ipynb
+++ b/docs/_downloads/00a1355fcb7c30e9e70fc8fefc708f98/tuple_inputs.ipynb
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "n = te.var(\"n\")\nm = te.var(\"m\")\nA0 = te.placeholder((m, n), name='A0')\nA1 = te.placeholder((m, n), name='A1')\nB0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] + 2, A1[i, j] * 3), name='B')\n\n# The generated IR code would be:\ns = te.create_schedule(B0.op)\nprint(tvm.lower(s, [A0, A1, B0, B1], simple_mode=True))"
+        "n = te.var(\"n\")\nm = te.var(\"m\")\nA0 = te.placeholder((m, n), name=\"A0\")\nA1 = te.placeholder((m, n), name=\"A1\")\nB0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] + 2, A1[i, j] * 3), name=\"B\")\n\n# The generated IR code would be:\ns = te.create_schedule(B0.op)\nprint(tvm.lower(s, [A0, A1, B0, B1], simple_mode=True))"
       ]
     },
     {
@@ -62,7 +62,7 @@
       },
       "outputs": [],
       "source": [
-        "# x and y are the operands of reduction, both of them is a tuple of index\n# and value.\ndef fcombine(x, y):\n    lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])\n    rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])\n    return lhs, rhs\n\n# our identity element also need to be a tuple, so `fidentity` accepts\n# two types as inputs.\ndef fidentity(t0, t1):\n    return tvm.tir.const(-1, t0), tvm.te.min_value(t1)\n\nargmax = te.comm_reducer(fcombine, fidentity, name='argmax')\n\n [...]
+        "# x and y are the operands of reduction, both of them is a tuple of index\n# and value.\ndef fcombine(x, y):\n    lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])\n    rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])\n    return lhs, rhs\n\n\n# our identity element also need to be a tuple, so `fidentity` accepts\n# two types as inputs.\ndef fidentity(t0, t1):\n    return tvm.tir.const(-1, t0), tvm.te.min_value(t1)\n\n\nargmax = te.comm_reducer(fcombine, fidentity, name=\"argmax\ [...]
       ]
     },
     {
@@ -87,7 +87,7 @@
       },
       "outputs": [],
       "source": [
-        "n = te.var(\"n\")\nm = te.var(\"m\")\nA0 = te.placeholder((m, n), name='A0')\nB0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] + 2, A0[i, j] * 3), name='B')\nA1 = te.placeholder((m, n), name='A1')\nC = te.compute((m, n), lambda i, j: A1[i, j] + B0[i, j], name='C')\n\ns = te.create_schedule(C.op)\ns[B0].compute_at(s[C], C.op.axis[0])\n# as you can see in the below generated IR code:\nprint(tvm.lower(s, [A0, A1, C], simple_mode=True))"
+        "n = te.var(\"n\")\nm = te.var(\"m\")\nA0 = te.placeholder((m, n), name=\"A0\")\nB0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] + 2, A0[i, j] * 3), name=\"B\")\nA1 = te.placeholder((m, n), name=\"A1\")\nC = te.compute((m, n), lambda i, j: A1[i, j] + B0[i, j], name=\"C\")\n\ns = te.create_schedule(C.op)\ns[B0].compute_at(s[C], C.op.axis[0])\n# as you can see in the below generated IR code:\nprint(tvm.lower(s, [A0, A1, C], simple_mode=True))"
       ]
     },
     {
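
Note: the reformatted cell above implements a tuple-valued reduction: te.comm_reducer combines (index, value) pairs so the reduction tracks an argmax. A minimal standalone sketch of that pattern, lifted directly from the cell contents:

    import tvm
    from tvm import te

    # x and y are (index, value) operand pairs; keep the pair with the larger value.
    def fcombine(x, y):
        lhs = tvm.tir.Select(x[1] >= y[1], x[0], y[0])
        rhs = tvm.tir.Select(x[1] >= y[1], x[1], y[1])
        return lhs, rhs

    # The identity element is also a tuple: an invalid index and the minimum value.
    def fidentity(t0, t1):
        return tvm.tir.const(-1, t0), tvm.te.min_value(t1)

    argmax = te.comm_reducer(fcombine, fidentity, name="argmax")
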
diff --git a/docs/_downloads/02fc8627299fa0b05eb017773b471bfa/from_tflite.py b/docs/_downloads/02fc8627299fa0b05eb017773b471bfa/from_tflite.py
index c0b2a03..ee7da62 100644
--- a/docs/_downloads/02fc8627299fa0b05eb017773b471bfa/from_tflite.py
+++ b/docs/_downloads/02fc8627299fa0b05eb017773b471bfa/from_tflite.py
@@ -57,15 +57,17 @@ Below you can find an example on how to compile TFLite model using TVM.
 # ----------------------------------------------
 import os
 
+
 def extract(path):
     import tarfile
+
     if path.endswith("tgz") or path.endswith("gz"):
         dir_path = os.path.dirname(path)
         tar = tarfile.open(path)
         tar.extractall(path=dir_path)
         tar.close()
     else:
-        raise RuntimeError('Could not decompress the file: ' + path)
+        raise RuntimeError("Could not decompress the file: " + path)
 
 
 ######################################################################
@@ -77,7 +79,7 @@ from tvm.contrib.download import download_testdata
 model_url = "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz"
 
 # Download model tar file and extract it to get mobilenet_v1_1.0_224.tflite
-model_path = download_testdata(model_url, "mobilenet_v1_1.0_224.tgz", module=['tf', 'official'])
+model_path = download_testdata(model_url, "mobilenet_v1_1.0_224.tgz", module=["tf", "official"])
 model_dir = os.path.dirname(model_path)
 extract(model_path)
 
@@ -88,9 +90,11 @@ tflite_model_buf = open(tflite_model_file, "rb").read()
 # Get TFLite model from buffer
 try:
     import tflite
+
     tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
 except AttributeError:
     import tflite.Model
+
     tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
 
 ######################################################################
@@ -101,8 +105,8 @@ from PIL import Image
 from matplotlib import pyplot as plt
 import numpy as np
 
-image_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
-image_path = download_testdata(image_url, 'cat.png', module='data')
+image_url = "https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true"
+image_path = download_testdata(image_url, "cat.png", module="data")
 resized_image = Image.open(image_path).resize((224, 224))
 plt.imshow(resized_image)
 plt.show()
@@ -116,7 +120,7 @@ image_data = np.expand_dims(image_data, axis=0)
 image_data[:, :, :, 0] = 2.0 / 255.0 * image_data[:, :, :, 0] - 1
 image_data[:, :, :, 1] = 2.0 / 255.0 * image_data[:, :, :, 1] - 1
 image_data[:, :, :, 2] = 2.0 / 255.0 * image_data[:, :, :, 2] - 1
-print('input', image_data.shape)
+print("input", image_data.shape)
 
 ######################################################################
 # Compile the model with relay
@@ -129,9 +133,10 @@ input_dtype = "float32"
 
 # Parse TFLite model and convert it to a Relay module
 from tvm import relay, transform
-mod, params = relay.frontend.from_tflite(tflite_model,
-                                         shape_dict={input_tensor: input_shape},
-                                         dtype_dict={input_tensor: input_dtype})
+
+mod, params = relay.frontend.from_tflite(
+    tflite_model, shape_dict={input_tensor: input_shape}, dtype_dict={input_tensor: input_dtype}
+)
 
 # Build the module against x86 CPU
 target = "llvm"
@@ -146,7 +151,7 @@ from tvm import te
 from tvm.contrib import graph_runtime as runtime
 
 # Create a runtime executor module
-module = runtime.GraphModule(lib['default'](tvm.cpu()))
+module = runtime.GraphModule(lib["default"](tvm.cpu()))
 
 # Feed input data
 module.set_input(input_tensor, tvm.nd.array(image_data))
@@ -162,12 +167,16 @@ tvm_output = module.get_output(0).asnumpy()
 # ---------------
 
 # Load label file
-label_file_url = ''.join(['https://raw.githubusercontent.com/',
-                          'tensorflow/tensorflow/master/tensorflow/lite/java/demo/',
-                          'app/src/main/assets/',
-                          'labels_mobilenet_quant_v1_224.txt'])
+label_file_url = "".join(
+    [
+        "https://raw.githubusercontent.com/",
+        "tensorflow/tensorflow/master/tensorflow/lite/java/demo/",
+        "app/src/main/assets/",
+        "labels_mobilenet_quant_v1_224.txt",
+    ]
+)
 label_file = "labels_mobilenet_quant_v1_224.txt"
-label_path = download_testdata(label_file_url, label_file, module='data')
+label_path = download_testdata(label_file_url, label_file, module="data")
 
 # List of 1001 classes
 with open(label_path) as f:
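
Note: taken together, the hunks above keep the tutorial's end-to-end flow intact: the preprocessing maps pixel values from [0, 255] to [-1, 1] via 2/255*x - 1, the TFLite buffer is parsed into a Relay module, built for a CPU target, and run through the graph runtime. A condensed sketch of that flow, using the calls visible in this diff (tflite_model is the parsed model object from the earlier cells; the relay.build call is assumed from the truncated line):

    import tvm
    import numpy as np
    from tvm import relay, transform
    from tvm.contrib import graph_runtime as runtime

    # stands in for the preprocessed NHWC cat image from the cells above
    image_data = np.zeros((1, 224, 224, 3), dtype="float32")

    # Parse the TFLite model into a Relay module
    mod, params = relay.frontend.from_tflite(
        tflite_model, shape_dict={"input": (1, 224, 224, 3)}, dtype_dict={"input": "float32"}
    )

    # Build for x86 and run through the graph runtime
    with transform.PassContext(opt_level=3):
        lib = relay.build(mod, target="llvm", params=params)
    module = runtime.GraphModule(lib["default"](tvm.cpu()))
    module.set_input("input", tvm.nd.array(image_data))
    module.run()
    tvm_output = module.get_output(0).asnumpy()
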
diff --git a/docs/_downloads/0bb862dbb3a4c434477f93fe2c147fbb/tune_simple_template.py b/docs/_downloads/0bb862dbb3a4c434477f93fe2c147fbb/tune_simple_template.py
index c5a3843..357abf1 100644
--- a/docs/_downloads/0bb862dbb3a4c434477f93fe2c147fbb/tune_simple_template.py
+++ b/docs/_downloads/0bb862dbb3a4c434477f93fe2c147fbb/tune_simple_template.py
@@ -71,11 +71,11 @@ from tvm import autotvm
 
 # Matmul V0: Constant tiling factor
 def matmul_v0(N, L, M, dtype):
-    A = te.placeholder((N, L), name='A', dtype=dtype)
-    B = te.placeholder((L, M), name='B', dtype=dtype)
+    A = te.placeholder((N, L), name="A", dtype=dtype)
+    B = te.placeholder((L, M), name="B", dtype=dtype)
 
-    k = te.reduce_axis((0, L), name='k')
-    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C')
+    k = te.reduce_axis((0, L), name="k")
+    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
     s = te.create_schedule(C.op)
 
     # schedule
@@ -89,6 +89,7 @@ def matmul_v0(N, L, M, dtype):
 
     return s, [A, B, C]
 
+
 #####################################################################
 # Parametrize the schedule
 # ^^^^^^^^^^^^^^^^^^^^^^^^
@@ -105,11 +106,11 @@ def matmul_v0(N, L, M, dtype):
 # Matmul V1: List candidate values
 @autotvm.template("tutorial/matmul_v1")  # 1. use a decorator
 def matmul_v1(N, L, M, dtype):
-    A = te.placeholder((N, L), name='A', dtype=dtype)
-    B = te.placeholder((L, M), name='B', dtype=dtype)
+    A = te.placeholder((N, L), name="A", dtype=dtype)
+    B = te.placeholder((L, M), name="B", dtype=dtype)
 
-    k = te.reduce_axis((0, L), name='k')
-    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C')
+    k = te.reduce_axis((0, L), name="k")
+    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
     s = te.create_schedule(C.op)
 
     # schedule
@@ -124,13 +125,14 @@ def matmul_v1(N, L, M, dtype):
     cfg.define_knob("tile_x", [1, 2, 4, 8, 16])
 
     # 4. schedule according to config
-    yo, yi = s[C].split(y, cfg['tile_y'].val)
-    xo, xi = s[C].split(x, cfg['tile_x'].val)
+    yo, yi = s[C].split(y, cfg["tile_y"].val)
+    xo, xi = s[C].split(x, cfg["tile_x"].val)
 
     s[C].reorder(yo, xo, k, yi, xi)
 
     return s, [A, B, C]
 
+
 ###############################################################################
 # Here we make four modifications to the previous schedule code and get
 # a tunable "template". We can explain the modifications one by one.
@@ -183,13 +185,14 @@ def matmul_v1(N, L, M, dtype):
 # When the high level API cannot meet your requirement, you can always fall
 # back to use low level API.
 
+
 @autotvm.template("tutorial/matmul")
 def matmul(N, L, M, dtype):
-    A = te.placeholder((N, L), name='A', dtype=dtype)
-    B = te.placeholder((L, M), name='B', dtype=dtype)
+    A = te.placeholder((N, L), name="A", dtype=dtype)
+    B = te.placeholder((L, M), name="B", dtype=dtype)
 
-    k = te.reduce_axis((0, L), name='k')
-    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C')
+    k = te.reduce_axis((0, L), name="k")
+    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
     s = te.create_schedule(C.op)
 
     # schedule
@@ -210,6 +213,7 @@ def matmul(N, L, M, dtype):
 
     return s, [A, B, C]
 
+
 ######################################################################
 # .. note:: More Explanation on :code:`cfg.define_split`
 #
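
Note: the note this hunk introduces refers to cfg.define_split, which splits an axis into sub-axes and enumerates the tiling factors as part of the search space. The define_split lines themselves fall in an elided part of this diff, so the following is an assumed reconstruction of the standard AutoTVM pattern (y, x, s, C come from the matmul template above):

    # define a search space over tiling factors
    cfg = autotvm.get_config()
    cfg.define_split("tile_y", y, num_outputs=2)  # enumerate ways to split axis y in two
    cfg.define_split("tile_x", x, num_outputs=2)

    # apply the chosen factors when the config is queried
    yo, yi = cfg["tile_y"].apply(s, C, y)
    xo, xi = cfg["tile_x"].apply(s, C, x)
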
@@ -273,7 +277,7 @@ def matmul(N, L, M, dtype):
 # In this case, for a 512x512 square matrix multiplication, the space size
 # is 10x10=100
 N, L, M = 512, 512, 512
-task = autotvm.task.create("tutorial/matmul", args=(N, L, M, 'float32'), target='llvm')
+task = autotvm.task.create("tutorial/matmul", args=(N, L, M, "float32"), target="llvm")
 print(task.config_space)
 
 ################################################################
@@ -286,22 +290,22 @@ print(task.config_space)
 # used to get the best config later.
 
 # logging config (for printing tuning log to the screen)
-logging.getLogger('autotvm').setLevel(logging.DEBUG)
-logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))
+logging.getLogger("autotvm").setLevel(logging.DEBUG)
+logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout))
 
 # There are two steps for measuring a config: build and run.
 # By default, we use all CPU cores to compile the program, then measure the candidates sequentially.
 # We measure 5 times and take the average to reduce variance.
-measure_option = autotvm.measure_option(
-    builder='local',
-    runner=autotvm.LocalRunner(number=5))
+measure_option = autotvm.measure_option(builder="local", runner=autotvm.LocalRunner(number=5))
 
 # Begin tuning with RandomTuner, log records to file `matmul.log`
 # You can use alternatives like XGBTuner.
 tuner = autotvm.tuner.RandomTuner(task)
-tuner.tune(n_trial=10,
-           measure_option=measure_option,
-           callbacks=[autotvm.callback.log_to_file('matmul.log')])
+tuner.tune(
+    n_trial=10,
+    measure_option=measure_option,
+    callbacks=[autotvm.callback.log_to_file("matmul.log")],
+)
 
 #########################################################################
 # Finally we apply history best from the cache file and check its correctness.
@@ -311,9 +315,9 @@ tuner.tune(n_trial=10,
 # with the same argument.
 
 # apply history best from log file
-with autotvm.apply_history_best('matmul.log'):
-    with tvm.target.create("llvm"):
-        s, arg_bufs = matmul(N, L, M, 'float32')
+with autotvm.apply_history_best("matmul.log"):
+    with tvm.target.Target("llvm"):
+        s, arg_bufs = matmul(N, L, M, "float32")
         func = tvm.build(s, arg_bufs)
 
 # check correctness
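
Note: beyond quote normalization, the substantive change in this file is the move from the deprecated tvm.target.create helper to constructing a tvm.target.Target directly. The updated replay pattern, exactly as it appears in the + lines above (matmul, N, L, M come from the tutorial):

    # apply the best config found during tuning, then compile
    with autotvm.apply_history_best("matmul.log"):
        with tvm.target.Target("llvm"):
            s, arg_bufs = matmul(N, L, M, "float32")
            func = tvm.build(s, arg_bufs)
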
diff --git a/docs/_downloads/0c30ce88b67b0e8d46494348ab36c9fb/from_tflite.ipynb b/docs/_downloads/0c30ce88b67b0e8d46494348ab36c9fb/from_tflite.ipynb
index 1860f7d..2c3f32a 100644
--- a/docs/_downloads/0c30ce88b67b0e8d46494348ab36c9fb/from_tflite.ipynb
+++ b/docs/_downloads/0c30ce88b67b0e8d46494348ab36c9fb/from_tflite.ipynb
@@ -33,7 +33,7 @@
       },
       "outputs": [],
       "source": [
-        "import os\n\ndef extract(path):\n    import tarfile\n    if path.endswith(\"tgz\") or path.endswith(\"gz\"):\n        dir_path = os.path.dirname(path)\n        tar = tarfile.open(path)\n        tar.extractall(path=dir_path)\n        tar.close()\n    else:\n        raise RuntimeError('Could not decompress the file: ' + path)"
+        "import os\n\n\ndef extract(path):\n    import tarfile\n\n    if path.endswith(\"tgz\") or path.endswith(\"gz\"):\n        dir_path = os.path.dirname(path)\n        tar = tarfile.open(path)\n        tar.extractall(path=dir_path)\n        tar.close()\n    else:\n        raise RuntimeError(\"Could not decompress the file: \" + path)"
       ]
     },
     {
@@ -51,7 +51,7 @@
       },
       "outputs": [],
       "source": [
-        "from tvm.contrib.download import download_testdata\n\nmodel_url = \"http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz\"\n\n# Download model tar file and extract it to get mobilenet_v1_1.0_224.tflite\nmodel_path = download_testdata(model_url, \"mobilenet_v1_1.0_224.tgz\", module=['tf', 'official'])\nmodel_dir = os.path.dirname(model_path)\nextract(model_path)\n\n# Now we can open mobilenet_v1_1.0_224.tflite\ntflite_model_file = os.path.join(mo [...]
+        "from tvm.contrib.download import download_testdata\n\nmodel_url = \"http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz\"\n\n# Download model tar file and extract it to get mobilenet_v1_1.0_224.tflite\nmodel_path = download_testdata(model_url, \"mobilenet_v1_1.0_224.tgz\", module=[\"tf\", \"official\"])\nmodel_dir = os.path.dirname(model_path)\nextract(model_path)\n\n# Now we can open mobilenet_v1_1.0_224.tflite\ntflite_model_file = os.path.joi [...]
       ]
     },
     {
@@ -69,7 +69,7 @@
       },
       "outputs": [],
       "source": [
-        "from PIL import Image\nfrom matplotlib import pyplot as plt\nimport numpy as np\n\nimage_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'\nimage_path = download_testdata(image_url, 'cat.png', module='data')\nresized_image = Image.open(image_path).resize((224, 224))\nplt.imshow(resized_image)\nplt.show()\nimage_data = np.asarray(resized_image).astype(\"float32\")\n\n# Add a dimension to the image so that we have NHWC format layout\nimage_data = np.expand [...]
+        "from PIL import Image\nfrom matplotlib import pyplot as plt\nimport numpy as np\n\nimage_url = \"https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true\"\nimage_path = download_testdata(image_url, \"cat.png\", module=\"data\")\nresized_image = Image.open(image_path).resize((224, 224))\nplt.imshow(resized_image)\nplt.show()\nimage_data = np.asarray(resized_image).astype(\"float32\")\n\n# Add a dimension to the image so that we have NHWC format layout\nimage_data = np. [...]
       ]
     },
     {
@@ -87,7 +87,7 @@
       },
       "outputs": [],
       "source": [
-        "# TFLite input tensor name, shape and type\ninput_tensor = \"input\"\ninput_shape = (1, 224, 224, 3)\ninput_dtype = \"float32\"\n\n# Parse TFLite model and convert it to a Relay module\nfrom tvm import relay, transform\nmod, params = relay.frontend.from_tflite(tflite_model,\n                                         shape_dict={input_tensor: input_shape},\n                                         dtype_dict={input_tensor: input_dtype})\n\n# Build the module against to x86 CPU\nta [...]
+        "# TFLite input tensor name, shape and type\ninput_tensor = \"input\"\ninput_shape = (1, 224, 224, 3)\ninput_dtype = \"float32\"\n\n# Parse TFLite model and convert it to a Relay module\nfrom tvm import relay, transform\n\nmod, params = relay.frontend.from_tflite(\n    tflite_model, shape_dict={input_tensor: input_shape}, dtype_dict={input_tensor: input_dtype}\n)\n\n# Build the module against to x86 CPU\ntarget = \"llvm\"\nwith transform.PassContext(opt_level=3):\n    lib = relay [...]
       ]
     },
     {
@@ -105,7 +105,7 @@
       },
       "outputs": [],
       "source": [
-        "import tvm\nfrom tvm import te\nfrom tvm.contrib import graph_runtime as runtime\n\n# Create a runtime executor module\nmodule = runtime.GraphModule(lib['default'](tvm.cpu()))\n\n# Feed input data\nmodule.set_input(input_tensor, tvm.nd.array(image_data))\n\n# Run\nmodule.run()\n\n# Get output\ntvm_output = module.get_output(0).asnumpy()"
+        "import tvm\nfrom tvm import te\nfrom tvm.contrib import graph_runtime as runtime\n\n# Create a runtime executor module\nmodule = runtime.GraphModule(lib[\"default\"](tvm.cpu()))\n\n# Feed input data\nmodule.set_input(input_tensor, tvm.nd.array(image_data))\n\n# Run\nmodule.run()\n\n# Get output\ntvm_output = module.get_output(0).asnumpy()"
       ]
     },
     {
@@ -123,7 +123,7 @@
       },
       "outputs": [],
       "source": [
-        "# Load label file\nlabel_file_url = ''.join(['https://raw.githubusercontent.com/',\n                          'tensorflow/tensorflow/master/tensorflow/lite/java/demo/',\n                          'app/src/main/assets/',\n                          'labels_mobilenet_quant_v1_224.txt'])\nlabel_file = \"labels_mobilenet_quant_v1_224.txt\"\nlabel_path = download_testdata(label_file_url, label_file, module='data')\n\n# List of 1001 classes\nwith open(label_path) as f:\n    labels = f. [...]
+        "# Load label file\nlabel_file_url = \"\".join(\n    [\n        \"https://raw.githubusercontent.com/\",\n        \"tensorflow/tensorflow/master/tensorflow/lite/java/demo/\",\n        \"app/src/main/assets/\",\n        \"labels_mobilenet_quant_v1_224.txt\",\n    ]\n)\nlabel_file = \"labels_mobilenet_quant_v1_224.txt\"\nlabel_path = download_testdata(label_file_url, label_file, module=\"data\")\n\n# List of 1001 classes\nwith open(label_path) as f:\n    labels = f.readlines()\n\n#  [...]
       ]
     }
   ],
diff --git a/docs/_downloads/0d95a85fc279fdff660608ef305b9107/tune_simple_template.ipynb b/docs/_downloads/0d95a85fc279fdff660608ef305b9107/tune_simple_template.ipynb
index 03aa884..7c9c127 100644
--- a/docs/_downloads/0d95a85fc279fdff660608ef305b9107/tune_simple_template.ipynb
+++ b/docs/_downloads/0d95a85fc279fdff660608ef305b9107/tune_simple_template.ipynb
@@ -51,7 +51,7 @@
       },
       "outputs": [],
       "source": [
-        "# Matmul V0: Constant tiling factor\ndef matmul_v0(N, L, M, dtype):\n    A = te.placeholder((N, L), name='A', dtype=dtype)\n    B = te.placeholder((L, M), name='B', dtype=dtype)\n\n    k = te.reduce_axis((0, L), name='k')\n    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C')\n    s = te.create_schedule(C.op)\n\n    # schedule\n    y, x = s[C].op.axis\n    k = s[C].op.reduce_axis[0]\n\n    yo, yi = s[C].split(y, 8)\n    xo, xi = s[C].split(x, 8)\n\ [...]
+        "# Matmul V0: Constant tiling factor\ndef matmul_v0(N, L, M, dtype):\n    A = te.placeholder((N, L), name=\"A\", dtype=dtype)\n    B = te.placeholder((L, M), name=\"B\", dtype=dtype)\n\n    k = te.reduce_axis((0, L), name=\"k\")\n    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name=\"C\")\n    s = te.create_schedule(C.op)\n\n    # schedule\n    y, x = s[C].op.axis\n    k = s[C].op.reduce_axis[0]\n\n    yo, yi = s[C].split(y, 8)\n    xo, xi = s[C].split( [...]
       ]
     },
     {
@@ -69,7 +69,7 @@
       },
       "outputs": [],
       "source": [
-        "# Matmul V1: List candidate values\n@autotvm.template(\"tutorial/matmul_v1\")  # 1. use a decorator\ndef matmul_v1(N, L, M, dtype):\n    A = te.placeholder((N, L), name='A', dtype=dtype)\n    B = te.placeholder((L, M), name='B', dtype=dtype)\n\n    k = te.reduce_axis((0, L), name='k')\n    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C')\n    s = te.create_schedule(C.op)\n\n    # schedule\n    y, x = s[C].op.axis\n    k = s[C].op.reduce_axis[0]\n\ [...]
+        "# Matmul V1: List candidate values\n@autotvm.template(\"tutorial/matmul_v1\")  # 1. use a decorator\ndef matmul_v1(N, L, M, dtype):\n    A = te.placeholder((N, L), name=\"A\", dtype=dtype)\n    B = te.placeholder((L, M), name=\"B\", dtype=dtype)\n\n    k = te.reduce_axis((0, L), name=\"k\")\n    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name=\"C\")\n    s = te.create_schedule(C.op)\n\n    # schedule\n    y, x = s[C].op.axis\n    k = s[C].op.reduce_ax [...]
       ]
     },
     {
@@ -94,7 +94,7 @@
       },
       "outputs": [],
       "source": [
-        "@autotvm.template(\"tutorial/matmul\")\ndef matmul(N, L, M, dtype):\n    A = te.placeholder((N, L), name='A', dtype=dtype)\n    B = te.placeholder((L, M), name='B', dtype=dtype)\n\n    k = te.reduce_axis((0, L), name='k')\n    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C')\n    s = te.create_schedule(C.op)\n\n    # schedule\n    y, x = s[C].op.axis\n    k = s[C].op.reduce_axis[0]\n\n    ##### define space begin #####\n    cfg = autotvm.get_confi [...]
+        "@autotvm.template(\"tutorial/matmul\")\ndef matmul(N, L, M, dtype):\n    A = te.placeholder((N, L), name=\"A\", dtype=dtype)\n    B = te.placeholder((L, M), name=\"B\", dtype=dtype)\n\n    k = te.reduce_axis((0, L), name=\"k\")\n    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name=\"C\")\n    s = te.create_schedule(C.op)\n\n    # schedule\n    y, x = s[C].op.axis\n    k = s[C].op.reduce_axis[0]\n\n    ##### define space begin #####\n    cfg = autotvm.g [...]
       ]
     },
     {
@@ -126,7 +126,7 @@
       },
       "outputs": [],
       "source": [
-        "N, L, M = 512, 512, 512\ntask = autotvm.task.create(\"tutorial/matmul\", args=(N, L, M, 'float32'), target='llvm')\nprint(task.config_space)"
+        "N, L, M = 512, 512, 512\ntask = autotvm.task.create(\"tutorial/matmul\", args=(N, L, M, \"float32\"), target=\"llvm\")\nprint(task.config_space)"
       ]
     },
     {
@@ -144,7 +144,7 @@
       },
       "outputs": [],
       "source": [
-        "# logging config (for printing tuning log to the screen)\nlogging.getLogger('autotvm').setLevel(logging.DEBUG)\nlogging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))\n\n# There are two steps for measuring a config: build and run.\n# By default, we use all CPU cores to compile program. Then measure them sequentially.\n# We measure 5 times and take average to reduce variance.\nmeasure_option = autotvm.measure_option(\n    builder='local',\n    runner=autotvm.L [...]
+        "# logging config (for printing tuning log to the screen)\nlogging.getLogger(\"autotvm\").setLevel(logging.DEBUG)\nlogging.getLogger(\"autotvm\").addHandler(logging.StreamHandler(sys.stdout))\n\n# There are two steps for measuring a config: build and run.\n# By default, we use all CPU cores to compile program. Then measure them sequentially.\n# We measure 5 times and take average to reduce variance.\nmeasure_option = autotvm.measure_option(builder=\"local\", runner=autotvm.LocalR [...]
       ]
     },
     {
@@ -162,7 +162,7 @@
       },
       "outputs": [],
       "source": [
-        "# apply history best from log file\nwith autotvm.apply_history_best('matmul.log'):\n    with tvm.target.create(\"llvm\"):\n        s, arg_bufs = matmul(N, L, M, 'float32')\n        func = tvm.build(s, arg_bufs)\n\n# check correctness\na_np = np.random.uniform(size=(N, L)).astype(np.float32)\nb_np = np.random.uniform(size=(L, M)).astype(np.float32)\nc_np = a_np.dot(b_np)\n\nc_tvm = tvm.nd.empty(c_np.shape)\nfunc(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)\n\ntvm.testing.assert [...]
+        "# apply history best from log file\nwith autotvm.apply_history_best(\"matmul.log\"):\n    with tvm.target.Target(\"llvm\"):\n        s, arg_bufs = matmul(N, L, M, \"float32\")\n        func = tvm.build(s, arg_bufs)\n\n# check correctness\na_np = np.random.uniform(size=(N, L)).astype(np.float32)\nb_np = np.random.uniform(size=(L, M)).astype(np.float32)\nc_np = a_np.dot(b_np)\n\nc_tvm = tvm.nd.empty(c_np.shape)\nfunc(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)\n\ntvm.testing.as [...]
       ]
     }
   ],
diff --git a/docs/_downloads/10e16681be542cc483fa89e9b4678a27/opt_matmul_auto_tensorcore.py b/docs/_downloads/10e16681be542cc483fa89e9b4678a27/opt_matmul_auto_tensorcore.py
index 7dbd475..d81eca5 100644
--- a/docs/_downloads/10e16681be542cc483fa89e9b4678a27/opt_matmul_auto_tensorcore.py
+++ b/docs/_downloads/10e16681be542cc483fa89e9b4678a27/opt_matmul_auto_tensorcore.py
@@ -51,22 +51,32 @@ from tvm import te
 from tvm import autotvm
 from tvm.contrib import nvcc
 
-def matmul_nn(A, B, L, dtype='float16', layout='NN'):
-    k = te.reduce_axis((0, L), name='k')
-    if dtype == 'float16':
-      out_type = 'float'
-    elif dtype == 'int8':
-      out_type = 'int'
-    elif dtype == 'int4' or dtype == 'int1':
-      out_type = 'int'
-    if (layout == 'NN'):
-      return te.compute((N, M), lambda i, j: te.sum(A[i, k].astype(out_type) * B[k, j].astype(out_type), axis=k))
-    if (layout == 'NT'):
-      return te.compute((N, M), lambda i, j: te.sum(A[k, i].astype(out_type) * B[k, j].astype(out_type), axis=k))
-    if (layout == 'TN'):
-      return te.compute((N, M), lambda i, j: te.sum(A[i, k].astype(out_type) * B[j, k].astype(out_type), axis=k))
-    if (layout == 'TT'):
-      return te.compute((N, M), lambda i, j: te.sum(A[k, i].astype(out_type) * B[j, k].astype(out_type), axis=k))
+
+def matmul_nn(A, B, L, dtype="float16", layout="NN"):
+    k = te.reduce_axis((0, L), name="k")
+    if dtype == "float16":
+        out_type = "float"
+    elif dtype == "int8":
+        out_type = "int"
+    elif dtype == "int4" or dtype == "int1":
+        out_type = "int"
+    if layout == "NN":
+        return te.compute(
+            (N, M), lambda i, j: te.sum(A[i, k].astype(out_type) * B[k, j].astype(out_type), axis=k)
+        )
+    if layout == "NT":
+        return te.compute(
+            (N, M), lambda i, j: te.sum(A[k, i].astype(out_type) * B[k, j].astype(out_type), axis=k)
+        )
+    if layout == "TN":
+        return te.compute(
+            (N, M), lambda i, j: te.sum(A[i, k].astype(out_type) * B[j, k].astype(out_type), axis=k)
+        )
+    if layout == "TT":
+        return te.compute(
+            (N, M), lambda i, j: te.sum(A[k, i].astype(out_type) * B[j, k].astype(out_type), axis=k)
+        )
+
 
 ###############################################################################
 # Scheduling the Computation
@@ -86,7 +96,7 @@ def matmul_nn(A, B, L, dtype='float16', layout='NN'):
 #   (2) The warp tile size is not 16x16x16 on CUDA9, or not one of {16x16x16, 32x8x16, 8x32x16} on CUDA version >= 10.0.
 #
 # In this schedule, storage_align is used to reduce bank conflicts of shared memory. Please refer to this
-# `doc <https://tvm.apache.org/docs/api/python/schedule.html#tvm.te.schedule.Stage.storage_align>`_
+# `doc <https://tvm.apache.org/docs/api/python/te.html#tvm.te.Stage.storage_align>`_
 # for the usage of the storage_align primitive. In short, we need to add an offset to some shared memory buffer
 # to reduce bank conflicts.
 # According to the `wmma doc <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#wmma-description>`_,
@@ -95,25 +105,26 @@ def matmul_nn(A, B, L, dtype='float16', layout='NN'):
 #
 # We use AutoTVM to search for best configurations in this schedule.
 
+
 @autotvm.template("tutorial/auto_tensorcore/test_gemm")
 def test_gemm(N, L, M, dtype, layout):
-    if (layout == "NN"):
-      shape_a = (N, L)
-      shape_b = (L, M)
-    elif (layout == "NT"):
-      shape_a = (L, N)
-      shape_b = (L, M)
-    elif (layout == "TN"):
-      shape_a = (N, L)
-      shape_b = (M, L)
-    elif (layout == "TT"):
-      shape_a = (L, N)
-      shape_b = (M, L)
+    if layout == "NN":
+        shape_a = (N, L)
+        shape_b = (L, M)
+    elif layout == "NT":
+        shape_a = (L, N)
+        shape_b = (L, M)
+    elif layout == "TN":
+        shape_a = (N, L)
+        shape_b = (M, L)
+    elif layout == "TT":
+        shape_a = (L, N)
+        shape_b = (M, L)
     else:
-      print ("Unsupported layout:", layout)
-      sys.exit(1);
-    A = te.placeholder(shape_a, name='A', dtype=dtype)
-    B = te.placeholder(shape_b, name='B', dtype=dtype)
+        print("Unsupported layout:", layout)
+        sys.exit(1)
+    A = te.placeholder(shape_a, name="A", dtype=dtype)
+    B = te.placeholder(shape_b, name="B", dtype=dtype)
     C = matmul_nn(A, B, L, dtype, layout)
 
     s = te.create_schedule(C.op)
@@ -123,53 +134,53 @@ def test_gemm(N, L, M, dtype, layout):
     # storage_align params
     factor = 16
     offset = 8
-    if dtype == 'int8':
-      factor = 32
-      offset = 16
-    elif dtype == 'int4':
-      factor = 64
-      offset = 32
-    elif dtype == 'int1':
-      factor = 256
-      offset = 128
+    if dtype == "int8":
+        factor = 32
+        offset = 16
+    elif dtype == "int4":
+        factor = 64
+        offset = 32
+    elif dtype == "int1":
+        factor = 256
+        offset = 128
 
     # create cache stages
     AA = s.cache_read(A, "shared", [C])
-    if (layout == "NN" or layout == "TN"):
-      s[AA].storage_align(AA.op.axis[0], factor, offset)
+    if layout == "NN" or layout == "TN":
+        s[AA].storage_align(AA.op.axis[0], factor, offset)
     AL = s.cache_read(AA, "local", [C])
     BB = s.cache_read(B, "shared", [C])
-    if (layout == "TT" or layout == "NT"):
-      s[BB].storage_align(BB.op.axis[0], factor, offset)
+    if layout == "TT" or layout == "NT":
+        s[BB].storage_align(BB.op.axis[0], factor, offset)
     BL = s.cache_read(BB, "local", [C])
     CL = s.cache_write(C, "local")
 
-    #autotvm search space definition
+    # autotvm search space definition
     cfg = autotvm.get_config()
 
     cfg.define_knob("bx", [2, 4, 8])
     cfg.define_knob("by", [8, 16, 32, 64])
     cfg.define_knob("step_k", [1, 2, 4, 8, 16, 32])
     cfg.define_knob("v", [4, 8, 16, 32])
-    by = cfg['by'].val
-    bx = cfg['bx'].val
-    step_k = cfg['step_k'].val
-    v = cfg['v'].val
+    by = cfg["by"].val
+    bx = cfg["bx"].val
+    step_k = cfg["step_k"].val
+    v = cfg["v"].val
 
     # thread tile
     TX = 8
     TY = 1
-    if dtype == 'int4' or dtype == 'int1':
-      TX = 2
+    if dtype == "int4" or dtype == "int1":
+        TX = 2
     # warp tile
-    warp_tile_m = 16 # it could also be 8 or 32 on CUDA version >= 10.0
-    warp_tile_k = 16 # it must be 16 for fp16/int8 data type
-    if dtype == 'int4':
-      warp_tile_m = 8
-      warp_tile_k = 32
-    elif dtype == 'int1':
-      warp_tile_m = 8
-      warp_tile_k = 128
+    warp_tile_m = 16  # it could also be 8 or 32 on CUDA version >= 10.0
+    warp_tile_k = 16  # it must be 16 for fp16/int8 data type
+    if dtype == "int4":
+        warp_tile_m = 8
+        warp_tile_k = 32
+    elif dtype == "int1":
+        warp_tile_m = 8
+        warp_tile_k = 128
     # block tile
     tile_x = bx * TX
     tile_y = by * TY
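
Note: the prose earlier in this file motivates storage_align: pad the shared-memory tiles so row strides land on different banks and warp accesses do not conflict. Stripped of the schedule around it, the pattern in this hunk is just the following (factor and offset widen for narrower dtypes, e.g. 32/16 for int8, per the table above):

    # pad shared-memory rows: align the row stride to `factor` elements
    # and shift by `offset` elements to spread warp accesses across banks
    AA = s.cache_read(A, "shared", [C])
    s[AA].storage_align(AA.op.axis[0], 16, 8)  # fp16 defaults from this schedule
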
@@ -198,8 +209,8 @@ def test_gemm(N, L, M, dtype, layout):
 
     # schedule for AA stage
     s[AA].compute_at(s[CL], ko)
-    xo, xi = s[AA].split(s[AA].op.axis[1], factor=bx*v)
-    tz, tx = s[AA].split(xi, factor=(WX//TX)*v)
+    xo, xi = s[AA].split(s[AA].op.axis[1], factor=bx * v)
+    tz, tx = s[AA].split(xi, factor=(WX // TX) * v)
     tx, vec = s[AA].split(tx, factor=v)
     fused = s[AA].fuse(s[AA].op.axis[0], xo)
     _, ty = s[AA].split(fused, factor=by)
@@ -211,8 +222,8 @@ def test_gemm(N, L, M, dtype, layout):
 
     # schedule for BB stage
     s[BB].compute_at(s[CL], ko)
-    xo, xi = s[BB].split(s[BB].op.axis[1], factor=bx*v)
-    tz, tx = s[BB].split(xi, factor=(WX//TX)*v)
+    xo, xi = s[BB].split(s[BB].op.axis[1], factor=bx * v)
+    tz, tx = s[BB].split(xi, factor=(WX // TX) * v)
     tx, vec = s[BB].split(tx, factor=v)
     fused = s[BB].fuse(s[BB].op.axis[0], xo)
     _, ty = s[BB].split(fused, factor=by)
@@ -225,10 +236,11 @@ def test_gemm(N, L, M, dtype, layout):
     s[BL].compute_at(s[CL], kl)
 
     # set the 'tensor_core' pragma for tensorcore codegen
-    s[CL].pragma(ko, 'tensor_core')
+    s[CL].pragma(ko, "tensor_core")
 
     return s, [A, B, C]
 
+
 ###############################################################################
 # AutoTune and Test
 # -----------------
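
Note: just before the template returns, the schedule marks the outer reduction loop for tensor core codegen. A minimal sketch of that step (the split of the reduction axis k is assumed from an elided part of this diff; only the pragma line is shown verbatim above):

    CL = s.cache_write(C, "local")                        # accumulate into fragments
    ko, ki = s[CL].split(k, factor=step_k * warp_tile_k)  # assumed split, not shown here
    s[CL].pragma(ko, "tensor_core")                       # request wmma codegen for this loop nest
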
@@ -237,150 +249,151 @@ def test_gemm(N, L, M, dtype, layout):
 
 # check whether the gpu has tensorcore
 if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
-  print("skip because cuda is not enabled..")
-  sys.exit(0)
+    raise Exception("skip building this tutorial because cuda is not enabled..")
 
 ctx = tvm.gpu()
 if not nvcc.have_tensorcore(ctx.compute_version):
-  print('the gpu has no tensorcore, skipping...')
-  sys.exit(0)
+    raise Exception("the gpu has no tensorcore, skipping...")
 
 M, N, L = 512, 32, 512
-dtype = 'float16'
-layout = 'NN'
+dtype = "float16"
+layout = "NN"
 if len(sys.argv) >= 4:
-  M, N, L = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3])
+    M, N, L = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3])
 if len(sys.argv) >= 5:
-  dtype = sys.argv[4]
+    dtype = sys.argv[4]
 if len(sys.argv) >= 6:
-  layout = sys.argv[5]
+    layout = sys.argv[5]
 
 # check whether the current gpu arch supports the current dtype's wmma codegen
 cuda_compute_capability = tvm.runtime._ffi_api.GetDeviceAttr(2, 0, 4)
-major, minor= nvcc.parse_compute_version(cuda_compute_capability)
-if dtype == 'int8':
-  assert(major == 7 and minor >= 2)
-elif dtype == 'int4' or dtype == 'int1':
-  # int4/int1 only support layout TN
-  assert(major == 7 and minor == 5 and layout == 'TN')
+major, minor = nvcc.parse_compute_version(cuda_compute_capability)
+if dtype == "int8":
+    assert major == 7 and minor >= 2
+elif dtype == "int4" or dtype == "int1":
+    # int4/int1 only support layout TN
+    assert major == 7 and minor == 5 and layout == "TN"
+
 
 def tune_and_evaluate(M, N, L, dtype, layout):
-  task = autotvm.task.create("tutorial/auto_tensorcore/test_gemm", args=(N, L, M, dtype, layout),
-                             target='cuda')
-  print(task.config_space)
-
-  logging.getLogger('autotvm').setLevel(logging.DEBUG)
-  logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))
-
-  measure_option = autotvm.measure_option(
-    builder='local',
-    runner=autotvm.LocalRunner(number=5))
-
-  tuner = autotvm.tuner.XGBTuner(task)
-  tuner.tune(n_trial=1000,
-             measure_option=measure_option,
-             callbacks=[autotvm.callback.log_to_file('matmul.log')])
-
-  dispatch_context = autotvm.apply_history_best("matmul.log")
-  best_config = dispatch_context.query(task.target, task.workload)
-  print("\nBest config:")
-  print(best_config)
-  with autotvm.apply_history_best('matmul.log'):
-    with tvm.target.create("cuda"):
-          s, arg_bufs = test_gemm(N, L, M, dtype, layout)
-          print(tvm.lower(s, arg_bufs, simple_mode=True))
-          func = tvm.build(s, arg_bufs)
-  dev_module = func.imported_modules[0]
-  print(dev_module.get_source())
-
-  # check correctness
-  if (layout == "NN"):
-    shape_a = (N, L)
-    shape_b = (L, M)
-  elif (layout == "NT"):
-    shape_a = (L, N)
-    shape_b = (L, M)
-  elif (layout == "TN"):
-    shape_a = (N, L)
-    shape_b = (M, L)
-  elif (layout == "TT"):
-    shape_a = (L, N)
-    shape_b = (M, L)
-
-  a_np = None
-  b_np = None
-  c_np = None
-  c_np_type = None
-  if dtype == 'float16':
-    c_np_type = np.float32
-    a_np = np.random.uniform(size=shape_a).astype(np.float16)
-    b_np = np.random.uniform(size=shape_b).astype(np.float16)
-    if (layout == "NN"):
-      c_np = np.dot(a_np, b_np)
-    elif (layout == "NT"):
-      c_np = np.dot(a_np.T, b_np)
-    elif (layout == "TN"):
-      c_np = np.dot(a_np, b_np.T)
-    elif (layout == "TT"):
-      c_np = np.dot(a_np.T, b_np.T)
-  elif dtype == 'int8':
-    c_np_type = np.int32
-    a_np = np.random.randint(low=-128, high=127, size=shape_a).astype(np.int8)
-    b_np = np.random.randint(low=-128, high=127, size=shape_b).astype(np.int8)
-    if (layout == "NN"):
-      c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32))
-    elif (layout == "NT"):
-      c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32))
-    elif (layout == "TN"):
-      c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32).T)
-    elif (layout == "TT"):
-      c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32).T)
-  elif dtype == 'int4':
-    c_np_type = np.int32
-    a_np_int = np.random.randint(low=-8, high=7, size=shape_a).astype(np.int32)
-    b_np_int = np.random.randint(low=-8, high=7, size=shape_b).astype(np.int32)
-    # "TN"
-    c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T)
-    a_np = np.zeros(shape=(N, int(L/8)), dtype = np.int32)
-    b_np = np.zeros(shape=(M, int(L/8)), dtype = np.int32)
-    # a_np --> col_major
-    for i in range(N):
-      for j in range(int(L/8)):
-        for k in range(8):
-          a_np[i, j] = a_np[i, j] | ((a_np_int[i, j * 8 + k] & 0xf) << ((7 - k) * 4))
-
-    # b_np --> row_major
-    for i in range(M):
-      for j in range(int(L/8)):
-        for k in range(8):
-          b_np[i, j] = b_np[i, j] | ((b_np_int[i, j * 8 + k] & 0xf) << ((7 - k) * 4))
-  elif dtype == 'int1':
-    c_np_type = np.int32
-    a_np_int = np.random.randint(low=0, high=1, size=shape_a).astype(np.int32)
-    b_np_int = np.random.randint(low=0, high=1, size=shape_b).astype(np.int32)
-    # "TN"
-    c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T)
-    a_np = np.zeros(shape=(N, int(L/32)), dtype = np.int32)
-    b_np = np.zeros(shape=(M, int(L/32)), dtype = np.int32)
-    for i in range(N):
-      for j in range(int(L/32)):
-        for k in range(32):
-          a_np[i, j] = a_np[i, j] | ((a_np_int[i, j * 32 + k] & 0xf) << (31 - k))
-
-    for i in range(M):
-      for j in range(int(L/32)):
-        for k in range(32):
-          b_np[i, j] = b_np[i, j] | ((b_np_int[i, j * 32 + k] & 0xf) << (31 - k))
-
-  c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np_type), ctx=ctx)
-  a_tvm = tvm.nd.array(a_np, ctx=ctx)
-  b_tvm = tvm.nd.array(b_np, ctx=ctx)
-  func(a_tvm, b_tvm, c_tvm)
-
-  tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-3)
-
-  evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
-  print('Time cost of this operator: %f' % evaluator(a_tvm, b_tvm, c_tvm).mean)
+    task = autotvm.task.create(
+        "tutorial/auto_tensorcore/test_gemm", args=(N, L, M, dtype, layout), target="cuda"
+    )
+    print(task.config_space)
+
+    logging.getLogger("autotvm").setLevel(logging.DEBUG)
+    logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout))
+
+    measure_option = autotvm.measure_option(builder="local", runner=autotvm.LocalRunner(number=5))
+
+    tuner = autotvm.tuner.XGBTuner(task)
+    tuner.tune(
+        n_trial=1000,
+        measure_option=measure_option,
+        callbacks=[autotvm.callback.log_to_file("matmul.log")],
+    )
+
+    dispatch_context = autotvm.apply_history_best("matmul.log")
+    best_config = dispatch_context.query(task.target, task.workload)
+    print("\nBest config:")
+    print(best_config)
+    with autotvm.apply_history_best("matmul.log"):
+        with tvm.target.Target("cuda"):
+            s, arg_bufs = test_gemm(N, L, M, dtype, layout)
+            print(tvm.lower(s, arg_bufs, simple_mode=True))
+            func = tvm.build(s, arg_bufs)
+    dev_module = func.imported_modules[0]
+    print(dev_module.get_source())
+
+    # check correctness
+    if layout == "NN":
+        shape_a = (N, L)
+        shape_b = (L, M)
+    elif layout == "NT":
+        shape_a = (L, N)
+        shape_b = (L, M)
+    elif layout == "TN":
+        shape_a = (N, L)
+        shape_b = (M, L)
+    elif layout == "TT":
+        shape_a = (L, N)
+        shape_b = (M, L)
+
+    a_np = None
+    b_np = None
+    c_np = None
+    c_np_type = None
+    if dtype == "float16":
+        c_np_type = np.float32
+        a_np = np.random.uniform(size=shape_a).astype(np.float16)
+        b_np = np.random.uniform(size=shape_b).astype(np.float16)
+        if layout == "NN":
+            c_np = np.dot(a_np, b_np)
+        elif layout == "NT":
+            c_np = np.dot(a_np.T, b_np)
+        elif layout == "TN":
+            c_np = np.dot(a_np, b_np.T)
+        elif layout == "TT":
+            c_np = np.dot(a_np.T, b_np.T)
+    elif dtype == "int8":
+        c_np_type = np.int32
+        # note: randint's high bound is exclusive; 128 covers the full int8 range
+        a_np = np.random.randint(low=-128, high=128, size=shape_a).astype(np.int8)
+        b_np = np.random.randint(low=-128, high=128, size=shape_b).astype(np.int8)
+        if layout == "NN":
+            c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32))
+        elif layout == "NT":
+            c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32))
+        elif layout == "TN":
+            c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32).T)
+        elif layout == "TT":
+            c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32).T)
+    elif dtype == "int4":
+        c_np_type = np.int32
+        # note: randint's high bound is exclusive; 8 covers the full int4 range
+        a_np_int = np.random.randint(low=-8, high=8, size=shape_a).astype(np.int32)
+        b_np_int = np.random.randint(low=-8, high=8, size=shape_b).astype(np.int32)
+        # "TN"
+        c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T)
+        a_np = np.zeros(shape=(N, int(L / 8)), dtype=np.int32)
+        b_np = np.zeros(shape=(M, int(L / 8)), dtype=np.int32)
+        # a_np --> col_major
+        for i in range(N):
+            for j in range(int(L / 8)):
+                for k in range(8):
+                    a_np[i, j] = a_np[i, j] | ((a_np_int[i, j * 8 + k] & 0xF) << ((7 - k) * 4))
+
+        # b_np --> row_major
+        for i in range(M):
+            for j in range(int(L / 8)):
+                for k in range(8):
+                    b_np[i, j] = b_np[i, j] | ((b_np_int[i, j * 8 + k] & 0xF) << ((7 - k) * 4))
+    elif dtype == "int1":
+        c_np_type = np.int32
+        # note: high=1 would yield all zeros since randint's high bound is exclusive
+        a_np_int = np.random.randint(low=0, high=2, size=shape_a).astype(np.int32)
+        b_np_int = np.random.randint(low=0, high=2, size=shape_b).astype(np.int32)
+        # "TN"
+        c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T)
+        a_np = np.zeros(shape=(N, int(L / 32)), dtype=np.int32)
+        b_np = np.zeros(shape=(M, int(L / 32)), dtype=np.int32)
+        for i in range(N):
+            for j in range(int(L / 32)):
+                for k in range(32):
+                    a_np[i, j] = a_np[i, j] | ((a_np_int[i, j * 32 + k] & 0x1) << (31 - k))
+
+        for i in range(M):
+            for j in range(int(L / 32)):
+                for k in range(32):
+                    b_np[i, j] = b_np[i, j] | ((b_np_int[i, j * 32 + k] & 0x1) << (31 - k))
+
+    c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np_type), ctx=ctx)
+    a_tvm = tvm.nd.array(a_np, ctx=ctx)
+    b_tvm = tvm.nd.array(b_np, ctx=ctx)
+    func(a_tvm, b_tvm, c_tvm)
+
+    tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-3)
+
+    evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
+    print("Time cost of this operator: %f" % evaluator(a_tvm, b_tvm, c_tvm).mean)
+
 
 # We do not run the tuning in our webpage server since it takes some time.
 # Uncomment the following line to run it by yourself.
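Editor's note: the nested int4 packing loops above pack eight signed nibbles into each int32 word, MSB-first. As a cross-check, the same layout can be produced with vectorized numpy (a minimal sketch; pack_int4 is a hypothetical helper, not part of the tutorial):

    import numpy as np

    def pack_int4(x):
        # x: int32 array of shape (rows, L), values in [-8, 7].
        # Packs each run of 8 values into one int32 word, with element k
        # occupying bits [31 - 4k .. 28 - 4k], matching the (7 - k) * 4 shift above.
        rows, L = x.shape
        nibbles = (x & 0xF).astype(np.int64).reshape(rows, L // 8, 8)
        shifts = (7 - np.arange(8)) * 4
        words = np.bitwise_or.reduce(nibbles << shifts, axis=2)
        # reinterpret the low 32 bits as int32, matching the loop's wraparound
        return (words & 0xFFFFFFFF).astype(np.uint32).view(np.int32)

    # pack_int4(a_np_int) reproduces the a_np built by the loops above.
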
diff --git a/docs/_downloads/1195277fef6a622db64b78b4ea799ed4/matrix_multiply.py b/docs/_downloads/1195277fef6a622db64b78b4ea799ed4/matrix_multiply.py
index 024e179..77fc805 100644
--- a/docs/_downloads/1195277fef6a622db64b78b4ea799ed4/matrix_multiply.py
+++ b/docs/_downloads/1195277fef6a622db64b78b4ea799ed4/matrix_multiply.py
@@ -205,11 +205,12 @@ ki = te.reduce_axis((0, env.BLOCK_IN), name="ki")
 # Describe the in-VTA matrix multiplication
 C_buf = te.compute(
     (o, m, env.BATCH, env.BLOCK_OUT),
-    lambda bo, co, bi, ci:
-        te.sum(A_buf[bo, ko, bi, ki].astype(env.acc_dtype) *
-                B_buf[co, ko, ci, ki].astype(env.acc_dtype),
-                axis=[ko, ki]),
-    name="C_buf")
+    lambda bo, co, bi, ci: te.sum(
+        A_buf[bo, ko, bi, ki].astype(env.acc_dtype) * B_buf[co, ko, ci, ki].astype(env.acc_dtype),
+        axis=[ko, ki],
+    ),
+    name="C_buf",
+)
 
 ######################################################################
 # Casting the Results
@@ -236,9 +237,8 @@ C_buf = te.compute(
 
 # Cast to output type, and send to main memory
 C = te.compute(
-    (o, m, env.BATCH, env.BLOCK_OUT),
-    lambda *i: C_buf(*i).astype(env.inp_dtype),
-    name="C")
+    (o, m, env.BATCH, env.BLOCK_OUT), lambda *i: C_buf(*i).astype(env.inp_dtype), name="C"
+)
 
 ######################################################################
 # This concludes the computation declaration part of this tutorial.
@@ -369,12 +369,8 @@ print(tvm.lower(s, [A, B, C], simple_mode=True))
 # by the VTA runtime JIT compiler.
 
 s[C_buf].reorder(
-    ko,
-    s[C_buf].op.axis[0],
-    s[C_buf].op.axis[1],
-    s[C_buf].op.axis[2],
-    s[C_buf].op.axis[3],
-    ki)
+    ko, s[C_buf].op.axis[0], s[C_buf].op.axis[1], s[C_buf].op.axis[2], s[C_buf].op.axis[3], ki
+)
 s[C_buf].tensorize(s[C_buf].op.axis[2], env.gemm)
 
 # Let's take a look at the finalized schedule
@@ -422,16 +418,12 @@ f = remote.load_module("gemm.o")
 ctx = remote.ext_dev(0)
 
 # Initialize the A and B arrays randomly in the int range of (-128, 128]
-A_orig = np.random.randint(
-    -128, 128, size=(o * env.BATCH, n * env.BLOCK_IN)).astype(A.dtype)
-B_orig = np.random.randint(
-    -128, 128, size=(m * env.BLOCK_OUT, n * env.BLOCK_IN)).astype(B.dtype)
+A_orig = np.random.randint(-128, 128, size=(o * env.BATCH, n * env.BLOCK_IN)).astype(A.dtype)
+B_orig = np.random.randint(-128, 128, size=(m * env.BLOCK_OUT, n * env.BLOCK_IN)).astype(B.dtype)
 
 # Apply packing to the A and B arrays from a 2D to a 4D packed layout
-A_packed = A_orig.reshape(
-    o, env.BATCH, n, env.BLOCK_IN).transpose((0, 2, 1, 3))
-B_packed = B_orig.reshape(
-    m, env.BLOCK_OUT, n, env.BLOCK_IN).transpose((0, 2, 1, 3))
+A_packed = A_orig.reshape(o, env.BATCH, n, env.BLOCK_IN).transpose((0, 2, 1, 3))
+B_packed = B_orig.reshape(m, env.BLOCK_OUT, n, env.BLOCK_IN).transpose((0, 2, 1, 3))
 
 # Format the input/output arrays with tvm.nd.array to the DLPack standard
 A_nd = tvm.nd.array(A_packed, ctx)
@@ -452,10 +444,8 @@ f(A_nd, B_nd, C_nd)
 # matrix multiplication indeed is correct
 
 # Compute reference result with numpy
-C_ref = np.dot(A_orig.astype(env.acc_dtype),
-               B_orig.T.astype(env.acc_dtype)).astype(C.dtype)
-C_ref = C_ref.reshape(
-    o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))
+C_ref = np.dot(A_orig.astype(env.acc_dtype), B_orig.T.astype(env.acc_dtype)).astype(C.dtype)
+C_ref = C_ref.reshape(o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))
 np.testing.assert_equal(C_ref, C_nd.asnumpy())
 
 # Print stats
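Editor's note: the reshape/transpose packing used above is its own inverse modulo a final reshape, which a tiny round trip makes explicit (a minimal sketch with stand-in values for env.BATCH and env.BLOCK_IN):

    import numpy as np

    o, n = 4, 8                # tile counts
    BATCH, BLOCK_IN = 1, 16    # stand-ins for env.BATCH, env.BLOCK_IN

    A_orig = np.arange(o * BATCH * n * BLOCK_IN, dtype=np.int32).reshape(o * BATCH, n * BLOCK_IN)

    # 2D -> 4D packed layout, as above
    A_packed = A_orig.reshape(o, BATCH, n, BLOCK_IN).transpose((0, 2, 1, 3))

    # 4D packed -> 2D: the (0, 2, 1, 3) permutation is self-inverse
    A_back = A_packed.transpose((0, 2, 1, 3)).reshape(o * BATCH, n * BLOCK_IN)
    assert (A_back == A_orig).all()
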
diff --git a/docs/_downloads/13509e02380dbdb802e80921620e9b5c/use_pass_infra.ipynb b/docs/_downloads/13509e02380dbdb802e80921620e9b5c/use_pass_infra.ipynb
index 18254dc..936c740 100644
--- a/docs/_downloads/13509e02380dbdb802e80921620e9b5c/use_pass_infra.ipynb
+++ b/docs/_downloads/13509e02380dbdb802e80921620e9b5c/use_pass_infra.ipynb
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "def example():\n    shape = (1, 64, 54, 54)\n    c_data = np.empty(shape).astype(\"float32\")\n    c = relay.const(c_data)\n    weight = relay.var('weight', shape=(64, 64, 3, 3))\n    x = relay.var(\"x\", relay.TensorType((1, 64, 56, 56), \"float32\"))\n    conv = relay.nn.conv2d(x, weight)\n    y = relay.add(c, c)\n    y = relay.multiply(y, relay.const(2, \"float32\"))\n    y = relay.add(conv, y)\n    z = relay.add(y, c)\n    z1 = relay.add(y, c)\n    z2 = relay.add(z, z1)\n    [...]
+        "def example():\n    shape = (1, 64, 54, 54)\n    c_data = np.empty(shape).astype(\"float32\")\n    c = relay.const(c_data)\n    weight = relay.var(\"weight\", shape=(64, 64, 3, 3))\n    x = relay.var(\"x\", relay.TensorType((1, 64, 56, 56), \"float32\"))\n    conv = relay.nn.conv2d(x, weight)\n    y = relay.add(c, c)\n    y = relay.multiply(y, relay.const(2, \"float32\"))\n    y = relay.add(conv, y)\n    z = relay.add(y, c)\n    z1 = relay.add(y, c)\n    z2 = relay.add(z, z1)\n  [...]
       ]
     },
     {
@@ -62,7 +62,7 @@
       },
       "outputs": [],
       "source": [
-        "@relay.op.register_alter_op_layout(\"nn.conv2d\", level=101)\ndef alter_conv2d(attrs, inputs, tinfos, out_type):\n    data, weight = inputs\n    new_attrs = dict(attrs)\n    new_attrs['data_layout'] = 'NCHW16c'\n    return relay.nn.conv2d(data, weight, **new_attrs)"
+        "@relay.op.register_alter_op_layout(\"nn.conv2d\", level=101)\ndef alter_conv2d(attrs, inputs, tinfos, out_type):\n    data, weight = inputs\n    new_attrs = dict(attrs)\n    new_attrs[\"data_layout\"] = \"NCHW16c\"\n    return relay.nn.conv2d(data, weight, **new_attrs)"
       ]
     },
     {
@@ -134,7 +134,7 @@
       },
       "outputs": [],
       "source": [
-        "# Now let's execute some passes through :py:class:`tvm.transform.Sequential`\nf = example()\nmod = tvm.IRModule.from_expr(f)\n# Glob the interested passes.\nseq = tvm.transform.Sequential([relay.transform.FoldConstant(),\n                                  relay.transform.EliminateCommonSubexpr(),\n                                  relay.transform.FuseOps(fuse_opt_level=2)])\nmod1 = seq(mod)\nprint(mod1)"
+        "# Now let's execute some passes through :py:class:`tvm.transform.Sequential`\nf = example()\nmod = tvm.IRModule.from_expr(f)\n# Glob the interested passes.\nseq = tvm.transform.Sequential(\n    [\n        relay.transform.FoldConstant(),\n        relay.transform.EliminateCommonSubexpr(),\n        relay.transform.FuseOps(fuse_opt_level=2),\n    ]\n)\nmod1 = seq(mod)\nprint(mod1)"
       ]
     },
     {
@@ -188,7 +188,7 @@
       },
       "outputs": [],
       "source": [
-        "with tvm.transform.PassContext(opt_level=3):\n    mod4 = seq(mod)\nprint(mod4)\n\nseq1 = tvm.transform.Sequential([relay.transform.AlterOpLayout()])\nwith tvm.transform.PassContext(opt_level=3):\n    with tvm.target.create(\"llvm\"):\n        mod5 = seq1(mod)\nprint(mod5)"
+        "with tvm.transform.PassContext(opt_level=3):\n    mod4 = seq(mod)\nprint(mod4)\n\nseq1 = tvm.transform.Sequential([relay.transform.AlterOpLayout()])\nwith tvm.transform.PassContext(opt_level=3):\n    with tvm.target.Target(\"llvm\"):\n        mod5 = seq1(mod)\nprint(mod5)"
       ]
     },
     {
@@ -206,7 +206,7 @@
       },
       "outputs": [],
       "source": [
-        "@relay.transform.function_pass(opt_level=1)\nclass CustomPipeline:\n    \"\"\"Simple test function to replace one argument to another.\"\"\"\n\n    def __init__(self, multiplier):\n        self.multiplier = multiplier\n\n    # This function can define a pass.\n    def transform_function(self, func, mod, ctx):\n        obj = self\n\n        class ReplaceConstant(tvm.relay.ExprMutator):\n            def visit_constant(self, c):\n                return relay.multiply(obj.multiplier [...]
+        "@relay.transform.function_pass(opt_level=1)\nclass CustomPipeline:\n    \"\"\"Simple test function to replace one argument to another.\"\"\"\n\n    def __init__(self, multiplier):\n        self.multiplier = multiplier\n\n    # This function can define a pass.\n    def transform_function(self, func, mod, ctx):\n        obj = self\n\n        class ReplaceConstant(tvm.relay.ExprMutator):\n            def visit_constant(self, c):\n                return relay.multiply(obj.multiplier [...]
       ]
     },
     {
@@ -224,7 +224,7 @@
       },
       "outputs": [],
       "source": [
-        "f = example()\nmod = tvm.IRModule.from_expr(f)\nseq = tvm.transform.Sequential([relay.transform.FoldConstant(),\n                                tvm.transform.PrintIR(),\n                                relay.transform.EliminateCommonSubexpr(),\n                                relay.transform.FuseOps(),\n                                relay.transform.AlterOpLayout()])\n\n# By inserting the ``PrintIR`` pass after ``FoldConstant``, the pass infra will\n# dump out the module IR wh [...]
+        "f = example()\nmod = tvm.IRModule.from_expr(f)\nseq = tvm.transform.Sequential(\n    [\n        relay.transform.FoldConstant(),\n        tvm.transform.PrintIR(),\n        relay.transform.EliminateCommonSubexpr(),\n        relay.transform.FuseOps(),\n        relay.transform.AlterOpLayout(),\n    ]\n)\n\n# By inserting the ``PrintIR`` pass after ``FoldConstant``, the pass infra will\n# dump out the module IR when ``FoldConstant`` is done. Users can plug in this\n# pass after any p [...]
       ]
     },
     {
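Editor's note: the recurring pattern in these cells is a tvm.transform.Sequential pipeline applied under a PassContext; a minimal sketch using only the passes named above (example() is the Relay function defined in the notebook's first cell):

    import tvm
    from tvm import relay

    mod = tvm.IRModule.from_expr(example())  # example() as defined in the first cell
    seq = tvm.transform.Sequential(
        [
            relay.transform.FoldConstant(),
            relay.transform.EliminateCommonSubexpr(),
            relay.transform.FuseOps(fuse_opt_level=2),
        ]
    )
    with tvm.transform.PassContext(opt_level=3):
        mod = seq(mod)
    print(mod)
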
diff --git a/docs/_downloads/143c743c62f58570eabd77fd3395ca8c/scan.py b/docs/_downloads/143c743c62f58570eabd77fd3395ca8c/scan.py
index 73790da..fdb6ec9 100644
--- a/docs/_downloads/143c743c62f58570eabd77fd3395ca8c/scan.py
+++ b/docs/_downloads/143c743c62f58570eabd77fd3395ca8c/scan.py
@@ -52,7 +52,7 @@ n = te.var("n")
 X = te.placeholder((m, n), name="X")
 s_state = te.placeholder((m, n))
 s_init = te.compute((1, n), lambda _, i: X[0, i])
-s_update = te.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i])
+s_update = te.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i])
 s_scan = tvm.te.scan(s_init, s_update, s_state, inputs=[X])
 
 ######################################################################
@@ -106,7 +106,7 @@ n = te.var("n")
 X = te.placeholder((m, n), name="X")
 s_state = te.placeholder((m, n))
 s_init = te.compute((1, n), lambda _, i: X[0, i])
-s_update_s1 = te.compute((m, n), lambda t, i: s_state[t-1, i] * 2, name="s1")
+s_update_s1 = te.compute((m, n), lambda t, i: s_state[t - 1, i] * 2, name="s1")
 s_update_s2 = te.compute((m, n), lambda t, i: s_update_s1[t, i] + X[t, i], name="s2")
 s_scan = tvm.te.scan(s_init, s_update_s2, s_state, inputs=[X])
 
@@ -135,11 +135,11 @@ s_state1 = te.placeholder((m, n))
 s_state2 = te.placeholder((m, l))
 s_init1 = te.compute((1, n), lambda _, i: X[0, i])
 s_init2 = te.compute((1, l), lambda _, i: 0.0)
-s_update1 = te.compute((m, n), lambda t, i: s_state1[t-1, i] + X[t, i])
-s_update2 = te.compute((m, l), lambda t, i: s_state2[t-1, i] + s_state1[t-1, 0])
-s_scan1, s_scan2 = tvm.te.scan([s_init1, s_init2],
-                            [s_update1, s_update2],
-                            [s_state1, s_state2], inputs=[X])
+s_update1 = te.compute((m, n), lambda t, i: s_state1[t - 1, i] + X[t, i])
+s_update2 = te.compute((m, l), lambda t, i: s_state2[t - 1, i] + s_state1[t - 1, 0])
+s_scan1, s_scan2 = tvm.te.scan(
+    [s_init1, s_init2], [s_update1, s_update2], [s_state1, s_state2], inputs=[X]
+)
 s = te.create_schedule(s_scan1.op)
 print(tvm.lower(s, [X, s_scan1, s_scan2], simple_mode=True))
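Editor's note: for intuition, the first scan above (s_init = X[0], s_update[t] = s_state[t - 1] + X[t]) is just a running sum over the time axis; numpy gives the reference result directly (a minimal sketch):

    import numpy as np

    m, n = 5, 3
    X_np = np.random.rand(m, n).astype("float32")
    ref = np.cumsum(X_np, axis=0)  # row t equals X[0] + ... + X[t], i.e. s_scan[t]
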
 
diff --git a/docs/_downloads/1604460dde2b82fb9db809bb388890f8/deploy_prequantized_tflite.ipynb b/docs/_downloads/1604460dde2b82fb9db809bb388890f8/deploy_prequantized_tflite.ipynb
index bacd7a6..db7c42f 100644
--- a/docs/_downloads/1604460dde2b82fb9db809bb388890f8/deploy_prequantized_tflite.ipynb
+++ b/docs/_downloads/1604460dde2b82fb9db809bb388890f8/deploy_prequantized_tflite.ipynb
@@ -51,7 +51,7 @@
       },
       "outputs": [],
       "source": [
-        "# Download mobilenet V2 TFLite model provided by Google\nfrom tvm.contrib.download import download_testdata\n\nmodel_url = \"https://storage.googleapis.com/download.tensorflow.org/models/\" \\\n             \"tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz\"\n\n# Download model tar file and extract it to get mobilenet_v2_1.0_224.tflite\nmodel_path = download_testdata(model_url, \"mobilenet_v2_1.0_224_quant.tgz\",\n                               module=['tf', 'official'])\nmodel_d [...]
+        "# Download mobilenet V2 TFLite model provided by Google\nfrom tvm.contrib.download import download_testdata\n\nmodel_url = (\n    \"https://storage.googleapis.com/download.tensorflow.org/models/\"\n    \"tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz\"\n)\n\n# Download model tar file and extract it to get mobilenet_v2_1.0_224.tflite\nmodel_path = download_testdata(\n    model_url, \"mobilenet_v2_1.0_224_quant.tgz\", module=[\"tf\", \"official\"]\n)\nmodel_dir = os.path.dirname(m [...]
       ]
     },
     {
@@ -69,7 +69,7 @@
       },
       "outputs": [],
       "source": [
-        "def extract(path):\n    import tarfile\n    if path.endswith(\"tgz\") or path.endswith(\"gz\"):\n        dir_path = os.path.dirname(path)\n        tar = tarfile.open(path)\n        tar.extractall(path=dir_path)\n        tar.close()\n    else:\n        raise RuntimeError('Could not decompress the file: ' + path)\n\nextract(model_path)"
+        "def extract(path):\n    import tarfile\n\n    if path.endswith(\"tgz\") or path.endswith(\"gz\"):\n        dir_path = os.path.dirname(path)\n        tar = tarfile.open(path)\n        tar.extractall(path=dir_path)\n        tar.close()\n    else:\n        raise RuntimeError(\"Could not decompress the file: \" + path)\n\n\nextract(model_path)"
       ]
     },
     {
@@ -94,7 +94,7 @@
       },
       "outputs": [],
       "source": [
-        "def get_real_image(im_height, im_width):\n    from PIL import Image\n    repo_base = 'https://github.com/dmlc/web-data/raw/master/tensorflow/models/InceptionV1/'\n    img_name = 'elephant-299.jpg'\n    image_url = os.path.join(repo_base, img_name)\n    img_path = download_testdata(image_url, img_name, module='data')\n    image = Image.open(img_path).resize((im_height, im_width))\n    x = np.array(image).astype('uint8')\n    data = np.reshape(x, (1, im_height, im_width, 3))\n     [...]
+        "def get_real_image(im_height, im_width):\n    from PIL import Image\n\n    repo_base = \"https://github.com/dmlc/web-data/raw/master/tensorflow/models/InceptionV1/\"\n    img_name = \"elephant-299.jpg\"\n    image_url = os.path.join(repo_base, img_name)\n    img_path = download_testdata(image_url, img_name, module=\"data\")\n    image = Image.open(img_path).resize((im_height, im_width))\n    x = np.array(image).astype(\"uint8\")\n    data = np.reshape(x, (1, im_height, im_width, [...]
       ]
     },
     {
@@ -119,7 +119,7 @@
       },
       "outputs": [],
       "source": [
-        "tflite_model_file = os.path.join(model_dir, \"mobilenet_v2_1.0_224_quant.tflite\")\ntflite_model_buf = open(tflite_model_file, \"rb\").read()\n\n# Get TFLite model from buffer\ntry:\n    import tflite\n    tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\nexcept AttributeError:\n    import tflite.Model\n    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)"
+        "tflite_model_file = os.path.join(model_dir, \"mobilenet_v2_1.0_224_quant.tflite\")\ntflite_model_buf = open(tflite_model_file, \"rb\").read()\n\n# Get TFLite model from buffer\ntry:\n    import tflite\n\n    tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\nexcept AttributeError:\n    import tflite.Model\n\n    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)"
       ]
     },
     {
@@ -137,7 +137,7 @@
       },
       "outputs": [],
       "source": [
-        "def run_tflite_model(tflite_model_buf, input_data):\n    \"\"\" Generic function to execute TFLite \"\"\"\n    try:\n        from tensorflow import lite as interpreter_wrapper\n    except ImportError:\n        from tensorflow.contrib import lite as interpreter_wrapper\n\n    input_data = input_data if isinstance(input_data, list) else [input_data]\n\n    interpreter = interpreter_wrapper.Interpreter(model_content=tflite_model_buf)\n    interpreter.allocate_tensors()\n\n    input [...]
+        "def run_tflite_model(tflite_model_buf, input_data):\n    \"\"\" Generic function to execute TFLite \"\"\"\n    try:\n        from tensorflow import lite as interpreter_wrapper\n    except ImportError:\n        from tensorflow.contrib import lite as interpreter_wrapper\n\n    input_data = input_data if isinstance(input_data, list) else [input_data]\n\n    interpreter = interpreter_wrapper.Interpreter(model_content=tflite_model_buf)\n    interpreter.allocate_tensors()\n\n    input [...]
       ]
     },
     {
@@ -155,7 +155,7 @@
       },
       "outputs": [],
       "source": [
-        "def run_tvm(lib):\n    from tvm.contrib import graph_runtime\n    rt_mod = graph_runtime.GraphModule(lib['default'](tvm.cpu(0)))\n    rt_mod.set_input('input', data)\n    rt_mod.run()\n    tvm_res = rt_mod.get_output(0).asnumpy()\n    tvm_pred = np.squeeze(tvm_res).argsort()[-5:][::-1]\n    return tvm_pred, rt_mod"
+        "def run_tvm(lib):\n    from tvm.contrib import graph_runtime\n\n    rt_mod = graph_runtime.GraphModule(lib[\"default\"](tvm.cpu(0)))\n    rt_mod.set_input(\"input\", data)\n    rt_mod.run()\n    tvm_res = rt_mod.get_output(0).asnumpy()\n    tvm_pred = np.squeeze(tvm_res).argsort()[-5:][::-1]\n    return tvm_pred, rt_mod"
       ]
     },
     {
@@ -205,7 +205,7 @@
       },
       "outputs": [],
       "source": [
-        "dtype_dict = {'input': data.dtype.name}\nshape_dict = {'input': data.shape}\n\nmod, params = relay.frontend.from_tflite(tflite_model,\n                                         shape_dict=shape_dict,\n                                         dtype_dict=dtype_dict)\n# print(mod)"
+        "dtype_dict = {\"input\": data.dtype.name}\nshape_dict = {\"input\": data.shape}\n\nmod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict)\n# print(mod)"
       ]
     },
     {
@@ -223,7 +223,7 @@
       },
       "outputs": [],
       "source": [
-        "target = 'llvm'\nwith tvm.transform.PassContext(opt_level=3):\n    lib = relay.build_module.build(mod, target=target, params=params)"
+        "target = \"llvm\"\nwith tvm.transform.PassContext(opt_level=3):\n    lib = relay.build_module.build(mod, target=target, params=params)"
       ]
     },
     {
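Editor's note: stitched together, the notebook cells above reduce to a short compile-and-run flow (a minimal sketch, assuming `tflite_model` and the input `data` prepared in earlier cells):

    import numpy as np
    import tvm
    from tvm import relay
    from tvm.contrib import graph_runtime

    shape_dict = {"input": data.shape}
    dtype_dict = {"input": data.dtype.name}
    mod, params = relay.frontend.from_tflite(
        tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict
    )

    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build_module.build(mod, target="llvm", params=params)

    rt_mod = graph_runtime.GraphModule(lib["default"](tvm.cpu(0)))
    rt_mod.set_input("input", data)
    rt_mod.run()
    top5 = np.squeeze(rt_mod.get_output(0).asnumpy()).argsort()[-5:][::-1]
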
diff --git a/docs/_downloads/2354a24ad8bc07194943c49f2fb48874/tune_conv2d_cuda.ipynb b/docs/_downloads/2354a24ad8bc07194943c49f2fb48874/tune_conv2d_cuda.ipynb
index f439224..4994c69 100644
--- a/docs/_downloads/2354a24ad8bc07194943c49f2fb48874/tune_conv2d_cuda.ipynb
+++ b/docs/_downloads/2354a24ad8bc07194943c49f2fb48874/tune_conv2d_cuda.ipynb
@@ -51,7 +51,7 @@
       },
       "outputs": [],
       "source": [
-        "@autotvm.template(\"tutorial/conv2d_no_batching\")\ndef conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):\n    assert N == 1, \"Only consider batch_size = 1 in this template\"\n\n    data = te.placeholder((N, CI, H, W), name='data')\n    kernel = te.placeholder((CO, CI, KH, KW), name='kernel')\n    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype='float32')\n    s = te.create_schedule([conv.op])\n\n    ##### space definition begin ##### [...]
+        "@autotvm.template(\"tutorial/conv2d_no_batching\")\ndef conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):\n    assert N == 1, \"Only consider batch_size = 1 in this template\"\n\n    data = te.placeholder((N, CI, H, W), name=\"data\")\n    kernel = te.placeholder((CO, CI, KH, KW), name=\"kernel\")\n    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype=\"float32\")\n    s = te.create_schedule([conv.op])\n\n    ##### space definition begin [...]
       ]
     },
     {
@@ -69,7 +69,7 @@
       },
       "outputs": [],
       "source": [
-        "# logging config (for printing tuning log to screen)\nlogging.getLogger('autotvm').setLevel(logging.DEBUG)\nlogging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))\n\n# the last layer in resnet\nN, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)\ntask = autotvm.task.create(\"tutorial/conv2d_no_batching\",\n                           args=(N, H, W, CO, CI, KH, KW, strides, padding),\n                           target='cuda')\npr [...]
+        "# logging config (for printing tuning log to screen)\nlogging.getLogger(\"autotvm\").setLevel(logging.DEBUG)\nlogging.getLogger(\"autotvm\").addHandler(logging.StreamHandler(sys.stdout))\n\n# the last layer in resnet\nN, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)\ntask = autotvm.task.create(\n    \"tutorial/conv2d_no_batching\", args=(N, H, W, CO, CI, KH, KW, strides, padding), target=\"cuda\"\n)\nprint(task.config_space)\n\n# Use local gpu, [...]
       ]
     },
     {
@@ -87,7 +87,7 @@
       },
       "outputs": [],
       "source": [
-        "# inspect the best config\ndispatch_context = autotvm.apply_history_best(\"conv2d.log\")\nbest_config = dispatch_context.query(task.target, task.workload)\nprint(\"\\nBest config:\")\nprint(best_config)\n\n# apply history best from log file\nwith autotvm.apply_history_best('conv2d.log'):\n    with tvm.target.create(\"cuda\"):\n        s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)\n        func = tvm.build(s, arg_bufs)\n\n# check correctness\na_np =  [...]
+        "# inspect the best config\ndispatch_context = autotvm.apply_history_best(\"conv2d.log\")\nbest_config = dispatch_context.query(task.target, task.workload)\nprint(\"\\nBest config:\")\nprint(best_config)\n\n# apply history best from log file\nwith autotvm.apply_history_best(\"conv2d.log\"):\n    with tvm.target.Target(\"cuda\"):\n        s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)\n        func = tvm.build(s, arg_bufs)\n\n# check correctness\na_np  [...]
       ]
     }
   ],
diff --git a/docs/_downloads/24a7471da81b18c4ba77d215289aed2f/relay_quick_start.ipynb b/docs/_downloads/24a7471da81b18c4ba77d215289aed2f/relay_quick_start.ipynb
index d9ac3bd..19d8a89 100644
--- a/docs/_downloads/24a7471da81b18c4ba77d215289aed2f/relay_quick_start.ipynb
+++ b/docs/_downloads/24a7471da81b18c4ba77d215289aed2f/relay_quick_start.ipynb
@@ -51,7 +51,7 @@
       },
       "outputs": [],
       "source": [
-        "batch_size = 1\nnum_class = 1000\nimage_shape = (3, 224, 224)\ndata_shape = (batch_size,) + image_shape\nout_shape = (batch_size, num_class)\n\nmod, params = relay.testing.resnet.get_workload(\n    num_layers=18, batch_size=batch_size, image_shape=image_shape)\n\n# set show_meta_data=True if you want to show meta data\nprint(mod.astext(show_meta_data=False))"
+        "batch_size = 1\nnum_class = 1000\nimage_shape = (3, 224, 224)\ndata_shape = (batch_size,) + image_shape\nout_shape = (batch_size, num_class)\n\nmod, params = relay.testing.resnet.get_workload(\n    num_layers=18, batch_size=batch_size, image_shape=image_shape\n)\n\n# set show_meta_data=True if you want to show meta data\nprint(mod.astext(show_meta_data=False))"
       ]
     },
     {
@@ -69,7 +69,7 @@
       },
       "outputs": [],
       "source": [
-        "opt_level = 3\ntarget = tvm.target.cuda()\nwith tvm.transform.PassContext(opt_level=opt_level):\n    graph, lib, params = relay.build(mod, target, params=params)"
+        "opt_level = 3\ntarget = tvm.target.cuda()\nwith tvm.transform.PassContext(opt_level=opt_level):\n    lib = relay.build(mod, target, params=params)"
       ]
     },
     {
@@ -87,7 +87,7 @@
       },
       "outputs": [],
       "source": [
-        "# create random input\nctx = tvm.gpu()\ndata = np.random.uniform(-1, 1, size=data_shape).astype(\"float32\")\n# create module\nmodule = graph_runtime.create(graph, lib, ctx)\n# set input and parameters\nmodule.set_input(\"data\", data)\nmodule.set_input(**params)\n# run\nmodule.run()\n# get output\nout = module.get_output(0, tvm.nd.empty(out_shape)).asnumpy()\n\n# Print first 10 elements of output\nprint(out.flatten()[0:10])"
+        "# create random input\nctx = tvm.gpu()\ndata = np.random.uniform(-1, 1, size=data_shape).astype(\"float32\")\n# create module\nmodule = graph_runtime.GraphModule(lib[\"default\"](ctx))\n# set input and parameters\nmodule.set_input(\"data\", data)\n# run\nmodule.run()\n# get output\nout = module.get_output(0, tvm.nd.empty(out_shape)).asnumpy()\n\n# Print first 10 elements of output\nprint(out.flatten()[0:10])"
       ]
     },
     {
@@ -105,7 +105,7 @@
       },
       "outputs": [],
       "source": [
-        "# save the graph, lib and params into separate files\nfrom tvm.contrib import util\n\ntemp = util.tempdir()\npath_lib = temp.relpath(\"deploy_lib.tar\")\nlib.export_library(path_lib)\nwith open(temp.relpath(\"deploy_graph.json\"), \"w\") as fo:\n    fo.write(graph)\nwith open(temp.relpath(\"deploy_param.params\"), \"wb\") as fo:\n    fo.write(relay.save_param_dict(params))\nprint(temp.listdir())"
+        "# save the graph, lib and params into separate files\nfrom tvm.contrib import util\n\ntemp = util.tempdir()\npath_lib = temp.relpath(\"deploy_lib.tar\")\nlib.export_library(path_lib)\nprint(temp.listdir())"
       ]
     },
     {
@@ -116,7 +116,7 @@
       },
       "outputs": [],
       "source": [
-        "# load the module back.\nloaded_json = open(temp.relpath(\"deploy_graph.json\")).read()\nloaded_lib = tvm.runtime.load_module(path_lib)\nloaded_params = bytearray(open(temp.relpath(\"deploy_param.params\"), \"rb\").read())\ninput_data = tvm.nd.array(np.random.uniform(size=data_shape).astype(\"float32\"))\n\nmodule = graph_runtime.create(loaded_json, loaded_lib, ctx)\nmodule.load_params(loaded_params)\nmodule.run(data=input_data)\nout_deploy = module.get_output(0).asnumpy()\n\n#  [...]
+        "# load the module back.\nloaded_lib = tvm.runtime.load_module(path_lib)\ninput_data = tvm.nd.array(np.random.uniform(size=data_shape).astype(\"float32\"))\n\nmodule = graph_runtime.GraphModule(loaded_lib[\"default\"](ctx))\nmodule.run(data=input_data)\nout_deploy = module.get_output(0).asnumpy()\n\n# Print first 10 elements of output\nprint(out_deploy.flatten()[0:10])\n\n# check whether the output from deployed module is consistent with original one\ntvm.testing.assert_allclose( [...]
       ]
     }
   ],
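Editor's note: the updated cells amount to a one-artifact export/load round trip, since `export_library` now carries the graph, compiled lib and params together (a minimal sketch, assuming `lib`, `ctx` and `data_shape` from the cells above):

    import numpy as np
    import tvm
    from tvm.contrib import util
    from tvm.contrib import graph_runtime

    temp = util.tempdir()
    path_lib = temp.relpath("deploy_lib.tar")
    lib.export_library(path_lib)

    loaded_lib = tvm.runtime.load_module(path_lib)
    module = graph_runtime.GraphModule(loaded_lib["default"](ctx))
    module.run(data=tvm.nd.array(np.random.uniform(size=data_shape).astype("float32")))
    out = module.get_output(0).asnumpy()
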
diff --git a/docs/_downloads/272a5a893d007658546dc0eaf0a7aeed/tune_relay_cuda.py b/docs/_downloads/272a5a893d007658546dc0eaf0a7aeed/tune_relay_cuda.py
index 6a995af..32ee266 100644
--- a/docs/_downloads/272a5a893d007658546dc0eaf0a7aeed/tune_relay_cuda.py
+++ b/docs/_downloads/272a5a893d007658546dc0eaf0a7aeed/tune_relay_cuda.py
@@ -75,37 +75,48 @@ import tvm.contrib.graph_runtime as runtime
 # We can load some pre-defined network from :code:`tvm.relay.testing`.
 # We can also load models from MXNet, ONNX and TensorFlow.
 
+
 def get_network(name, batch_size):
     """Get the symbol definition and random weight of a network"""
     input_shape = (batch_size, 3, 224, 224)
     output_shape = (batch_size, 1000)
 
     if "resnet" in name:
-        n_layer = int(name.split('-')[1])
-        mod, params = relay.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
+        n_layer = int(name.split("-")[1])
+        mod, params = relay.testing.resnet.get_workload(
+            num_layers=n_layer, batch_size=batch_size, dtype=dtype
+        )
     elif "vgg" in name:
-        n_layer = int(name.split('-')[1])
-        mod, params = relay.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
-    elif name == 'mobilenet':
+        n_layer = int(name.split("-")[1])
+        mod, params = relay.testing.vgg.get_workload(
+            num_layers=n_layer, batch_size=batch_size, dtype=dtype
+        )
+    elif name == "mobilenet":
         mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype)
-    elif name == 'squeezenet_v1.1':
-        mod, params = relay.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1', dtype=dtype)
-    elif name == 'inception_v3':
+    elif name == "squeezenet_v1.1":
+        mod, params = relay.testing.squeezenet.get_workload(
+            batch_size=batch_size, version="1.1", dtype=dtype
+        )
+    elif name == "inception_v3":
         input_shape = (1, 3, 299, 299)
         mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
-    elif name == 'mxnet':
+    elif name == "mxnet":
         # an example for mxnet model
         from mxnet.gluon.model_zoo.vision import get_model
-        block = get_model('resnet18_v1', pretrained=True)
-        mod, params = relay.frontend.from_mxnet(block, shape={'data': input_shape}, dtype=dtype)
+
+        block = get_model("resnet18_v1", pretrained=True)
+        mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype)
         net = mod["main"]
-        net = relay.Function(net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs)
+        net = relay.Function(
+            net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs
+        )
         mod = tvm.IRModule.from_expr(net)
     else:
         raise ValueError("Unsupported network: " + name)
 
     return mod, params, input_shape, output_shape
 
+
 ###########################################
 # Set Tuning Options
 # ------------------
@@ -115,18 +126,16 @@ def get_network(name, batch_size):
 target = tvm.target.cuda()
 
 #### TUNING OPTION ####
-network = 'resnet-18'
+network = "resnet-18"
 log_file = "%s.log" % network
-dtype = 'float32'
+dtype = "float32"
 
 tuning_option = {
-    'log_filename': log_file,
-
-    'tuner': 'xgb',
-    'n_trial': 2000,
-    'early_stopping': 600,
-
-    'measure_option': autotvm.measure_option(
+    "log_filename": log_file,
+    "tuner": "xgb",
+    "n_trial": 2000,
+    "early_stopping": 600,
+    "measure_option": autotvm.measure_option(
         builder=autotvm.LocalBuilder(timeout=10),
         runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150),
     ),
@@ -154,29 +163,31 @@ tuning_option = {
 # We will introduce a more sophisticated tuning scheduler in the future.
 
 # You can skip the implementation of this function for this tutorial.
-def tune_tasks(tasks,
-               measure_option,
-               tuner='xgb',
-               n_trial=1000,
-               early_stopping=None,
-               log_filename='tuning.log',
-               use_transfer_learning=True):
+def tune_tasks(
+    tasks,
+    measure_option,
+    tuner="xgb",
+    n_trial=1000,
+    early_stopping=None,
+    log_filename="tuning.log",
+    use_transfer_learning=True,
+):
     # create tmp log file
     tmp_log_file = log_filename + ".tmp"
     if os.path.exists(tmp_log_file):
         os.remove(tmp_log_file)
 
     for i, tsk in enumerate(reversed(tasks)):
-        prefix = "[Task %2d/%2d] " %(i+1, len(tasks))
+        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
 
         # create tuner
-        if tuner == 'xgb' or tuner == 'xgb-rank':
-            tuner_obj = XGBTuner(tsk, loss_type='rank')
-        elif tuner == 'ga':
+        if tuner == "xgb" or tuner == "xgb-rank":
+            tuner_obj = XGBTuner(tsk, loss_type="rank")
+        elif tuner == "ga":
             tuner_obj = GATuner(tsk, pop_size=100)
-        elif tuner == 'random':
+        elif tuner == "random":
             tuner_obj = RandomTuner(tsk)
-        elif tuner == 'gridsearch':
+        elif tuner == "gridsearch":
             tuner_obj = GridSearchTuner(tsk)
         else:
             raise ValueError("Invalid tuner: " + tuner)
@@ -187,13 +198,15 @@ def tune_tasks(tasks,
 
         # do tuning
         tsk_trial = min(n_trial, len(tsk.config_space))
-        tuner_obj.tune(n_trial=tsk_trial,
-                       early_stopping=early_stopping,
-                       measure_option=measure_option,
-                       callbacks=[
-                           autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
-                           autotvm.callback.log_to_file(tmp_log_file)
-                       ])
+        tuner_obj.tune(
+            n_trial=tsk_trial,
+            early_stopping=early_stopping,
+            measure_option=measure_option,
+            callbacks=[
+                autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
+                autotvm.callback.log_to_file(tmp_log_file),
+            ],
+        )
 
     # pick best records to a cache file
     autotvm.record.pick_best(tmp_log_file, log_filename)
@@ -203,13 +216,14 @@ def tune_tasks(tasks,
 ########################################################################
 # Finally, we launch tuning jobs and evaluate the end-to-end performance.
 
+
 def tune_and_evaluate(tuning_opt):
     # extract workloads from relay program
     print("Extract tasks...")
     mod, params, input_shape, out_shape = get_network(network, batch_size=1)
-    tasks = autotvm.task.extract_from_program(mod["main"], target=target,
-                                              params=params,
-                                              ops=(relay.op.get("nn.conv2d"),))
+    tasks = autotvm.task.extract_from_program(
+        mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"),)
+    )
 
     # run tuning tasks
     print("Tuning...")
@@ -219,8 +233,7 @@ def tune_and_evaluate(tuning_opt):
     with autotvm.apply_history_best(log_file):
         print("Compile...")
         with tvm.transform.PassContext(opt_level=3):
-            graph, lib, params = relay.build_module.build(
-                mod, target=target, params=params)
+            lib = relay.build_module.build(mod, target=target, params=params)
 
         # export library
         tmp = tempdir()
@@ -229,17 +242,19 @@ def tune_and_evaluate(tuning_opt):
 
         # load parameters
         ctx = tvm.context(str(target), 0)
-        module = runtime.create(graph, lib, ctx)
+        module = runtime.GraphModule(lib["default"](ctx))
         data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
-        module.set_input('data', data_tvm)
-        module.set_input(**params)
+        module.set_input("data", data_tvm)
 
         # evaluate
         print("Evaluate inference time cost...")
         ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600)
         prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
-        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
-              (np.mean(prof_res), np.std(prof_res)))
+        print(
+            "Mean inference time (std dev): %.2f ms (%.2f ms)"
+            % (np.mean(prof_res), np.std(prof_res))
+        )
+
 
 # We do not run the tuning in our webpage server since it takes too long.
 # Uncomment the following line to run it by yourself.
@@ -358,17 +373,20 @@ def tune_and_evaluate(tuning_opt):
 # to replace the corresponding part above.
 
 tuning_option = {
-    'log_filename': log_file,
-
-    'tuner': 'xgb',
-    'n_trial': 2000,
-    'early_stopping': 600,
-
-    'measure_option': autotvm.measure_option(
+    "log_filename": log_file,
+    "tuner": "xgb",
+    "n_trial": 2000,
+    "early_stopping": 600,
+    "measure_option": autotvm.measure_option(
         builder=autotvm.LocalBuilder(timeout=10),
         runner=autotvm.RPCRunner(
-            '1080ti',  # change the device key to your key
-            '0.0.0.0', 9190,
-            number=20, repeat=3, timeout=4, min_repeat_ms=150),
+            "1080ti",  # change the device key to your key
+            "0.0.0.0",
+            9190,
+            number=20,
+            repeat=3,
+            timeout=4,
+            min_repeat_ms=150,
+        ),
     ),
 }
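Editor's note: with an RPC tracker listening at the 0.0.0.0:9190 address above and a device registered under the "1080ti" key, this option dict drops into the same driver as the local setup (a one-line sketch):

    # launches task extraction, tuning via the RPC tracker, then compilation and timing
    tune_and_evaluate(tuning_option)
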
diff --git a/docs/_downloads/2c0ed53a9ebd68caf76cd8235fae2711/tune_relay_mobile_gpu.ipynb b/docs/_downloads/2c0ed53a9ebd68caf76cd8235fae2711/tune_relay_mobile_gpu.ipynb
index 07fb0ba..0e17ec1 100644
--- a/docs/_downloads/2c0ed53a9ebd68caf76cd8235fae2711/tune_relay_mobile_gpu.ipynb
+++ b/docs/_downloads/2c0ed53a9ebd68caf76cd8235fae2711/tune_relay_mobile_gpu.ipynb
@@ -51,7 +51,7 @@
       },
       "outputs": [],
       "source": [
-        "def get_network(name, batch_size):\n    \"\"\"Get the symbol definition and random weight of a network\"\"\"\n    input_shape = (batch_size, 3, 224, 224)\n    output_shape = (batch_size, 1000)\n\n    if \"resnet\" in name:\n        n_layer = int(name.split('-')[1])\n        mod, params = relay.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)\n    elif \"vgg\" in name:\n        n_layer = int(name.split('-')[1])\n        mod, params = relay.testi [...]
+        "def get_network(name, batch_size):\n    \"\"\"Get the symbol definition and random weight of a network\"\"\"\n    input_shape = (batch_size, 3, 224, 224)\n    output_shape = (batch_size, 1000)\n\n    if \"resnet\" in name:\n        n_layer = int(name.split(\"-\")[1])\n        mod, params = relay.testing.resnet.get_workload(\n            num_layers=n_layer, batch_size=batch_size, dtype=dtype\n        )\n    elif \"vgg\" in name:\n        n_layer = int(name.split(\"-\")[1])\n      [...]
       ]
     },
     {
@@ -83,7 +83,7 @@
       },
       "outputs": [],
       "source": [
-        "#### DEVICE CONFIG ####\n\ntarget = tvm.target.create('opencl -device=mali')\n\n# Replace \"aarch64-linux-gnu\" with the correct target of your board.\n# This target host is used for cross compilation. You can query it by :code:`gcc -v` on your device.\ntarget_host = 'llvm -mtriple=aarch64-linux-gnu'\n\n# Also replace this with the device key in your tracker\ndevice_key = 'rk3399'\n\n# Set this to True if you use android phone\nuse_android = False\n\n#### TUNING OPTION ####\nnet [...]
+        "#### DEVICE CONFIG ####\n\ntarget = tvm.target.Target(\"opencl -device=mali\")\n\n# Replace \"aarch64-linux-gnu\" with the correct target of your board.\n# This target host is used for cross compilation. You can query it by :code:`gcc -v` on your device.\ntarget_host = \"llvm -mtriple=aarch64-linux-gnu\"\n\n# Also replace this with the device key in your tracker\ndevice_key = \"rk3399\"\n\n# Set this to True if you use android phone\nuse_android = False\n\n#### TUNING OPTION ### [...]
       ]
     },
     {
@@ -108,7 +108,7 @@
       },
       "outputs": [],
       "source": [
-        "# You can skip the implementation of this function for this tutorial.\ndef tune_tasks(tasks,\n               measure_option,\n               tuner='xgb',\n               n_trial=1000,\n               early_stopping=None,\n               log_filename='tuning.log',\n               use_transfer_learning=True):\n    # create tmp log file\n    tmp_log_file = log_filename + \".tmp\"\n    if os.path.exists(tmp_log_file):\n        os.remove(tmp_log_file)\n\n    for i, tsk in enumerate(r [...]
+        "# You can skip the implementation of this function for this tutorial.\ndef tune_tasks(\n    tasks,\n    measure_option,\n    tuner=\"xgb\",\n    n_trial=1000,\n    early_stopping=None,\n    log_filename=\"tuning.log\",\n    use_transfer_learning=True,\n):\n    # create tmp log file\n    tmp_log_file = log_filename + \".tmp\"\n    if os.path.exists(tmp_log_file):\n        os.remove(tmp_log_file)\n\n    for i, tsk in enumerate(reversed(tasks)):\n        prefix = \"[Task %2d/%2d] \ [...]
       ]
     },
     {
@@ -126,7 +126,7 @@
       },
       "outputs": [],
       "source": [
-        "def tune_and_evaluate(tuning_opt):\n    # extract workloads from relay program\n    print(\"Extract tasks...\")\n    mod, params, input_shape, _ = get_network(network, batch_size=1)\n    tasks = autotvm.task.extract_from_program(mod[\"main\"],\n                                              target=target,\n                                              target_host=target_host,\n                                              params=params,\n                                           [...]
+        "def tune_and_evaluate(tuning_opt):\n    # extract workloads from relay program\n    print(\"Extract tasks...\")\n    mod, params, input_shape, _ = get_network(network, batch_size=1)\n    tasks = autotvm.task.extract_from_program(\n        mod[\"main\"],\n        target=target,\n        target_host=target_host,\n        params=params,\n        ops=(relay.op.get(\"nn.conv2d\"),),\n    )\n\n    # run tuning tasks\n    print(\"Tuning...\")\n    tune_tasks(tasks, **tuning_opt)\n\n    [...]
       ]
     },
     {
diff --git a/docs/_downloads/2c8ef0390ad4c53ca85671fa36c33b26/tune_conv2d_cuda.py b/docs/_downloads/2c8ef0390ad4c53ca85671fa36c33b26/tune_conv2d_cuda.py
index 9043151..ce9c198 100644
--- a/docs/_downloads/2c8ef0390ad4c53ca85671fa36c33b26/tune_conv2d_cuda.py
+++ b/docs/_downloads/2c8ef0390ad4c53ca85671fa36c33b26/tune_conv2d_cuda.py
@@ -79,13 +79,14 @@ from tvm import autotvm
 # can be very large (at the level of 10^9 for some input shapes)
 #
 
+
 @autotvm.template("tutorial/conv2d_no_batching")
 def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
     assert N == 1, "Only consider batch_size = 1 in this template"
 
-    data = te.placeholder((N, CI, H, W), name='data')
-    kernel = te.placeholder((CO, CI, KH, KW), name='kernel')
-    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype='float32')
+    data = te.placeholder((N, CI, H, W), name="data")
+    kernel = te.placeholder((CO, CI, KH, KW), name="kernel")
+    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype="float32")
     s = te.create_schedule([conv.op])
 
     ##### space definition begin #####
@@ -109,13 +110,13 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
     data, raw_data = pad_data, data
 
     output = conv
-    OL = s.cache_write(conv, 'local')
+    OL = s.cache_write(conv, "local")
 
     # create cache stage
-    AA = s.cache_read(data, 'shared', [OL])
-    WW = s.cache_read(kernel, 'shared', [OL])
-    AL = s.cache_read(AA, 'local', [OL])
-    WL = s.cache_read(WW, 'local', [OL])
+    AA = s.cache_read(data, "shared", [OL])
+    WW = s.cache_read(kernel, "shared", [OL])
+    AL = s.cache_read(AA, "local", [OL])
+    WL = s.cache_read(WW, "local", [OL])
 
     # tile and bind spatial axes
     n, f, y, x = s[output].op.axis
@@ -139,9 +140,9 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
     # tile reduction axes
     n, f, y, x = s[OL].op.axis
     rc, ry, rx = s[OL].op.reduce_axis
-    rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc)
-    ryo, rym, ryi = cfg['tile_rx'].apply(s, OL, ry)
-    rxo, rxm, rxi = cfg['tile_ry'].apply(s, OL, rx)
+    rco, rcm, rci = cfg["tile_rc"].apply(s, OL, rc)
+    ryo, rym, ryi = cfg["tile_rx"].apply(s, OL, ry)
+    rxo, rxm, rxi = cfg["tile_ry"].apply(s, OL, rx)
     s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, y, x)
 
     s[AA].compute_at(s[OL], rxo)
@@ -161,11 +162,12 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
         s[load].bind(tx, te.thread_axis("threadIdx.x"))
 
     # tune unroll
-    s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
-    s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
+    s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
+    s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
 
     return s, [raw_data, kernel, conv]
 
+
 ######################################################################
 # Step 2:  Search through the space
 # ---------------------------------
@@ -176,30 +178,32 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
 # for this template
 
 # logging config (for printing tuning log to screen)
-logging.getLogger('autotvm').setLevel(logging.DEBUG)
-logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))
+logging.getLogger("autotvm").setLevel(logging.DEBUG)
+logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout))
 
 # the last layer in resnet
 N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)
-task = autotvm.task.create("tutorial/conv2d_no_batching",
-                           args=(N, H, W, CO, CI, KH, KW, strides, padding),
-                           target='cuda')
+task = autotvm.task.create(
+    "tutorial/conv2d_no_batching", args=(N, H, W, CO, CI, KH, KW, strides, padding), target="cuda"
+)
 print(task.config_space)
 
 # Use local gpu, measure 10 times for every config to reduce variance
 # The timeout of compiling a program is 10 seconds, the timeout for running is 4 seconds
 measure_option = autotvm.measure_option(
     builder=autotvm.LocalBuilder(),
-    runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4)
+    runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4),
 )
 
 # Begin tuning, log records to file `conv2d.log`
 # During tuning we will also try many invalid configs, so you are expected to
 # see many error reports. As long as you can see non-zero GFLOPS, it is okay.
 tuner = autotvm.tuner.XGBTuner(task)
-tuner.tune(n_trial=20,
-           measure_option=measure_option,
-           callbacks=[autotvm.callback.log_to_file('conv2d.log')])
+tuner.tune(
+    n_trial=20,
+    measure_option=measure_option,
+    callbacks=[autotvm.callback.log_to_file("conv2d.log")],
+)
 
 #########################################################################
 # Finally we can inspect the best config from log file, check correctness,
@@ -212,8 +216,8 @@ print("\nBest config:")
 print(best_config)
 
 # apply history best from log file
-with autotvm.apply_history_best('conv2d.log'):
-    with tvm.target.create("cuda"):
+with autotvm.apply_history_best("conv2d.log"):
+    with tvm.target.Target("cuda"):
         s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
         func = tvm.build(s, arg_bufs)
 
@@ -233,5 +237,4 @@ tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
 # Evaluate running time. Here we choose a large repeat number (400) to reduce the noise
 # and the overhead of kernel launch. You can also use nvprof to validate the result.
 evaluator = func.time_evaluator(func.entry_name, ctx, number=400)
-print('Time cost of this operator: %f' % evaluator(a_tvm, w_tvm, c_tvm).mean)
-
+print("Time cost of this operator: %f" % evaluator(a_tvm, w_tvm, c_tvm).mean)
diff --git a/docs/_downloads/2daaacf3c023a8ad30b14e52b9aaa635/matrix_multiply_opt.ipynb b/docs/_downloads/2daaacf3c023a8ad30b14e52b9aaa635/matrix_multiply_opt.ipynb
index 9fc93b2..0cf7055 100644
--- a/docs/_downloads/2daaacf3c023a8ad30b14e52b9aaa635/matrix_multiply_opt.ipynb
+++ b/docs/_downloads/2daaacf3c023a8ad30b14e52b9aaa635/matrix_multiply_opt.ipynb
@@ -51,7 +51,7 @@
       },
       "outputs": [],
       "source": [
-        "# Fully connected layer dimensions: 1024 x 1024\nbatch_size = 1\nin_channels = 1024\nout_channels = 1024\nassert batch_size % env.BATCH == 0\nassert in_channels % env.BLOCK_IN == 0\nassert out_channels % env.BLOCK_OUT == 0\n\n# Let's derive the tiled input tensor shapes\ndata_shape = (batch_size // env.BATCH,\n              in_channels // env.BLOCK_IN,\n              env.BATCH,\n              env.BLOCK_IN)\nweight_shape = (out_channels // env.BLOCK_OUT,\n                in_chann [...]
+        "# Fully connected layer dimensions: 1024 x 1024\nbatch_size = 1\nin_channels = 1024\nout_channels = 1024\nassert batch_size % env.BATCH == 0\nassert in_channels % env.BLOCK_IN == 0\nassert out_channels % env.BLOCK_OUT == 0\n\n# Let's derive the tiled input tensor shapes\ndata_shape = (batch_size // env.BATCH, in_channels // env.BLOCK_IN, env.BATCH, env.BLOCK_IN)\nweight_shape = (\n    out_channels // env.BLOCK_OUT,\n    in_channels // env.BLOCK_IN,\n    env.BLOCK_OUT,\n    env.B [...]
       ]
     },
     {
@@ -141,7 +141,7 @@
       },
       "outputs": [],
       "source": [
-        "# Compile the TVM module\nmy_gemm = vta.build(s, [data, weight, res], \"ext_dev\", env.target_host, name=\"my_gemm\")\ntemp = util.tempdir()\nmy_gemm.save(temp.relpath(\"gemm.o\"))\nremote.upload(temp.relpath(\"gemm.o\"))\nf = remote.load_module(\"gemm.o\")\n\n# Get the remote device context\nctx = remote.ext_dev(0)\n\n# Initialize the data and weight arrays randomly in the int range of (-128, 128]\ndata_np = np.random.randint(\n    -128, 128, size=(batch_size, in_channels)).ast [...]
+        "# Compile the TVM module\nmy_gemm = vta.build(s, [data, weight, res], \"ext_dev\", env.target_host, name=\"my_gemm\")\ntemp = util.tempdir()\nmy_gemm.save(temp.relpath(\"gemm.o\"))\nremote.upload(temp.relpath(\"gemm.o\"))\nf = remote.load_module(\"gemm.o\")\n\n# Get the remote device context\nctx = remote.ext_dev(0)\n\n# Initialize the data and weight arrays randomly in the int range of (-128, 128]\ndata_np = np.random.randint(-128, 128, size=(batch_size, in_channels)).astype(da [...]
       ]
     },
     {
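Editor's note: the tiled shapes in the first cell all follow one rule, splitting each logical dimension into an (outer, block) pair with block sizes fixed by the hardware (a minimal sketch with stand-in values for env.BATCH, env.BLOCK_IN and env.BLOCK_OUT):

    batch_size, in_channels, out_channels = 1, 1024, 1024
    BATCH, BLOCK_IN, BLOCK_OUT = 1, 16, 16  # stand-ins for env.*

    data_shape = (batch_size // BATCH, in_channels // BLOCK_IN, BATCH, BLOCK_IN)
    weight_shape = (
        out_channels // BLOCK_OUT,
        in_channels // BLOCK_IN,
        BLOCK_OUT,
        BLOCK_IN,
    )
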
diff --git a/docs/_downloads/2e974b05b6d59fcf944f96d27106b994/from_keras.ipynb b/docs/_downloads/2e974b05b6d59fcf944f96d27106b994/from_keras.ipynb
index b78d755..154de67 100644
--- a/docs/_downloads/2e974b05b6d59fcf944f96d27106b994/from_keras.ipynb
+++ b/docs/_downloads/2e974b05b6d59fcf944f96d27106b994/from_keras.ipynb
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "weights_url = ''.join(['https://github.com/fchollet/deep-learning-models/releases/',\n                       'download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels.h5'])\nweights_file = 'resnet50_weights.h5'\nweights_path = download_testdata(weights_url, weights_file, module='keras')\nkeras_resnet50 = keras.applications.resnet50.ResNet50(include_top=True, weights=None,\n                                                      input_shape=(224, 224, 3), classes=1000)\nkeras_resn [...]
+        "weights_url = \"\".join(\n    [\n        \"https://github.com/fchollet/deep-learning-models/releases/\",\n        \"download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels.h5\",\n    ]\n)\nweights_file = \"resnet50_weights.h5\"\nweights_path = download_testdata(weights_url, weights_file, module=\"keras\")\nkeras_resnet50 = keras.applications.resnet50.ResNet50(\n    include_top=True, weights=None, input_shape=(224, 224, 3), classes=1000\n)\nkeras_resnet50.load_weights(weights_path)"
       ]
     },
     {
@@ -62,7 +62,7 @@
       },
       "outputs": [],
       "source": [
-        "from PIL import Image\nfrom matplotlib import pyplot as plt\nfrom keras.applications.resnet50 import preprocess_input\nimg_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'\nimg_path = download_testdata(img_url, 'cat.png', module='data')\nimg = Image.open(img_path).resize((224, 224))\nplt.imshow(img)\nplt.show()\n# input preprocess\ndata = np.array(img)[np.newaxis, :].astype('float32')\ndata = preprocess_input(data).transpose([0, 3, 1, 2])\nprint('input_ [...]
+        "from PIL import Image\nfrom matplotlib import pyplot as plt\nfrom keras.applications.resnet50 import preprocess_input\n\nimg_url = \"https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true\"\nimg_path = download_testdata(img_url, \"cat.png\", module=\"data\")\nimg = Image.open(img_path).resize((224, 224))\nplt.imshow(img)\nplt.show()\n# input preprocess\ndata = np.array(img)[np.newaxis, :].astype(\"float32\")\ndata = preprocess_input(data).transpose([0, 3, 1, 2])\npri [...]
       ]
     },
     {
@@ -80,7 +80,7 @@
       },
       "outputs": [],
       "source": [
-        "shape_dict = {'input_1': data.shape}\nmod, params = relay.frontend.from_keras(keras_resnet50, shape_dict)\n# compile the model\ntarget = 'cuda'\nctx = tvm.gpu(0)\nwith tvm.transform.PassContext(opt_level=3):\n    executor = relay.build_module.create_executor('graph', mod, ctx, target)"
+        "shape_dict = {\"input_1\": data.shape}\nmod, params = relay.frontend.from_keras(keras_resnet50, shape_dict)\n# compile the model\ntarget = \"cuda\"\nctx = tvm.gpu(0)\nwith tvm.transform.PassContext(opt_level=3):\n    executor = relay.build_module.create_executor(\"graph\", mod, ctx, target)"
       ]
     },
     {
@@ -98,7 +98,7 @@
       },
       "outputs": [],
       "source": [
-        "dtype = 'float32'\ntvm_out = executor.evaluate()(tvm.nd.array(data.astype(dtype)), **params)\ntop1_tvm = np.argmax(tvm_out.asnumpy()[0])"
+        "dtype = \"float32\"\ntvm_out = executor.evaluate()(tvm.nd.array(data.astype(dtype)), **params)\ntop1_tvm = np.argmax(tvm_out.asnumpy()[0])"
       ]
     },
     {
@@ -116,7 +116,7 @@
       },
       "outputs": [],
       "source": [
-        "synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',\n                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',\n                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',\n                      'imagenet1000_clsid_to_human.txt'])\nsynset_name = 'imagenet1000_clsid_to_human.txt'\nsynset_path = download_testdata(synset_url, synset_name, module='data')\nwith open(synset_path) as f:\n    synset = eval(f.read())\nprint('Relay top-1 id: {}, class name:  [...]
+        "synset_url = \"\".join(\n    [\n        \"https://gist.githubusercontent.com/zhreshold/\",\n        \"4d0b62f3d01426887599d4f7ede23ee5/raw/\",\n        \"596b27d23537e5a1b5751d2b0481ef172f58b539/\",\n        \"imagenet1000_clsid_to_human.txt\",\n    ]\n)\nsynset_name = \"imagenet1000_clsid_to_human.txt\"\nsynset_path = download_testdata(synset_url, synset_name, module=\"data\")\nwith open(synset_path) as f:\n    synset = eval(f.read())\nprint(\"Relay top-1 id: {}, class name: {} [...]
       ]
     }
   ],
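
(Note: the from_keras cells above read more easily collapsed into one linear flow. A minimal sketch, assuming a CUDA-enabled TVM build and the `keras_resnet50` model and preprocessed `data` array prepared as in the earlier cells:)

    import numpy as np
    import tvm
    from tvm import relay

    shape_dict = {"input_1": data.shape}  # name of the Keras input layer
    mod, params = relay.frontend.from_keras(keras_resnet50, shape_dict)
    with tvm.transform.PassContext(opt_level=3):
        executor = relay.build_module.create_executor("graph", mod, tvm.gpu(0), "cuda")
    tvm_out = executor.evaluate()(tvm.nd.array(data.astype("float32")), **params)
    top1 = np.argmax(tvm_out.asnumpy()[0])  # index into the synset lookup table
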
diff --git a/docs/_downloads/2f6dcf56b15f857f94b6d320c1ace6e5/from_coreml.ipynb b/docs/_downloads/2f6dcf56b15f857f94b6d320c1ace6e5/from_coreml.ipynb
index 957c3ac..f02c05b 100644
--- a/docs/_downloads/2f6dcf56b15f857f94b6d320c1ace6e5/from_coreml.ipynb
+++ b/docs/_downloads/2f6dcf56b15f857f94b6d320c1ace6e5/from_coreml.ipynb
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "model_url = 'https://docs-assets.developer.apple.com/coreml/models/MobileNet.mlmodel'\nmodel_file = 'mobilenet.mlmodel'\nmodel_path = download_testdata(model_url, model_file, module='coreml')\n# Now you have mobilenet.mlmodel on disk\nmlmodel = cm.models.MLModel(model_path)"
+        "model_url = \"https://docs-assets.developer.apple.com/coreml/models/MobileNet.mlmodel\"\nmodel_file = \"mobilenet.mlmodel\"\nmodel_path = download_testdata(model_url, model_file, module=\"coreml\")\n# Now you have mobilenet.mlmodel on disk\nmlmodel = cm.models.MLModel(model_path)"
       ]
     },
     {
@@ -62,7 +62,7 @@
       },
       "outputs": [],
       "source": [
-        "img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'\nimg_path = download_testdata(img_url, 'cat.png', module='data')\nimg = Image.open(img_path).resize((224, 224))\n# Mobilenet.mlmodel's input is BGR format\nimg_bgr = np.array(img)[:,:,::-1]\nx = np.transpose(img_bgr, (2, 0, 1))[np.newaxis, :]"
+        "img_url = \"https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true\"\nimg_path = download_testdata(img_url, \"cat.png\", module=\"data\")\nimg = Image.open(img_path).resize((224, 224))\n# Mobilenet.mlmodel's input is BGR format\nimg_bgr = np.array(img)[:, :, ::-1]\nx = np.transpose(img_bgr, (2, 0, 1))[np.newaxis, :]"
       ]
     },
     {
@@ -80,7 +80,7 @@
       },
       "outputs": [],
       "source": [
-        "target = 'llvm'\nshape_dict = {'image': x.shape}\n\n# Parse CoreML model and convert into Relay computation graph\nmod, params = relay.frontend.from_coreml(mlmodel, shape_dict)\n\nwith tvm.transform.PassContext(opt_level=3):\n    lib = relay.build(mod, target, params=params)"
+        "target = \"llvm\"\nshape_dict = {\"image\": x.shape}\n\n# Parse CoreML model and convert into Relay computation graph\nmod, params = relay.frontend.from_coreml(mlmodel, shape_dict)\n\nwith tvm.transform.PassContext(opt_level=3):\n    lib = relay.build(mod, target, params=params)"
       ]
     },
     {
@@ -98,7 +98,7 @@
       },
       "outputs": [],
       "source": [
-        "from tvm.contrib import graph_runtime\nctx = tvm.cpu(0)\ndtype = 'float32'\nm = graph_runtime.GraphModule(lib['default'](ctx))\n# set inputs\nm.set_input('image', tvm.nd.array(x.astype(dtype)))\n# execute\nm.run()\n# get outputs\ntvm_output = m.get_output(0)\ntop1 = np.argmax(tvm_output.asnumpy()[0])"
+        "from tvm.contrib import graph_runtime\n\nctx = tvm.cpu(0)\ndtype = \"float32\"\nm = graph_runtime.GraphModule(lib[\"default\"](ctx))\n# set inputs\nm.set_input(\"image\", tvm.nd.array(x.astype(dtype)))\n# execute\nm.run()\n# get outputs\ntvm_output = m.get_output(0)\ntop1 = np.argmax(tvm_output.asnumpy()[0])"
       ]
     },
     {
@@ -116,7 +116,7 @@
       },
       "outputs": [],
       "source": [
-        "synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',\n                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',\n                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',\n                      'imagenet1000_clsid_to_human.txt'])\nsynset_name = 'imagenet1000_clsid_to_human.txt'\nsynset_path = download_testdata(synset_url, synset_name, module='data')\nwith open(synset_path) as f:\n    synset = eval(f.read())\n# You should see the following result:  [...]
+        "synset_url = \"\".join(\n    [\n        \"https://gist.githubusercontent.com/zhreshold/\",\n        \"4d0b62f3d01426887599d4f7ede23ee5/raw/\",\n        \"596b27d23537e5a1b5751d2b0481ef172f58b539/\",\n        \"imagenet1000_clsid_to_human.txt\",\n    ]\n)\nsynset_name = \"imagenet1000_clsid_to_human.txt\"\nsynset_path = download_testdata(synset_url, synset_name, module=\"data\")\nwith open(synset_path) as f:\n    synset = eval(f.read())\n# You should see the following result: Top [...]
       ]
     }
   ],
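
(The BGR flip and layout transpose in the preprocessing cell above are easy to misread; a tiny standalone check of what `[:, :, ::-1]` and the transpose actually do:)

    import numpy as np

    img = np.arange(2 * 2 * 3).reshape(2, 2, 3)      # toy HWC "image"
    img_bgr = img[:, :, ::-1]                        # RGB -> BGR channel reversal
    x = np.transpose(img_bgr, (2, 0, 1))[np.newaxis, :]
    assert x.shape == (1, 3, 2, 2)                   # NCHW with a batch axis of one
    assert img_bgr[0, 0, 0] == img[0, 0, 2]          # blue channel now leads
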
diff --git a/docs/_downloads/30015213c2882505d466865fafaed52d/from_caffe2.ipynb b/docs/_downloads/30015213c2882505d466865fafaed52d/from_caffe2.ipynb
index a097a19..e9447da 100644
--- a/docs/_downloads/30015213c2882505d466865fafaed52d/from_caffe2.ipynb
+++ b/docs/_downloads/30015213c2882505d466865fafaed52d/from_caffe2.ipynb
@@ -33,7 +33,7 @@
       },
       "outputs": [],
       "source": [
-        "from caffe2.python.models.download import ModelDownloader\nmf = ModelDownloader()\n\nclass Model:\n    def __init__(self, model_name):\n        self.init_net, self.predict_net, self.value_info = mf.get_c2_model(model_name)\n\nresnet50 = Model('resnet50')"
+        "from caffe2.python.models.download import ModelDownloader\n\nmf = ModelDownloader()\n\n\nclass Model:\n    def __init__(self, model_name):\n        self.init_net, self.predict_net, self.value_info = mf.get_c2_model(model_name)\n\n\nresnet50 = Model(\"resnet50\")"
       ]
     },
     {
@@ -51,7 +51,7 @@
       },
       "outputs": [],
       "source": [
-        "from tvm.contrib.download import download_testdata\nfrom PIL import Image\nfrom matplotlib import pyplot as plt\nimport numpy as np\nimg_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'\nimg_path = download_testdata(img_url, 'cat.png', module='data')\nimg = Image.open(img_path).resize((224, 224))\nplt.imshow(img)\nplt.show()\n# input preprocess\ndef transform_image(image):\n    image = np.array(image) - np.array([123., 117., 104.])\n    image /= np.arra [...]
+        "from tvm.contrib.download import download_testdata\nfrom PIL import Image\nfrom matplotlib import pyplot as plt\nimport numpy as np\n\nimg_url = \"https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true\"\nimg_path = download_testdata(img_url, \"cat.png\", module=\"data\")\nimg = Image.open(img_path).resize((224, 224))\nplt.imshow(img)\nplt.show()\n# input preprocess\ndef transform_image(image):\n    image = np.array(image) - np.array([123.0, 117.0, 104.0])\n    image [...]
       ]
     },
     {
@@ -69,7 +69,7 @@
       },
       "outputs": [],
       "source": [
-        "# Caffe2 input tensor name, shape and type\ninput_name = resnet50.predict_net.op[0].input[0]\nshape_dict = {input_name: data.shape}\ndtype_dict = {input_name: data.dtype}\n\n# parse Caffe2 model and convert into Relay computation graph\nfrom tvm import relay, transform\nmod, params = relay.frontend.from_caffe2(resnet50.init_net, resnet50.predict_net, shape_dict, dtype_dict)\n\n# compile the model\n# target x86 CPU\ntarget = 'llvm'\nwith transform.PassContext(opt_level=3):\n    l [...]
+        "# Caffe2 input tensor name, shape and type\ninput_name = resnet50.predict_net.op[0].input[0]\nshape_dict = {input_name: data.shape}\ndtype_dict = {input_name: data.dtype}\n\n# parse Caffe2 model and convert into Relay computation graph\nfrom tvm import relay, transform\n\nmod, params = relay.frontend.from_caffe2(\n    resnet50.init_net, resnet50.predict_net, shape_dict, dtype_dict\n)\n\n# compile the model\n# target x86 CPU\ntarget = \"llvm\"\nwith transform.PassContext(opt_leve [...]
       ]
     },
     {
@@ -87,7 +87,7 @@
       },
       "outputs": [],
       "source": [
-        "import tvm\nfrom tvm import te\nfrom tvm.contrib import graph_runtime\n# context x86 CPU, use tvm.gpu(0) if you run on GPU\nctx = tvm.cpu(0)\n# create a runtime executor module\nm = graph_runtime.GraphModule(lib['default'](ctx))\n# set inputs\nm.set_input(input_name, tvm.nd.array(data.astype('float32')))\n# execute\nm.run()\n# get outputs\ntvm_out = m.get_output(0)\ntop1_tvm = np.argmax(tvm_out.asnumpy()[0])"
+        "import tvm\nfrom tvm import te\nfrom tvm.contrib import graph_runtime\n\n# context x86 CPU, use tvm.gpu(0) if you run on GPU\nctx = tvm.cpu(0)\n# create a runtime executor module\nm = graph_runtime.GraphModule(lib[\"default\"](ctx))\n# set inputs\nm.set_input(input_name, tvm.nd.array(data.astype(\"float32\")))\n# execute\nm.run()\n# get outputs\ntvm_out = m.get_output(0)\ntop1_tvm = np.argmax(tvm_out.asnumpy()[0])"
       ]
     },
     {
@@ -105,7 +105,7 @@
       },
       "outputs": [],
       "source": [
-        "from caffe2.python import workspace\nsynset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',\n                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',\n                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',\n                      'imagenet1000_clsid_to_human.txt'])\nsynset_name = 'imagenet1000_clsid_to_human.txt'\nsynset_path = download_testdata(synset_url, synset_name, module='data')\nwith open(synset_path) as f:\n    synset = eval(f.read())\npr [...]
+        "from caffe2.python import workspace\n\nsynset_url = \"\".join(\n    [\n        \"https://gist.githubusercontent.com/zhreshold/\",\n        \"4d0b62f3d01426887599d4f7ede23ee5/raw/\",\n        \"596b27d23537e5a1b5751d2b0481ef172f58b539/\",\n        \"imagenet1000_clsid_to_human.txt\",\n    ]\n)\nsynset_name = \"imagenet1000_clsid_to_human.txt\"\nsynset_path = download_testdata(synset_url, synset_name, module=\"data\")\nwith open(synset_path) as f:\n    synset = eval(f.read())\npri [...]
       ]
     }
   ],
diff --git a/docs/_downloads/33a19782c8aaf9fc62e565c57df5caca/deploy_sparse.py b/docs/_downloads/33a19782c8aaf9fc62e565c57df5caca/deploy_sparse.py
index 11cd63c..dcf2fc4 100644
--- a/docs/_downloads/33a19782c8aaf9fc62e565c57df5caca/deploy_sparse.py
+++ b/docs/_downloads/33a19782c8aaf9fc62e565c57df5caca/deploy_sparse.py
@@ -132,9 +132,9 @@ def load_keras_model(module, name, seq_len, batch_size, report_runtime=True):
     dummy_input = tf.keras.Input(shape=[seq_len], batch_size=batch_size, dtype="int32")
     dummy_out = model(dummy_input)  # Propagate shapes through the keras model.
     if report_runtime:
-        np_input = np.random.uniform(
-            size=[batch_size, seq_len], low=0, high=seq_len
-        ).astype("int32")
+        np_input = np.random.uniform(size=[batch_size, seq_len], low=0, high=seq_len).astype(
+            "int32"
+        )
         start = time.time()
         repeats = 50
         for i in range(repeats):
@@ -180,12 +180,8 @@ def import_graphdef(
 ):
     abs_path = os.path.dirname(os.path.abspath(__file__))
     shape_dict = {"input_1": (batch_size, seq_len)}
-    relay_file = ("%s_%d_%d_%s" % (name, batch_size, seq_len, relay_file)).replace(
-        "/", "_"
-    )
-    relay_params = ("%s_%d_%d_%s" % (name, batch_size, seq_len, relay_params)).replace(
-        "/", "_"
-    )
+    relay_file = ("%s_%d_%d_%s" % (name, batch_size, seq_len, relay_file)).replace("/", "_")
+    relay_params = ("%s_%d_%d_%s" % (name, batch_size, seq_len, relay_params)).replace("/", "_")
     if os.path.exists(os.path.join(abs_path, relay_file)) and os.path.exists(
         os.path.join(abs_path, relay_params)
     ):
@@ -218,11 +214,9 @@ def run_relay_graph(mod, params, shape_dict, target, ctx):
     with relay.build_config(opt_level=3):
         lib = relay.build(mod, target=target, params=params)
     input_shape = shape_dict["input_1"]
-    dummy_data = np.random.uniform(size=input_shape, low=0, high=input_shape[1]).astype(
-        "int32"
-    )
+    dummy_data = np.random.uniform(size=input_shape, low=0, high=input_shape[1]).astype("int32")
 
-    m = graph_runtime.GraphModule(lib['default'](ctx))
+    m = graph_runtime.GraphModule(lib["default"](ctx))
     m.set_input(0, dummy_data)
     m.run()
     tvm_output = m.get_output(0)
@@ -252,7 +246,7 @@ def run_dense(mod, params, shape_dict, target, ctx):
 # into the parameters. This makes it easier to convert the matrix multiplies
 # to sparse versions. Next we apply `bsr_dense.convert` to identify all
 # weight matrices that can be sparse, and automatically replace them.
-# 
+#
 # The `bsr_dense.convert` call below is doing the heavy lifting of identifying
 # which weights in the model can be made sparse by checking if they are
 # at least `sparsity_threshold` percent sparse. If so, it converts those
@@ -269,9 +263,7 @@ def random_bsr_matrix(M, N, BS_R, BS_C, density, dtype="float32"):
     assert N % BS_C == 0
     nnz = int(density * M * N)
     num_blocks = int(nnz / (BS_R * BS_C)) + 1
-    candidate_blocks = np.asarray(
-        list(itertools.product(range(0, M, BS_R), range(0, N, BS_C)))
-    )
+    candidate_blocks = np.asarray(list(itertools.product(range(0, M, BS_R), range(0, N, BS_C))))
     assert candidate_blocks.shape[0] == M // BS_R * N // BS_C
     chosen_blocks = candidate_blocks[
         np.random.choice(candidate_blocks.shape[0], size=num_blocks, replace=False)
@@ -308,9 +300,7 @@ def random_sparse_bert_params(func, params, density, BS_R, BS_C):
 def run_sparse(mod, params, shape_dict, target, ctx, bs_r, sparsity, gen_weights):
     mod, params = ddo.simplify_fc_transpose.convert(mod["main"], params)
     if gen_weights:
-        params = random_sparse_bert_params(
-            mod, params, BS_R=bs_r, BS_C=1, density=1 - sparsity
-        )
+        params = random_sparse_bert_params(mod, params, BS_R=bs_r, BS_C=1, density=1 - sparsity)
     mod, params = ddo.bsr_dense.convert(mod, params, (bs_r, 1), sparsity_threshold=0.8)
     print("Block Sparse Model with {blocksize}x1 blocks:".format(blocksize=bs_r))
     return run_relay_graph(mod, params, shape_dict, target, ctx)
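
(For reference, the conversion pipeline these helpers implement runs in two steps. A sketch, assuming `mod` and `params` as returned by `import_graphdef` and an illustrative 16x1 block shape:)

    from tvm.relay import data_dep_optimization as ddo  # the script's `ddo` alias

    # Fold transposes into the dense weights so they can be matched for sparsity.
    mod, params = ddo.simplify_fc_transpose.convert(mod["main"], params)
    # Rewrite every weight that is at least 80% sparse into 16x1 block-sparse (BSR) form.
    mod, params = ddo.bsr_dense.convert(mod, params, (16, 1), sparsity_threshold=0.8)

(Weights below the threshold keep the dense kernel, so partially sparse models still compile.)
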
diff --git a/docs/_downloads/37c76200603adf82ebeffc23bdef8d31/tensor_expr_get_started.py b/docs/_downloads/37c76200603adf82ebeffc23bdef8d31/tensor_expr_get_started.py
index d31dc1e..76e0262 100644
--- a/docs/_downloads/37c76200603adf82ebeffc23bdef8d31/tensor_expr_get_started.py
+++ b/docs/_downloads/37c76200603adf82ebeffc23bdef8d31/tensor_expr_get_started.py
@@ -35,9 +35,9 @@ import numpy as np
 
 # Global declarations of environment.
 
-tgt_host="llvm"
+tgt_host = "llvm"
 # Change it to the respective GPU target if GPU is enabled, e.g. cuda, opencl, rocm
-tgt="cuda"
+tgt = "cuda"
 
 ######################################################################
 # Vector Add Example
@@ -66,8 +66,8 @@ tgt="cuda"
 # the computation should be done.
 #
 n = te.var("n")
-A = te.placeholder((n,), name='A')
-B = te.placeholder((n,), name='B')
+A = te.placeholder((n,), name="A")
+B = te.placeholder((n,), name="B")
 C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
 print(type(C))
 
@@ -116,9 +116,9 @@ bx, tx = s[C].split(C.op.axis[0], factor=64)
 # compute grid. These are GPU-specific constructs that allow us
 # to generate code that runs on GPU.
 #
-if tgt == "cuda" or tgt == "rocm" or tgt.startswith('opencl'):
-  s[C].bind(bx, te.thread_axis("blockIdx.x"))
-  s[C].bind(tx, te.thread_axis("threadIdx.x"))
+if tgt == "cuda" or tgt == "rocm" or tgt.startswith("opencl"):
+    s[C].bind(bx, te.thread_axis("blockIdx.x"))
+    s[C].bind(tx, te.thread_axis("threadIdx.x"))
 
 ######################################################################
 # Compilation
@@ -171,7 +171,7 @@ tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
 #
 # The following code fetches the device module and prints the device code.
 #
-if tgt == "cuda" or tgt == "rocm" or tgt.startswith('opencl'):
+if tgt == "cuda" or tgt == "rocm" or tgt.startswith("opencl"):
     dev_module = fadd.imported_modules[0]
     print("-----GPU code-----")
     print(dev_module.get_source())
@@ -217,7 +217,7 @@ if tgt == "cuda":
     fadd.imported_modules[0].save(temp.relpath("myadd.ptx"))
 if tgt == "rocm":
     fadd.imported_modules[0].save(temp.relpath("myadd.hsaco"))
-if tgt.startswith('opencl'):
+if tgt.startswith("opencl"):
     fadd.imported_modules[0].save(temp.relpath("myadd.cl"))
 cc.create_shared(temp.relpath("myadd.so"), [temp.relpath("myadd.o")])
 print(temp.listdir())
@@ -247,7 +247,7 @@ if tgt == "rocm":
     fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.hsaco"))
     fadd1.import_module(fadd1_dev)
 
-if tgt.startswith('opencl'):
+if tgt.startswith("opencl"):
     fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.cl"))
     fadd1.import_module(fadd1_dev)
 
@@ -289,7 +289,7 @@ tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
 # The following code blocks generate OpenCL code, create an array on an OpenCL
 # device, and verify the correctness of the code.
 #
-if tgt.startswith('opencl'):
+if tgt.startswith("opencl"):
     fadd_cl = tvm.build(s, [A, B, C], tgt, name="myadd")
     print("------opencl code------")
     print(fadd_cl.imported_modules[0].get_source())
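
(Since the GPU branches above add conditional noise, here is the same vector add reduced to the CPU-only path; a minimal sketch using only calls that appear in this file:)

    import numpy as np
    import tvm
    import tvm.testing
    from tvm import te

    n = te.var("n")
    A = te.placeholder((n,), name="A")
    B = te.placeholder((n,), name="B")
    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
    s = te.create_schedule(C.op)
    fadd = tvm.build(s, [A, B, C], "llvm", name="myadd")

    ctx = tvm.cpu(0)
    a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), ctx)
    c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), ctx)
    fadd(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
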
diff --git a/docs/_downloads/38606228ff7130fbd6473b7c0625ddcd/deploy_model_on_android.ipynb b/docs/_downloads/38606228ff7130fbd6473b7c0625ddcd/deploy_model_on_android.ipynb
index dc10671..57871d3 100644
--- a/docs/_downloads/38606228ff7130fbd6473b7c0625ddcd/deploy_model_on_android.ipynb
+++ b/docs/_downloads/38606228ff7130fbd6473b7c0625ddcd/deploy_model_on_android.ipynb
@@ -65,7 +65,7 @@
       },
       "outputs": [],
       "source": [
-        "keras.backend.clear_session()  # Destroys the current TF graph and creates a new one.\nweights_url = ''.join(['https://github.com/JonathanCMitchell/',\n                       'mobilenet_v2_keras/releases/download/v1.1/',\n                       'mobilenet_v2_weights_tf_dim_ordering_tf_kernels_0.5_224.h5'])\nweights_file = 'mobilenet_v2_weights.h5'\nweights_path = download_testdata(weights_url, weights_file, module='keras')\nkeras_mobilenet_v2 = MobileNetV2(alpha=0.5, include_top [...]
+        "keras.backend.clear_session()  # Destroys the current TF graph and creates a new one.\nweights_url = \"\".join(\n    [\n        \"https://github.com/JonathanCMitchell/\",\n        \"mobilenet_v2_keras/releases/download/v1.1/\",\n        \"mobilenet_v2_weights_tf_dim_ordering_tf_kernels_0.5_224.h5\",\n    ]\n)\nweights_file = \"mobilenet_v2_weights.h5\"\nweights_path = download_testdata(weights_url, weights_file, module=\"keras\")\nkeras_mobilenet_v2 = MobileNetV2(\n    alpha=0.5 [...]
       ]
     },
     {
@@ -83,7 +83,7 @@
       },
       "outputs": [],
       "source": [
-        "img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'\nimg_name = 'cat.png'\nimg_path = download_testdata(img_url, img_name, module='data')\nimage = Image.open(img_path).resize((224, 224))\ndtype = 'float32'\n\ndef transform_image(image):\n    image = np.array(image) - np.array([123., 117., 104.])\n    image /= np.array([58.395, 57.12, 57.375])\n    image = image.transpose((2, 0, 1))\n    image = image[np.newaxis, :]\n    return image\n\nx = transform_im [...]
+        "img_url = \"https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true\"\nimg_name = \"cat.png\"\nimg_path = download_testdata(img_url, img_name, module=\"data\")\nimage = Image.open(img_path).resize((224, 224))\ndtype = \"float32\"\n\n\ndef transform_image(image):\n    image = np.array(image) - np.array([123.0, 117.0, 104.0])\n    image /= np.array([58.395, 57.12, 57.375])\n    image = image.transpose((2, 0, 1))\n    image = image[np.newaxis, :]\n    return image\n\n\nx [...]
       ]
     },
     {
@@ -101,7 +101,7 @@
       },
       "outputs": [],
       "source": [
-        "synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',\n                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',\n                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',\n                      'imagenet1000_clsid_to_human.txt'])\nsynset_name = 'imagenet1000_clsid_to_human.txt'\nsynset_path = download_testdata(synset_url, synset_name, module='data')\nwith open(synset_path) as f:\n    synset = eval(f.read())"
+        "synset_url = \"\".join(\n    [\n        \"https://gist.githubusercontent.com/zhreshold/\",\n        \"4d0b62f3d01426887599d4f7ede23ee5/raw/\",\n        \"596b27d23537e5a1b5751d2b0481ef172f58b539/\",\n        \"imagenet1000_clsid_to_human.txt\",\n    ]\n)\nsynset_name = \"imagenet1000_clsid_to_human.txt\"\nsynset_path = download_testdata(synset_url, synset_name, module=\"data\")\nwith open(synset_path) as f:\n    synset = eval(f.read())"
       ]
     },
     {
@@ -119,7 +119,7 @@
       },
       "outputs": [],
       "source": [
-        "local_demo = True\n\n# by default on CPU target will execute.\n# select 'cpu', 'opencl' and 'vulkan'\ntest_target = 'cpu'\n\n# Change target configuration.\n# Run `adb shell cat /proc/cpuinfo` to find the arch.\narch = 'arm64'\ntarget = 'llvm -mtriple=%s-linux-android' % arch\ntarget_host = None\n\nif local_demo:\n    target_host = None\n    target = 'llvm'\nelif test_target == 'opencl':\n    target_host = target\n    target = 'opencl'\nelif test_target == 'vulkan':\n    target_ [...]
+        "local_demo = True\n\n# by default on CPU target will execute.\n# select 'cpu', 'opencl' and 'vulkan'\ntest_target = \"cpu\"\n\n# Change target configuration.\n# Run `adb shell cat /proc/cpuinfo` to find the arch.\narch = \"arm64\"\ntarget = \"llvm -mtriple=%s-linux-android\" % arch\ntarget_host = None\n\nif local_demo:\n    target_host = None\n    target = \"llvm\"\nelif test_target == \"opencl\":\n    target_host = target\n    target = \"opencl\"\nelif test_target == \"vulkan\" [...]
       ]
     },
     {
@@ -137,7 +137,7 @@
       },
       "outputs": [],
       "source": [
-        "tracker_host = os.environ.get('TVM_TRACKER_HOST', '0.0.0.0')\ntracker_port = int(os.environ.get('TVM_TRACKER_PORT', 9190))\nkey = 'android'\n\nif local_demo:\n    remote = rpc.LocalSession()\nelse:\n    tracker = rpc.connect_tracker(tracker_host, tracker_port)\n    # When running a heavy model, we should increase the `session_timeout`\n    remote = tracker.request(key, priority=0,\n                             session_timeout=60)\n\nif local_demo:\n    ctx = remote.cpu(0)\nelif  [...]
+        "tracker_host = os.environ.get(\"TVM_TRACKER_HOST\", \"0.0.0.0\")\ntracker_port = int(os.environ.get(\"TVM_TRACKER_PORT\", 9190))\nkey = \"android\"\n\nif local_demo:\n    remote = rpc.LocalSession()\nelse:\n    tracker = rpc.connect_tracker(tracker_host, tracker_port)\n    # When running a heavy model, we should increase the `session_timeout`\n    remote = tracker.request(key, priority=0, session_timeout=60)\n\nif local_demo:\n    ctx = remote.cpu(0)\nelif test_target == \"openc [...]
       ]
     },
     {
@@ -155,7 +155,7 @@
       },
       "outputs": [],
       "source": [
-        "# set input data\nmodule.set_input(input_name, tvm.nd.array(x.astype(dtype)))\n# run\nmodule.run()\n# get output\nout = module.get_output(0)\n\n# get top1 result\ntop1 = np.argmax(out.asnumpy())\nprint('TVM prediction top-1: {}'.format(synset[top1]))\n\nprint('Evaluate inference time cost...')\nftimer = module.module.time_evaluator('run', ctx, number=1, repeat=10)\nprof_res = np.array(ftimer().results) * 1000  # convert to millisecond\nprint('Mean inference time (std dev): %.2f  [...]
+        "# set input data\nmodule.set_input(input_name, tvm.nd.array(x.astype(dtype)))\n# run\nmodule.run()\n# get output\nout = module.get_output(0)\n\n# get top1 result\ntop1 = np.argmax(out.asnumpy())\nprint(\"TVM prediction top-1: {}\".format(synset[top1]))\n\nprint(\"Evaluate inference time cost...\")\nftimer = module.module.time_evaluator(\"run\", ctx, number=1, repeat=10)\nprof_res = np.array(ftimer().results) * 1000  # convert to millisecond\nprint(\"Mean inference time (std dev) [...]
       ]
     },
     {
diff --git a/docs/_downloads/3961fdfa7abff1b6dc996faa43b4c40f/deploy_model_on_android.py b/docs/_downloads/3961fdfa7abff1b6dc996faa43b4c40f/deploy_model_on_android.py
index fe16dac..3bf55d9 100644
--- a/docs/_downloads/3961fdfa7abff1b6dc996faa43b4c40f/deploy_model_on_android.py
+++ b/docs/_downloads/3961fdfa7abff1b6dc996faa43b4c40f/deploy_model_on_android.py
@@ -189,42 +189,53 @@ from tvm.contrib.download import download_testdata
 # ---------------------------
 # We load a pretrained MobileNetV2(alpha=0.5) classification model provided by keras.
 keras.backend.clear_session()  # Destroys the current TF graph and creates a new one.
-weights_url = ''.join(['https://github.com/JonathanCMitchell/',
-                       'mobilenet_v2_keras/releases/download/v1.1/',
-                       'mobilenet_v2_weights_tf_dim_ordering_tf_kernels_0.5_224.h5'])
-weights_file = 'mobilenet_v2_weights.h5'
-weights_path = download_testdata(weights_url, weights_file, module='keras')
-keras_mobilenet_v2 = MobileNetV2(alpha=0.5, include_top=True, weights=None,
-                                input_shape=(224, 224, 3), classes=1000)
+weights_url = "".join(
+    [
+        "https://github.com/JonathanCMitchell/",
+        "mobilenet_v2_keras/releases/download/v1.1/",
+        "mobilenet_v2_weights_tf_dim_ordering_tf_kernels_0.5_224.h5",
+    ]
+)
+weights_file = "mobilenet_v2_weights.h5"
+weights_path = download_testdata(weights_url, weights_file, module="keras")
+keras_mobilenet_v2 = MobileNetV2(
+    alpha=0.5, include_top=True, weights=None, input_shape=(224, 224, 3), classes=1000
+)
 keras_mobilenet_v2.load_weights(weights_path)
 
 ######################################################################
 # In order to test our model, here we download an image of a cat and
 # transform its format.
-img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
-img_name = 'cat.png'
-img_path = download_testdata(img_url, img_name, module='data')
+img_url = "https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true"
+img_name = "cat.png"
+img_path = download_testdata(img_url, img_name, module="data")
 image = Image.open(img_path).resize((224, 224))
-dtype = 'float32'
+dtype = "float32"
+
 
 def transform_image(image):
-    image = np.array(image) - np.array([123., 117., 104.])
+    image = np.array(image) - np.array([123.0, 117.0, 104.0])
     image /= np.array([58.395, 57.12, 57.375])
     image = image.transpose((2, 0, 1))
     image = image[np.newaxis, :]
     return image
 
+
 x = transform_image(image)
 
 ######################################################################
 # synset is used to transform the label from the ImageNet class number to
 # a word humans can understand.
-synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
-                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',
-                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',
-                      'imagenet1000_clsid_to_human.txt'])
-synset_name = 'imagenet1000_clsid_to_human.txt'
-synset_path = download_testdata(synset_url, synset_name, module='data')
+synset_url = "".join(
+    [
+        "https://gist.githubusercontent.com/zhreshold/",
+        "4d0b62f3d01426887599d4f7ede23ee5/raw/",
+        "596b27d23537e5a1b5751d2b0481ef172f58b539/",
+        "imagenet1000_clsid_to_human.txt",
+    ]
+)
+synset_name = "imagenet1000_clsid_to_human.txt"
+synset_path = download_testdata(synset_url, synset_name, module="data")
 with open(synset_path) as f:
     synset = eval(f.read())
 
@@ -241,31 +252,30 @@ local_demo = True
 
 # By default, the CPU target will execute.
 # Select from 'cpu', 'opencl' and 'vulkan'.
-test_target = 'cpu'
+test_target = "cpu"
 
 # Change target configuration.
 # Run `adb shell cat /proc/cpuinfo` to find the arch.
-arch = 'arm64'
-target = 'llvm -mtriple=%s-linux-android' % arch
+arch = "arm64"
+target = "llvm -mtriple=%s-linux-android" % arch
 target_host = None
 
 if local_demo:
     target_host = None
-    target = 'llvm'
-elif test_target == 'opencl':
+    target = "llvm"
+elif test_target == "opencl":
     target_host = target
-    target = 'opencl'
-elif test_target == 'vulkan':
+    target = "opencl"
+elif test_target == "vulkan":
     target_host = target
-    target = 'vulkan'
+    target = "vulkan"
 
-input_name = 'input_1'
+input_name = "input_1"
 shape_dict = {input_name: x.shape}
 mod, params = relay.frontend.from_keras(keras_mobilenet_v2, shape_dict)
 
 with tvm.transform.PassContext(opt_level=3):
-    lib = relay.build(mod, target=target,
-                      target_host=target_host, params=params)
+    lib = relay.build(mod, target=target, target_host=target_host, params=params)
 
 # After `relay.build`, you will get three return values: graph,
 # library and the new parameters, since we do some optimization that will
@@ -273,7 +283,7 @@ with tvm.transform.PassContext(opt_level=3):
 
 # Save the library at local temporary directory.
 tmp = util.tempdir()
-lib_fname = tmp.relpath('net.so')
+lib_fname = tmp.relpath("net.so")
 fcompile = ndk.create_shared if not local_demo else None
 lib.export_library(lib_fname, fcompile)
 
@@ -283,33 +293,32 @@ lib.export_library(lib_fname, fcompile)
 # With RPC, you can deploy the model remotely from your host machine
 # to the remote android device.
 
-tracker_host = os.environ.get('TVM_TRACKER_HOST', '0.0.0.0')
-tracker_port = int(os.environ.get('TVM_TRACKER_PORT', 9190))
-key = 'android'
+tracker_host = os.environ.get("TVM_TRACKER_HOST", "0.0.0.0")
+tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))
+key = "android"
 
 if local_demo:
     remote = rpc.LocalSession()
 else:
     tracker = rpc.connect_tracker(tracker_host, tracker_port)
     # When running a heavy model, we should increase the `session_timeout`
-    remote = tracker.request(key, priority=0,
-                             session_timeout=60)
+    remote = tracker.request(key, priority=0, session_timeout=60)
 
 if local_demo:
     ctx = remote.cpu(0)
-elif test_target == 'opencl':
+elif test_target == "opencl":
     ctx = remote.cl(0)
-elif test_target == 'vulkan':
+elif test_target == "vulkan":
     ctx = remote.vulkan(0)
 else:
     ctx = remote.cpu(0)
 
 # upload the library to the remote device and load it
 remote.upload(lib_fname)
-rlib = remote.load_module('net.so')
+rlib = remote.load_module("net.so")
 
 # create the remote runtime module
-module = runtime.GraphModule(rlib['default'](ctx))
+module = runtime.GraphModule(rlib["default"](ctx))
 
 ######################################################################
 # Execute on TVM
@@ -324,13 +333,12 @@ out = module.get_output(0)
 
 # get top1 result
 top1 = np.argmax(out.asnumpy())
-print('TVM prediction top-1: {}'.format(synset[top1]))
+print("TVM prediction top-1: {}".format(synset[top1]))
 
-print('Evaluate inference time cost...')
-ftimer = module.module.time_evaluator('run', ctx, number=1, repeat=10)
+print("Evaluate inference time cost...")
+ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10)
 prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
-print('Mean inference time (std dev): %.2f ms (%.2f ms)' % (np.mean(prof_res),
-                                                            np.std(prof_res)))
+print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
 
 ######################################################################
 # Sample Output
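
(The timing idiom at the end of this script is worth isolating. A sketch, assuming `module` and `ctx` are the remote GraphModule and context created above:)

    import numpy as np

    # Ten timed runs of one inference each; raise `number` to amortize launch
    # overhead on fast models.
    ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10)
    prof_res = np.array(ftimer().results) * 1000  # per-repeat times, in ms
    print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
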
diff --git a/docs/_downloads/440add54bfa6dfb4fa9ed5037187aa4c/opt_gemm.ipynb b/docs/_downloads/440add54bfa6dfb4fa9ed5037187aa4c/opt_gemm.ipynb
index 78de83b..d664a74 100644
--- a/docs/_downloads/440add54bfa6dfb4fa9ed5037187aa4c/opt_gemm.ipynb
+++ b/docs/_downloads/440add54bfa6dfb4fa9ed5037187aa4c/opt_gemm.ipynb
@@ -33,7 +33,7 @@
       },
       "outputs": [],
       "source": [
-        "import tvm\nfrom tvm import te\nimport numpy\nimport timeit\n\n# The size of the matrix\n# (M, K) x (K, N)\n# You are free to try out different shapes, sometimes TVM optimization outperforms numpy with MKL.\nM = 1024\nK = 1024\nN = 1024\n\n# The default tensor type in tvm\ndtype = \"float32\"\n\n# using Intel AVX2(Advanced Vector Extensions) ISA for SIMD\n# To get the best performance, please change the following line\n# to llvm -mcpu=core-avx2, or specific type of CPU you use\n [...]
+        "import tvm\nimport tvm.testing\nfrom tvm import te\nimport numpy\nimport timeit\n\n# The size of the matrix\n# (M, K) x (K, N)\n# You are free to try out different shapes, sometimes TVM optimization outperforms numpy with MKL.\nM = 1024\nK = 1024\nN = 1024\n\n# The default tensor type in tvm\ndtype = \"float32\"\n\n# using Intel AVX2(Advanced Vector Extensions) ISA for SIMD\n# To get the best performance, please change the following line\n# to llvm -mcpu=core-avx2, or specific t [...]
       ]
     },
     {
@@ -69,7 +69,7 @@
       },
       "outputs": [],
       "source": [
-        "bn = 32\ns = te.create_schedule(C.op)\n\n# Blocking by loop tiling\nxo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)\nk, = s[C].op.reduce_axis\nko, ki = s[C].split(k, factor=4)\n\n# Hoist reduction domain outside the blocking loop\ns[C].reorder(xo, yo, ko, ki, xi, yi)\n\nfunc = tvm.build(s, [A, B, C], target=target, name='mmult')\nassert func\n\nc = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)\nfunc(a, b, c)\ntvm.testing.assert_allclose(c.asnumpy(), answer [...]
+        "bn = 32\ns = te.create_schedule(C.op)\n\n# Blocking by loop tiling\nxo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)\n(k,) = s[C].op.reduce_axis\nko, ki = s[C].split(k, factor=4)\n\n# Hoist reduction domain outside the blocking loop\ns[C].reorder(xo, yo, ko, ki, xi, yi)\n\nfunc = tvm.build(s, [A, B, C], target=target, name=\"mmult\")\nassert func\n\nc = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)\nfunc(a, b, c)\ntvm.testing.assert_allclose(c.asnumpy(), answ [...]
       ]
     },
     {
@@ -105,7 +105,7 @@
       },
       "outputs": [],
       "source": [
-        "s = te.create_schedule(C.op)\nxo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)\nk, = s[C].op.reduce_axis\nko, ki = s[C].split(k, factor=4)\n\ns[C].reorder(xo, yo, ko, ki, xi, yi)\n\n# Vectorization\ns[C].vectorize(yi)\n\nfunc = tvm.build(s, [A, B, C], target=target, name='mmult')\nassert func\n\nc = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)\nfunc(a, b, c)\ntvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)\n\nevaluator = func.time_evaluator(fun [...]
+        "s = te.create_schedule(C.op)\nxo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)\n(k,) = s[C].op.reduce_axis\nko, ki = s[C].split(k, factor=4)\n\ns[C].reorder(xo, yo, ko, ki, xi, yi)\n\n# Vectorization\ns[C].vectorize(yi)\n\nfunc = tvm.build(s, [A, B, C], target=target, name=\"mmult\")\nassert func\n\nc = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)\nfunc(a, b, c)\ntvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)\n\nevaluator = func.time_evaluator(f [...]
       ]
     },
     {
@@ -141,7 +141,7 @@
       },
       "outputs": [],
       "source": [
-        "s = te.create_schedule(C.op)\nxo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)\nk, = s[C].op.reduce_axis\nko, ki = s[C].split(k, factor=4)\n\n# re-ordering\ns[C].reorder(xo, yo, ko, xi, ki, yi)\ns[C].vectorize(yi)\n\nfunc = tvm.build(s, [A, B, C], target=target, name='mmult')\nassert func\n\nc = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)\nfunc(a, b, c)\ntvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)\n\nevaluator = func.time_evaluator(func.en [...]
+        "s = te.create_schedule(C.op)\nxo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)\n(k,) = s[C].op.reduce_axis\nko, ki = s[C].split(k, factor=4)\n\n# re-ordering\ns[C].reorder(xo, yo, ko, xi, ki, yi)\ns[C].vectorize(yi)\n\nfunc = tvm.build(s, [A, B, C], target=target, name=\"mmult\")\nassert func\n\nc = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)\nfunc(a, b, c)\ntvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)\n\nevaluator = func.time_evaluator(func. [...]
       ]
     },
     {
@@ -184,7 +184,7 @@
       },
       "outputs": [],
       "source": [
-        "# We have to re-write the algorithm slightly.\npackedB = te.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name='packedB')\nC = te.compute((M, N),\n                lambda x, y: te.sum(A[x, k] * packedB[y // bn, k, tvm.tir.indexmod(y, bn)], axis=k),\n                name = 'C')\n\ns = te.create_schedule(C.op)\n\nxo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)\nk, = s[C].op.reduce_axis\nko, ki = s[C].split(k, factor=4)\n\ns[C].reorder(xo, yo, ko, xi, ki, [...]
+        "# We have to re-write the algorithm slightly.\npackedB = te.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name=\"packedB\")\nC = te.compute(\n    (M, N),\n    lambda x, y: te.sum(A[x, k] * packedB[y // bn, k, tvm.tir.indexmod(y, bn)], axis=k),\n    name=\"C\",\n)\n\ns = te.create_schedule(C.op)\n\nxo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)\n(k,) = s[C].op.reduce_axis\nko, ki = s[C].split(k, factor=4)\n\ns[C].reorder(xo, yo, ko, xi, ki, yi)\ns[C]. [...]
       ]
     },
     {
@@ -220,7 +220,7 @@
       },
       "outputs": [],
       "source": [
-        "s = te.create_schedule(C.op)\n\n# Allocate write cache\nCC = s.cache_write(C, 'global')\n\nxo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)\n\n# Write cache is computed at yo\ns[CC].compute_at(s[C], yo)\n\n# New inner axes\nxc, yc = s[CC].op.axis\n\nk, = s[CC].op.reduce_axis\nko, ki = s[CC].split(k, factor=4)\ns[CC].reorder(ko, xc, ki, yc)\ns[CC].unroll(ki)\ns[CC].vectorize(yc)\n\nx, y, z = s[packedB].op.axis\ns[packedB].vectorize(z)\ns[packedB].parallel(x)\n\nfunc [...]
+        "s = te.create_schedule(C.op)\n\n# Allocate write cache\nCC = s.cache_write(C, \"global\")\n\nxo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)\n\n# Write cache is computed at yo\ns[CC].compute_at(s[C], yo)\n\n# New inner axes\nxc, yc = s[CC].op.axis\n\n(k,) = s[CC].op.reduce_axis\nko, ki = s[CC].split(k, factor=4)\ns[CC].reorder(ko, xc, ki, yc)\ns[CC].unroll(ki)\ns[CC].vectorize(yc)\n\nx, y, z = s[packedB].op.axis\ns[packedB].vectorize(z)\ns[packedB].parallel(x)\n\n [...]
       ]
     },
     {
@@ -256,7 +256,7 @@
       },
       "outputs": [],
       "source": [
-        "s = te.create_schedule(C.op)\n\nCC = s.cache_write(C, 'global')\n\nxo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)\n\ns[CC].compute_at(s[C], yo)\n\nxc, yc = s[CC].op.axis\n\nk, = s[CC].op.reduce_axis\nko, ki = s[CC].split(k, factor=4)\ns[CC].reorder(ko, xc, ki, yc)\ns[CC].unroll(ki)\ns[CC].vectorize(yc)\n\n# parallel\ns[C].parallel(xo)\n\nx, y, z = s[packedB].op.axis\ns[packedB].vectorize(z)\ns[packedB].parallel(x)\n\nfunc = tvm.build(s, [A, B, C], target=target,  [...]
+        "s = te.create_schedule(C.op)\n\nCC = s.cache_write(C, \"global\")\n\nxo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)\n\ns[CC].compute_at(s[C], yo)\n\nxc, yc = s[CC].op.axis\n\n(k,) = s[CC].op.reduce_axis\nko, ki = s[CC].split(k, factor=4)\ns[CC].reorder(ko, xc, ki, yc)\ns[CC].unroll(ki)\ns[CC].vectorize(yc)\n\n# parallel\ns[C].parallel(xo)\n\nx, y, z = s[packedB].op.axis\ns[packedB].vectorize(z)\ns[packedB].parallel(x)\n\nfunc = tvm.build(s, [A, B, C], target=targ [...]
       ]
     },
     {
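
(The opt_gemm notebook repeats one scheduling core across its cells; shown once for reference, assuming `A`, `B`, `C`, the block size `bn`, and `te` are defined as in the first cells:)

    s = te.create_schedule(C.op)
    xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)  # bn x bn blocks
    (k,) = s[C].op.reduce_axis
    ko, ki = s[C].split(k, factor=4)      # chunk the reduction axis
    s[C].reorder(xo, yo, ko, ki, xi, yi)  # hoist the reduction outside each block
    s[C].vectorize(yi)                    # SIMD along the innermost block axis
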
diff --git a/docs/_downloads/48779ddff800bd9d4b8b3bd7ef8f054c/using_external_lib.py b/docs/_downloads/48779ddff800bd9d4b8b3bd7ef8f054c/using_external_lib.py
index bc47023..a150b68 100644
--- a/docs/_downloads/48779ddff800bd9d4b8b3bd7ef8f054c/using_external_lib.py
+++ b/docs/_downloads/48779ddff800bd9d4b8b3bd7ef8f054c/using_external_lib.py
@@ -54,7 +54,9 @@ bn_beta = relay.var("bn_beta")
 bn_mmean = relay.var("bn_mean")
 bn_mvar = relay.var("bn_var")
 
-simple_net = relay.nn.conv2d(data=data, weight=weight, kernel_size=(3,3), channels=out_channels, padding=(1, 1))
+simple_net = relay.nn.conv2d(
+    data=data, weight=weight, kernel_size=(3, 3), channels=out_channels, padding=(1, 1)
+)
 simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, bn_mvar)[0]
 simple_net = relay.nn.relu(simple_net)
 simple_net = relay.Function(relay.analysis.free_vars(simple_net), simple_net)
@@ -68,14 +70,15 @@ net, params = testing.create_workload(simple_net)
 # We build and run this network with the cuda backend, as usual.
 # By setting the logging level to DEBUG, the result of Relay graph compilation will be dumped as pseudocode.
 import logging
-logging.basicConfig(level=logging.DEBUG) # to dump TVM IR after fusion
+
+logging.basicConfig(level=logging.DEBUG)  # to dump TVM IR after fusion
 
 target = "cuda"
 lib = relay.build_module.build(net, target, params=params)
 
 ctx = tvm.context(target, 0)
 data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
-module = runtime.GraphModule(lib['default'](ctx))
+module = runtime.GraphModule(lib["default"](ctx))
 module.set_input("data", data)
 module.run()
 out_shape = (batch_size, out_channels, 224, 224)
@@ -491,12 +494,12 @@ out_cuda = out.asnumpy()
 # We can use cuDNN to replace convolution kernels with cuDNN ones.
 # To do that, all we need to do is append the option " -libs=cudnn" to the target string.
 net, params = testing.create_workload(simple_net)
-target = "cuda -libs=cudnn" # use cudnn for convolution
+target = "cuda -libs=cudnn"  # use cudnn for convolution
 lib = relay.build_module.build(net, target, params=params)
 
 ctx = tvm.context(target, 0)
 data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
-module = runtime.GraphModule(lib['default'](ctx))
+module = runtime.GraphModule(lib["default"](ctx))
 module.set_input("data", data)
 module.run()
 out_shape = (batch_size, out_channels, 224, 224)
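
(A quick consistency check between the two builds above; a sketch assuming `out_cuda` was saved from the plain "cuda" run and `out_shape` is as defined earlier:)

    import tvm.testing

    out = module.get_output(0, tvm.nd.empty(out_shape))
    out_cudnn = out.asnumpy()
    # The cuDNN-offloaded result should closely match the TVM-generated kernels.
    tvm.testing.assert_allclose(out_cuda, out_cudnn, rtol=1e-5)
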
diff --git a/docs/_downloads/48bd751ebaae08fce134e559f86a25cc/tune_relay_vta.ipynb b/docs/_downloads/48bd751ebaae08fce134e559f86a25cc/tune_relay_vta.ipynb
index 33a19f9..67f1eaa 100644
--- a/docs/_downloads/48bd751ebaae08fce134e559f86a25cc/tune_relay_vta.ipynb
+++ b/docs/_downloads/48bd751ebaae08fce134e559f86a25cc/tune_relay_vta.ipynb
@@ -51,7 +51,7 @@
       },
       "outputs": [],
       "source": [
-        "def compile_network(env, target, model, start_pack, stop_pack):\n\n    # Populate the shape and data type dictionary\n    dtype_dict = {\"data\": 'float32'}\n    shape_dict = {\"data\": (env.BATCH, 3, 224, 224)}\n\n    # Get off the shelf gluon model, and convert to relay\n    gluon_model = vision.get_model(model, pretrained=True)\n    mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)\n\n    # Update shape and type dictionary\n    shape_dict.update({k: v.shape for [...]
+        "def compile_network(env, target, model, start_pack, stop_pack):\n\n    # Populate the shape and data type dictionary\n    dtype_dict = {\"data\": \"float32\"}\n    shape_dict = {\"data\": (env.BATCH, 3, 224, 224)}\n\n    # Get off the shelf gluon model, and convert to relay\n    gluon_model = vision.get_model(model, pretrained=True)\n    mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)\n\n    # Update shape and type dictionary\n    shape_dict.update({k: v.shape f [...]
       ]
     },
     {
@@ -83,7 +83,7 @@
       },
       "outputs": [],
       "source": [
-        "# Tracker host and port can be set by your environment\ntracker_host = os.environ.get(\"TVM_TRACKER_HOST\", '0.0.0.0')\ntracker_port = int(os.environ.get(\"TVM_TRACKER_PORT\", 9190))\n\n# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file\nenv = vta.get_env()\n\n# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.\n# Set ``device=arm_cpu`` to run inference on the CPU\n# or ``device=vta`` to run inference on the FPG [...]
+        "# Tracker host and port can be set by your environment\ntracker_host = os.environ.get(\"TVM_TRACKER_HOST\", \"0.0.0.0\")\ntracker_port = int(os.environ.get(\"TVM_TRACKER_PORT\", 9190))\n\n# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file\nenv = vta.get_env()\n\n# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.\n# Set ``device=arm_cpu`` to run inference on the CPU\n# or ``device=vta`` to run inference on the F [...]
       ]
     },
     {
@@ -108,7 +108,7 @@
       },
       "outputs": [],
       "source": [
-        "# You can skip the implementation of this function for this tutorial.\ndef tune_tasks(tasks,\n               measure_option,\n               tuner='xgb',\n               n_trial=1000,\n               early_stopping=None,\n               log_filename='tuning.log',\n               use_transfer_learning=True):\n\n    # create tmp log file\n    tmp_log_file = log_filename + \".tmp\"\n    if os.path.exists(tmp_log_file):\n        os.remove(tmp_log_file)\n\n    for i, tsk in enumerate [...]
+        "# You can skip the implementation of this function for this tutorial.\ndef tune_tasks(\n    tasks,\n    measure_option,\n    tuner=\"xgb\",\n    n_trial=1000,\n    early_stopping=None,\n    log_filename=\"tuning.log\",\n    use_transfer_learning=True,\n):\n\n    # create tmp log file\n    tmp_log_file = log_filename + \".tmp\"\n    if os.path.exists(tmp_log_file):\n        os.remove(tmp_log_file)\n\n    for i, tsk in enumerate(reversed(tasks)):\n        prefix = \"[Task %2d/%2d] [...]
       ]
     },
     {
@@ -126,7 +126,7 @@
       },
       "outputs": [],
       "source": [
-        "def register_vta_tuning_tasks():\n    from tvm.autotvm.task import TaskExtractEnv\n\n    @tvm.te.tag_scope(tag=topi.tag.ELEMWISE)\n    def my_clip(x, a_min, a_max):\n        \"\"\"Unlike topi's current clip, put min and max into two stages.\"\"\"\n        const_min = tvm.tir.const(a_min, x.dtype)\n        const_max = tvm.tir.const(a_max, x.dtype)\n        x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name=\"clipA\")\n        x = te.compute(x.shape, lambda *i:  [...]
+        "def register_vta_tuning_tasks():\n    from tvm.autotvm.task import TaskExtractEnv\n\n    @tvm.te.tag_scope(tag=topi.tag.ELEMWISE)\n    def my_clip(x, a_min, a_max):\n        \"\"\"Unlike topi's current clip, put min and max into two stages.\"\"\"\n        const_min = tvm.tir.const(a_min, x.dtype)\n        const_max = tvm.tir.const(a_max, x.dtype)\n        x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name=\"clipA\")\n        x = te.compute(x.shape, lambda *i:  [...]
       ]
     },
     {
@@ -144,7 +144,7 @@
       },
       "outputs": [],
       "source": [
-        "def tune_and_evaluate(tuning_opt):\n\n    if env.TARGET != \"sim\":\n        # Get remote from fleet node\n        remote = autotvm.measure.request_remote(env.TARGET,\n                                                tracker_host,\n                                                tracker_port,\n                                                timeout=10000)\n        # Reconfigure the JIT runtime and FPGA.\n        vta.reconfig_runtime(remote)\n        vta.program_fpga(remote, bitst [...]
+        "def tune_and_evaluate(tuning_opt):\n\n    if env.TARGET != \"sim\":\n        # Get remote from fleet node\n        remote = autotvm.measure.request_remote(\n            env.TARGET, tracker_host, tracker_port, timeout=10000\n        )\n        # Reconfigure the JIT runtime and FPGA.\n        vta.reconfig_runtime(remote)\n        vta.program_fpga(remote, bitstream=None)\n    else:\n        # In simulation mode, host the RPC server locally.\n        remote = rpc.LocalSession()\n\n  [...]
       ]
     },
     {
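
(Once `tune_tasks` has written its log, the standard autotvm idiom for consuming it looks like the following sketch; `log_file`, `mod`, `params`, and `target` are assumed to be the names used elsewhere in this tutorial:)

    from tvm import autotvm

    # Pick the best measured schedule for each task while compiling.
    with autotvm.apply_history_best(log_file):
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build(mod, target=target, params=params)
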
diff --git a/docs/_downloads/4dd41316d6ea7ff2b6993aab65428bf5/cross_compilation_and_rpc.ipynb b/docs/_downloads/4dd41316d6ea7ff2b6993aab65428bf5/cross_compilation_and_rpc.ipynb
index 4b9c4df..e5d7707 100644
--- a/docs/_downloads/4dd41316d6ea7ff2b6993aab65428bf5/cross_compilation_and_rpc.ipynb
+++ b/docs/_downloads/4dd41316d6ea7ff2b6993aab65428bf5/cross_compilation_and_rpc.ipynb
@@ -47,7 +47,7 @@
       },
       "outputs": [],
       "source": [
-        "import numpy as np\n\nimport tvm\nfrom tvm import te\nfrom tvm import rpc\nfrom tvm.contrib import util\n\nn = tvm.runtime.convert(1024)\nA = te.placeholder((n,), name='A')\nB = te.compute((n,), lambda i: A[i] + 1.0, name='B')\ns = te.create_schedule(B.op)"
+        "import numpy as np\n\nimport tvm\nfrom tvm import te\nfrom tvm import rpc\nfrom tvm.contrib import util\n\nn = tvm.runtime.convert(1024)\nA = te.placeholder((n,), name=\"A\")\nB = te.compute((n,), lambda i: A[i] + 1.0, name=\"B\")\ns = te.create_schedule(B.op)"
       ]
     },
     {
@@ -65,7 +65,7 @@
       },
       "outputs": [],
       "source": [
-        "local_demo = True\n\nif local_demo:\n    target = 'llvm'\nelse:\n    target = 'llvm -mtriple=armv7l-linux-gnueabihf'\n\nfunc = tvm.build(s, [A, B], target=target, name='add_one')\n# save the lib at a local temp folder\ntemp = util.tempdir()\npath = temp.relpath('lib.tar')\nfunc.export_library(path)"
+        "local_demo = True\n\nif local_demo:\n    target = \"llvm\"\nelse:\n    target = \"llvm -mtriple=armv7l-linux-gnueabihf\"\n\nfunc = tvm.build(s, [A, B], target=target, name=\"add_one\")\n# save the lib at a local temp folder\ntemp = util.tempdir()\npath = temp.relpath(\"lib.tar\")\nfunc.export_library(path)"
       ]
     },
     {
@@ -90,7 +90,7 @@
       },
       "outputs": [],
       "source": [
-        "if local_demo:\n    remote = rpc.LocalSession()\nelse:\n    # The following is my environment, change this to the IP address of your target device\n    host = '10.77.1.162'\n    port = 9090\n    remote = rpc.connect(host, port)"
+        "if local_demo:\n    remote = rpc.LocalSession()\nelse:\n    # The following is my environment, change this to the IP address of your target device\n    host = \"10.77.1.162\"\n    port = 9090\n    remote = rpc.connect(host, port)"
       ]
     },
     {
@@ -108,7 +108,7 @@
       },
       "outputs": [],
       "source": [
-        "remote.upload(path)\nfunc = remote.load_module('lib.tar')\n\n# create arrays on the remote device\nctx = remote.cpu()\na = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)\nb = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)\n# the function will run on the remote device\nfunc(a, b)\nnp.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)"
+        "remote.upload(path)\nfunc = remote.load_module(\"lib.tar\")\n\n# create arrays on the remote device\nctx = remote.cpu()\na = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)\nb = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)\n# the function will run on the remote device\nfunc(a, b)\nnp.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)"
       ]
     },
     {
@@ -126,7 +126,7 @@
       },
       "outputs": [],
       "source": [
-        "time_f = func.time_evaluator(func.entry_name, ctx, number=10)\ncost = time_f(a, b).mean\nprint('%g secs/op' % cost)"
+        "time_f = func.time_evaluator(func.entry_name, ctx, number=10)\ncost = time_f(a, b).mean\nprint(\"%g secs/op\" % cost)"
       ]
     },
     {
@@ -144,7 +144,7 @@
       },
       "outputs": [],
       "source": [
-        "def run_opencl():\n    # NOTE: This is the setting for my rk3399 board. You need to modify\n    # them according to your environment.\n    target_host = \"llvm -mtriple=aarch64-linux-gnu\"\n    opencl_device_host = '10.77.1.145'\n    opencl_device_port = 9090\n\n    # create schedule for the above \"add one\" compute declaration\n    s = te.create_schedule(B.op)\n    xo, xi = s[B].split(B.op.axis[0], factor=32)\n    s[B].bind(xo, te.thread_axis(\"blockIdx.x\"))\n    s[B].bind(xi [...]
+        "def run_opencl():\n    # NOTE: This is the setting for my rk3399 board. You need to modify\n    # them according to your environment.\n    target_host = \"llvm -mtriple=aarch64-linux-gnu\"\n    opencl_device_host = \"10.77.1.145\"\n    opencl_device_port = 9090\n\n    # create schedule for the above \"add one\" compute declaration\n    s = te.create_schedule(B.op)\n    xo, xi = s[B].split(B.op.axis[0], factor=32)\n    s[B].bind(xo, te.thread_axis(\"blockIdx.x\"))\n    s[B].bind( [...]
       ]
     },
     {
diff --git a/docs/_downloads/4e9540fc014621d8d3bd14869c1ab227/scan.ipynb b/docs/_downloads/4e9540fc014621d8d3bd14869c1ab227/scan.ipynb
index f417386..a6506cf 100644
--- a/docs/_downloads/4e9540fc014621d8d3bd14869c1ab227/scan.ipynb
+++ b/docs/_downloads/4e9540fc014621d8d3bd14869c1ab227/scan.ipynb
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "m = te.var(\"m\")\nn = te.var(\"n\")\nX = te.placeholder((m, n), name=\"X\")\ns_state = te.placeholder((m, n))\ns_init = te.compute((1, n), lambda _, i: X[0, i])\ns_update = te.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i])\ns_scan = tvm.te.scan(s_init, s_update, s_state, inputs=[X])"
+        "m = te.var(\"m\")\nn = te.var(\"n\")\nX = te.placeholder((m, n), name=\"X\")\ns_state = te.placeholder((m, n))\ns_init = te.compute((1, n), lambda _, i: X[0, i])\ns_update = te.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i])\ns_scan = tvm.te.scan(s_init, s_update, s_state, inputs=[X])"
       ]
     },
     {
@@ -98,7 +98,7 @@
       },
       "outputs": [],
       "source": [
-        "m = te.var(\"m\")\nn = te.var(\"n\")\nX = te.placeholder((m, n), name=\"X\")\ns_state = te.placeholder((m, n))\ns_init = te.compute((1, n), lambda _, i: X[0, i])\ns_update_s1 = te.compute((m, n), lambda t, i: s_state[t-1, i] * 2, name=\"s1\")\ns_update_s2 = te.compute((m, n), lambda t, i: s_update_s1[t, i] + X[t, i], name=\"s2\")\ns_scan = tvm.te.scan(s_init, s_update_s2, s_state, inputs=[X])"
+        "m = te.var(\"m\")\nn = te.var(\"n\")\nX = te.placeholder((m, n), name=\"X\")\ns_state = te.placeholder((m, n))\ns_init = te.compute((1, n), lambda _, i: X[0, i])\ns_update_s1 = te.compute((m, n), lambda t, i: s_state[t - 1, i] * 2, name=\"s1\")\ns_update_s2 = te.compute((m, n), lambda t, i: s_update_s1[t, i] + X[t, i], name=\"s2\")\ns_scan = tvm.te.scan(s_init, s_update_s2, s_state, inputs=[X])"
       ]
     },
     {
@@ -134,7 +134,7 @@
       },
       "outputs": [],
       "source": [
-        "m = te.var(\"m\")\nn = te.var(\"n\")\nl = te.var(\"l\")\nX = te.placeholder((m, n), name=\"X\")\ns_state1 = te.placeholder((m, n))\ns_state2 = te.placeholder((m, l))\ns_init1 = te.compute((1, n), lambda _, i: X[0, i])\ns_init2 = te.compute((1, l), lambda _, i: 0.0)\ns_update1 = te.compute((m, n), lambda t, i: s_state1[t-1, i] + X[t, i])\ns_update2 = te.compute((m, l), lambda t, i: s_state2[t-1, i] + s_state1[t-1, 0])\ns_scan1, s_scan2 = tvm.te.scan([s_init1, s_init2],\n          [...]
+        "m = te.var(\"m\")\nn = te.var(\"n\")\nl = te.var(\"l\")\nX = te.placeholder((m, n), name=\"X\")\ns_state1 = te.placeholder((m, n))\ns_state2 = te.placeholder((m, l))\ns_init1 = te.compute((1, n), lambda _, i: X[0, i])\ns_init2 = te.compute((1, l), lambda _, i: 0.0)\ns_update1 = te.compute((m, n), lambda t, i: s_state1[t - 1, i] + X[t, i])\ns_update2 = te.compute((m, l), lambda t, i: s_state2[t - 1, i] + s_state1[t - 1, 0])\ns_scan1, s_scan2 = tvm.te.scan(\n    [s_init1, s_init2] [...]
       ]
     },
     {
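
(The first scan above is a row-wise cumulative sum; an end-to-end check on the CPU, assuming an llvm-enabled build:)

    import numpy as np
    import tvm
    from tvm import te

    m, n = te.var("m"), te.var("n")
    X = te.placeholder((m, n), name="X")
    s_state = te.placeholder((m, n))
    s_init = te.compute((1, n), lambda _, i: X[0, i])
    s_update = te.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i])
    s_scan = tvm.te.scan(s_init, s_update, s_state, inputs=[X])

    f = tvm.build(te.create_schedule(s_scan.op), [X, s_scan], "llvm")
    a = np.random.uniform(size=(4, 3)).astype("float32")
    out = tvm.nd.array(np.zeros((4, 3), dtype="float32"))
    f(tvm.nd.array(a), out)
    # Row t of the scan equals X[0] + ... + X[t], i.e. a cumulative sum over axis 0.
    np.testing.assert_allclose(out.asnumpy(), np.cumsum(a, axis=0), rtol=1e-5)
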
diff --git a/docs/_downloads/4f4a49a5483a0d0aa4af30f58c3c8664/deploy_quantized.ipynb b/docs/_downloads/4f4a49a5483a0d0aa4af30f58c3c8664/deploy_quantized.ipynb
index d387ac7..fe29d13 100644
--- a/docs/_downloads/4f4a49a5483a0d0aa4af30f58c3c8664/deploy_quantized.ipynb
+++ b/docs/_downloads/4f4a49a5483a0d0aa4af30f58c3c8664/deploy_quantized.ipynb
@@ -26,7 +26,7 @@
       },
       "outputs": [],
       "source": [
-        "import tvm\nfrom tvm import te\nfrom tvm import relay\nimport mxnet as mx\nfrom tvm.contrib.download import download_testdata\nfrom mxnet import gluon\nimport logging\nimport os\n\nbatch_size = 1\nmodel_name = \"resnet18_v1\"\ntarget = 'cuda'\nctx = tvm.context(target)"
+        "import tvm\nfrom tvm import te\nfrom tvm import relay\nimport mxnet as mx\nfrom tvm.contrib.download import download_testdata\nfrom mxnet import gluon\nimport logging\nimport os\n\nbatch_size = 1\nmodel_name = \"resnet18_v1\"\ntarget = \"cuda\"\nctx = tvm.context(target)"
       ]
     },
     {
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "calibration_rec = download_testdata(\n    'http://data.mxnet.io.s3-website-us-west-1.amazonaws.com/data/val_256_q90.rec',\n    'val_256_q90.rec')\n\ndef get_val_data(num_workers=4):\n    mean_rgb = [123.68, 116.779, 103.939]\n    std_rgb = [58.393, 57.12, 57.375]\n\n    def batch_fn(batch):\n        return batch.data[0].asnumpy(), batch.label[0].asnumpy()\n\n    img_size = 299 if model_name == 'inceptionv3' else 224\n    val_data = mx.io.ImageRecordIter(\n        path_imgrec=cal [...]
+        "calibration_rec = download_testdata(\n    \"http://data.mxnet.io.s3-website-us-west-1.amazonaws.com/data/val_256_q90.rec\",\n    \"val_256_q90.rec\",\n)\n\n\ndef get_val_data(num_workers=4):\n    mean_rgb = [123.68, 116.779, 103.939]\n    std_rgb = [58.393, 57.12, 57.375]\n\n    def batch_fn(batch):\n        return batch.data[0].asnumpy(), batch.label[0].asnumpy()\n\n    img_size = 299 if model_name == \"inceptionv3\" else 224\n    val_data = mx.io.ImageRecordIter(\n        path [...]
       ]
     },
     {
@@ -62,7 +62,7 @@
       },
       "outputs": [],
       "source": [
-        "calibration_samples = 10\n\ndef calibrate_dataset():\n    val_data, batch_fn = get_val_data()\n    val_data.reset()\n    for i, batch in enumerate(val_data):\n        if i * batch_size >= calibration_samples:\n            break\n        data, _ = batch_fn(batch)\n        yield {'data': data}"
+        "calibration_samples = 10\n\n\ndef calibrate_dataset():\n    val_data, batch_fn = get_val_data()\n    val_data.reset()\n    for i, batch in enumerate(val_data):\n        if i * batch_size >= calibration_samples:\n            break\n        data, _ = batch_fn(batch)\n        yield {\"data\": data}"
       ]
     },
     {
@@ -80,7 +80,7 @@
       },
       "outputs": [],
       "source": [
-        "def get_model():\n    gluon_model = gluon.model_zoo.vision.get_model(model_name, pretrained=True)\n    img_size = 299 if model_name == 'inceptionv3' else 224\n    data_shape = (batch_size, 3, img_size, img_size)\n    mod, params = relay.frontend.from_mxnet(gluon_model, {\"data\": data_shape})\n    return mod, params"
+        "def get_model():\n    gluon_model = gluon.model_zoo.vision.get_model(model_name, pretrained=True)\n    img_size = 299 if model_name == \"inceptionv3\" else 224\n    data_shape = (batch_size, 3, img_size, img_size)\n    mod, params = relay.frontend.from_mxnet(gluon_model, {\"data\": data_shape})\n    return mod, params"
       ]
     },
     {
@@ -98,7 +98,7 @@
       },
       "outputs": [],
       "source": [
-        "def quantize(mod, params, data_aware):\n    if data_aware:\n        with relay.quantize.qconfig(calibrate_mode='kl_divergence', weight_scale='max'):\n            mod = relay.quantize.quantize(mod, params, dataset=calibrate_dataset())\n    else:\n        with relay.quantize.qconfig(calibrate_mode='global_scale', global_scale=8.0):\n            mod = relay.quantize.quantize(mod, params)\n    return mod"
+        "def quantize(mod, params, data_aware):\n    if data_aware:\n        with relay.quantize.qconfig(calibrate_mode=\"kl_divergence\", weight_scale=\"max\"):\n            mod = relay.quantize.quantize(mod, params, dataset=calibrate_dataset())\n    else:\n        with relay.quantize.qconfig(calibrate_mode=\"global_scale\", global_scale=8.0):\n            mod = relay.quantize.quantize(mod, params)\n    return mod"
       ]
     },
     {
@@ -116,7 +116,7 @@
       },
       "outputs": [],
       "source": [
-        "def run_inference(mod):\n    executor = relay.create_executor('vm', mod, ctx, target)\n    val_data, batch_fn = get_val_data()\n    for i, batch in enumerate(val_data):\n        data, label = batch_fn(batch)\n        prediction = executor.evaluate()(data)\n        if i > 10:  # only run inference on a few samples in this tutorial\n            break\n\ndef main():\n    mod, params = get_model()\n    mod = quantize(mod, params, data_aware=True)\n    run_inference(mod)\n\nif __name [...]
+        "def run_inference(mod):\n    executor = relay.create_executor(\"vm\", mod, ctx, target)\n    val_data, batch_fn = get_val_data()\n    for i, batch in enumerate(val_data):\n        data, label = batch_fn(batch)\n        prediction = executor.evaluate()(data)\n        if i > 10:  # only run inference on a few samples in this tutorial\n            break\n\n\ndef main():\n    mod, params = get_model()\n    mod = quantize(mod, params, data_aware=True)\n    run_inference(mod)\n\n\nif  [...]
       ]
     }
   ],
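The cells above calibrate with real ImageNet data; a smaller sketch of the global-scale path in relay.quantize, assuming a toy single-conv2d module (the shapes and the random weight are illustrative, not from the tutorial):

    import numpy as np
    import tvm
    from tvm import relay

    data = relay.var("data", shape=(1, 3, 32, 32))
    weight = relay.var("weight", shape=(8, 3, 3, 3))
    net = relay.nn.conv2d(data, weight, padding=(1, 1))
    mod = tvm.IRModule.from_expr(relay.Function([data, weight], net))
    params = {"weight": tvm.nd.array(np.random.uniform(-1, 1, (8, 3, 3, 3)).astype("float32"))}

    # global_scale needs no calibration dataset, unlike the kl_divergence mode above
    with relay.quantize.qconfig(calibrate_mode="global_scale", global_scale=8.0):
        qmod = relay.quantize.quantize(mod, params)
    print(qmod)  # the printed module now contains quantized (int8/qnn-style) operators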
diff --git a/docs/_downloads/50b174352ccf0a0defcbd8e6b40145e2/from_tensorflow.py b/docs/_downloads/50b174352ccf0a0defcbd8e6b40145e2/from_tensorflow.py
index 10d505c..a3e8173 100644
--- a/docs/_downloads/50b174352ccf0a0defcbd8e6b40145e2/from_tensorflow.py
+++ b/docs/_downloads/50b174352ccf0a0defcbd8e6b40145e2/from_tensorflow.py
@@ -35,6 +35,7 @@ import os.path
 
 # Tensorflow imports
 import tensorflow as tf
+
 try:
     tf_compat_v1 = tf.compat.v1
 except ImportError:
@@ -44,10 +45,10 @@ except ImportError:
 import tvm.relay.testing.tf as tf_testing
 
 # Base location for model related files.
-repo_base = 'https://github.com/dmlc/web-data/raw/master/tensorflow/models/InceptionV1/'
+repo_base = "https://github.com/dmlc/web-data/raw/master/tensorflow/models/InceptionV1/"
 
 # Test image
-img_name = 'elephant-299.jpg'
+img_name = "elephant-299.jpg"
 image_url = os.path.join(repo_base, img_name)
 
 ######################################################################
@@ -56,25 +57,25 @@ image_url = os.path.join(repo_base, img_name)
 # Please refer to docs/frontend/tensorflow.md for more details on various models
 # from tensorflow.
 
-model_name = 'classify_image_graph_def-with_shapes.pb'
+model_name = "classify_image_graph_def-with_shapes.pb"
 model_url = os.path.join(repo_base, model_name)
 
 # Image label map
-map_proto = 'imagenet_2012_challenge_label_map_proto.pbtxt'
+map_proto = "imagenet_2012_challenge_label_map_proto.pbtxt"
 map_proto_url = os.path.join(repo_base, map_proto)
 
 # Human readable text for labels
-label_map = 'imagenet_synset_to_human_label_map.txt'
+label_map = "imagenet_synset_to_human_label_map.txt"
 label_map_url = os.path.join(repo_base, label_map)
 
 # Target settings
 # Use these commented settings to build for cuda.
-#target = 'cuda'
-#target_host = 'llvm'
-#layout = "NCHW"
-#ctx = tvm.gpu(0)
-target = 'llvm'
-target_host = 'llvm'
+# target = 'cuda'
+# target_host = 'llvm'
+# layout = "NCHW"
+# ctx = tvm.gpu(0)
+target = "llvm"
+target_host = "llvm"
 layout = None
 ctx = tvm.cpu(0)
 
@@ -84,25 +85,25 @@ ctx = tvm.cpu(0)
 # Download files listed above.
 from tvm.contrib.download import download_testdata
 
-img_path = download_testdata(image_url, img_name, module='data')
-model_path = download_testdata(model_url, model_name, module=['tf', 'InceptionV1'])
-map_proto_path = download_testdata(map_proto_url, map_proto, module='data')
-label_path = download_testdata(label_map_url, label_map, module='data')
+img_path = download_testdata(image_url, img_name, module="data")
+model_path = download_testdata(model_url, model_name, module=["tf", "InceptionV1"])
+map_proto_path = download_testdata(map_proto_url, map_proto, module="data")
+label_path = download_testdata(label_map_url, label_map, module="data")
 
 ######################################################################
 # Import model
 # ------------
 # Creates tensorflow graph definition from protobuf file.
 
-with tf_compat_v1.gfile.GFile(model_path, 'rb') as f:
+with tf_compat_v1.gfile.GFile(model_path, "rb") as f:
     graph_def = tf_compat_v1.GraphDef()
     graph_def.ParseFromString(f.read())
-    graph = tf.import_graph_def(graph_def, name='')
+    graph = tf.import_graph_def(graph_def, name="")
     # Call the utility to import the graph definition into default graph.
     graph_def = tf_testing.ProcessGraphDefParam(graph_def)
     # Add shapes to the graph.
     with tf_compat_v1.Session() as sess:
-        graph_def = tf_testing.AddShapesToGraphDef(sess, 'softmax')
+        graph_def = tf_testing.AddShapesToGraphDef(sess, "softmax")
 
 ######################################################################
 # Decode image
@@ -115,6 +116,7 @@ with tf_compat_v1.gfile.GFile(model_path, 'rb') as f:
 #
 
 from PIL import Image
+
 image = Image.open(img_path).resize((299, 299))
 
 x = np.array(image)
@@ -127,11 +129,9 @@ x = np.array(image)
 # Results:
 #   sym: relay expr for given tensorflow protobuf.
 #   params: params converted from tensorflow params (tensor protobuf).
-shape_dict = {'DecodeJpeg/contents': x.shape}
-dtype_dict = {'DecodeJpeg/contents': 'uint8'}
-mod, params = relay.frontend.from_tensorflow(graph_def,
-                                             layout=layout,
-                                             shape=shape_dict)
+shape_dict = {"DecodeJpeg/contents": x.shape}
+dtype_dict = {"DecodeJpeg/contents": "uint8"}
+mod, params = relay.frontend.from_tensorflow(graph_def, layout=layout, shape=shape_dict)
 
 print("Tensorflow protobuf imported to relay frontend.")
 ######################################################################
@@ -153,14 +153,15 @@ with tvm.transform.PassContext(opt_level=3):
 # Now we can try deploying the compiled model on the target.
 
 from tvm.contrib import graph_runtime
-dtype = 'uint8'
-m = graph_runtime.GraphModule(lib['default'](ctx))
+
+dtype = "uint8"
+m = graph_runtime.GraphModule(lib["default"](ctx))
 # set inputs
-m.set_input('DecodeJpeg/contents', tvm.nd.array(x.astype(dtype)))
+m.set_input("DecodeJpeg/contents", tvm.nd.array(x.astype(dtype)))
 # execute
 m.run()
 # get outputs
-tvm_output = m.get_output(0, tvm.nd.empty(((1, 1008)), 'float32'))
+tvm_output = m.get_output(0, tvm.nd.empty(((1, 1008)), "float32"))
 
 ######################################################################
 # Process the output
@@ -170,31 +171,32 @@ predictions = tvm_output.asnumpy()
 predictions = np.squeeze(predictions)
 
 # Creates node ID --> English string lookup.
-node_lookup = tf_testing.NodeLookup(label_lookup_path=map_proto_path,
-                                    uid_lookup_path=label_path)
+node_lookup = tf_testing.NodeLookup(label_lookup_path=map_proto_path, uid_lookup_path=label_path)
 
 # Print top 5 predictions from TVM output.
 top_k = predictions.argsort()[-5:][::-1]
 for node_id in top_k:
     human_string = node_lookup.id_to_string(node_id)
     score = predictions[node_id]
-    print('%s (score = %.5f)' % (human_string, score))
+    print("%s (score = %.5f)" % (human_string, score))
 
 ######################################################################
 # Inference on tensorflow
 # -----------------------
 # Run the corresponding model on tensorflow
 
+
 def create_graph():
     """Creates a graph from saved GraphDef file and returns a saver."""
     # Creates graph from saved graph_def.pb.
-    with tf_compat_v1.gfile.GFile(model_path, 'rb') as f:
+    with tf_compat_v1.gfile.GFile(model_path, "rb") as f:
         graph_def = tf_compat_v1.GraphDef()
         graph_def.ParseFromString(f.read())
-        graph = tf.import_graph_def(graph_def, name='')
+        graph = tf.import_graph_def(graph_def, name="")
         # Call the utility to import the graph definition into default graph.
         graph_def = tf_testing.ProcessGraphDefParam(graph_def)
 
+
 def run_inference_on_image(image):
     """Runs inference on an image.
 
@@ -208,29 +210,30 @@ def run_inference_on_image(image):
         Nothing
     """
     if not tf_compat_v1.gfile.Exists(image):
-        tf.logging.fatal('File does not exist %s', image)
-    image_data = tf_compat_v1.gfile.GFile(image, 'rb').read()
+        tf.logging.fatal("File does not exist %s", image)
+    image_data = tf_compat_v1.gfile.GFile(image, "rb").read()
 
     # Creates graph from saved GraphDef.
     create_graph()
 
     with tf_compat_v1.Session() as sess:
-        softmax_tensor = sess.graph.get_tensor_by_name('softmax:0')
-        predictions = sess.run(softmax_tensor,
-                               {'DecodeJpeg/contents:0': image_data})
+        softmax_tensor = sess.graph.get_tensor_by_name("softmax:0")
+        predictions = sess.run(softmax_tensor, {"DecodeJpeg/contents:0": image_data})
 
         predictions = np.squeeze(predictions)
 
         # Creates node ID --> English string lookup.
-        node_lookup = tf_testing.NodeLookup(label_lookup_path=map_proto_path,
-                                            uid_lookup_path=label_path)
+        node_lookup = tf_testing.NodeLookup(
+            label_lookup_path=map_proto_path, uid_lookup_path=label_path
+        )
 
         # Print top 5 predictions from tensorflow.
         top_k = predictions.argsort()[-5:][::-1]
-        print ("===== TENSORFLOW RESULTS =======")
+        print("===== TENSORFLOW RESULTS =======")
         for node_id in top_k:
             human_string = node_lookup.id_to_string(node_id)
             score = predictions[node_id]
-            print('%s (score = %.5f)' % (human_string, score))
+            print("%s (score = %.5f)" % (human_string, score))
+
 
 run_inference_on_image(img_path)
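The tutorial fetches all of its artifacts through tvm.contrib.download.download_testdata, which downloads once, caches the file locally, and returns the cached path; a minimal sketch reusing the same image URL as above:

    from tvm.contrib.download import download_testdata

    repo_base = "https://github.com/dmlc/web-data/raw/master/tensorflow/models/InceptionV1/"
    img_name = "elephant-299.jpg"
    # subsequent calls reuse the cached copy under the "data" module directory
    img_path = download_testdata(repo_base + img_name, img_name, module="data")
    print(img_path)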
diff --git a/docs/_downloads/578004d7db54caef0007609ae5540c72/intro_topi.ipynb b/docs/_downloads/578004d7db54caef0007609ae5540c72/intro_topi.ipynb
index b25dc7d..d2bf573 100644
--- a/docs/_downloads/578004d7db54caef0007609ae5540c72/intro_topi.ipynb
+++ b/docs/_downloads/578004d7db54caef0007609ae5540c72/intro_topi.ipynb
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "n = te.var(\"n\")\nm = te.var(\"m\")\nA = te.placeholder((n, m), name='A')\nk = te.reduce_axis((0, m), \"k\")\nB = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name=\"B\")\ns = te.create_schedule(B.op)"
+        "n = te.var(\"n\")\nm = te.var(\"m\")\nA = te.placeholder((n, m), name=\"A\")\nk = te.reduce_axis((0, m), \"k\")\nB = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name=\"B\")\ns = te.create_schedule(B.op)"
       ]
     },
     {
@@ -159,7 +159,7 @@
       },
       "outputs": [],
       "source": [
-        "func = tvm.build(sg, [a, b, g], 'cuda')\nctx = tvm.gpu(0)\na_np = np.random.uniform(size=(x, y, y)).astype(a.dtype)\nb_np = np.random.uniform(size=(y, y)).astype(b.dtype)\ng_np = np.sum(np.add(a_np + b_np, a_np * b_np) / 2.0)\na_nd = tvm.nd.array(a_np, ctx)\nb_nd = tvm.nd.array(b_np, ctx)\ng_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), ctx)\nfunc(a_nd, b_nd, g_nd)\ntvm.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-5)"
+        "func = tvm.build(sg, [a, b, g], \"cuda\")\nctx = tvm.gpu(0)\na_np = np.random.uniform(size=(x, y, y)).astype(a.dtype)\nb_np = np.random.uniform(size=(y, y)).astype(b.dtype)\ng_np = np.sum(np.add(a_np + b_np, a_np * b_np) / 2.0)\na_nd = tvm.nd.array(a_np, ctx)\nb_nd = tvm.nd.array(b_np, ctx)\ng_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), ctx)\nfunc(a_nd, b_nd, g_nd)\ntvm.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-5)"
       ]
     },
     {
@@ -177,7 +177,7 @@
       },
       "outputs": [],
       "source": [
-        "tarray = te.placeholder((512, 512), name=\"tarray\")\nsoftmax_topi = topi.nn.softmax(tarray)\nwith tvm.target.create(\"cuda\"):\n    sst = topi.cuda.schedule_softmax(softmax_topi)\n    print(tvm.lower(sst, [tarray], simple_mode=True))"
+        "tarray = te.placeholder((512, 512), name=\"tarray\")\nsoftmax_topi = topi.nn.softmax(tarray)\nwith tvm.target.Target(\"cuda\"):\n    sst = topi.cuda.schedule_softmax(softmax_topi)\n    print(tvm.lower(sst, [tarray], simple_mode=True))"
       ]
     },
     {
@@ -195,7 +195,7 @@
       },
       "outputs": [],
       "source": [
-        "data = te.placeholder((1, 3, 224, 224))\nkernel = te.placeholder((10, 3, 5, 5))\n\nwith tvm.target.create(\"cuda\"):\n    conv = topi.cuda.conv2d_nchw(data, kernel, 1, 2, 1)\n    out = topi.nn.relu(conv)\n    sconv = topi.cuda.schedule_conv2d_nchw([out])\n    print(tvm.lower(sconv, [data, kernel], simple_mode=True))"
+        "data = te.placeholder((1, 3, 224, 224))\nkernel = te.placeholder((10, 3, 5, 5))\n\nwith tvm.target.Target(\"cuda\"):\n    conv = topi.cuda.conv2d_nchw(data, kernel, 1, 2, 1)\n    out = topi.nn.relu(conv)\n    sconv = topi.cuda.schedule_conv2d_nchw([out])\n    print(tvm.lower(sconv, [data, kernel], simple_mode=True))"
       ]
     },
     {
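The first reformatted cell writes the row-sum reduction out by hand; TOPI provides the same operator in one call. A minimal CPU sketch (the (4, 5) size is illustrative):

    import numpy as np
    import tvm
    from tvm import te, topi

    n = te.var("n")
    m = te.var("m")
    A = te.placeholder((n, m), name="A")
    B = topi.sum(A, axis=1)  # equivalent to the manual te.reduce_axis/te.sum version

    s = te.create_schedule(B.op)
    f = tvm.build(s, [A, B], "llvm")
    a_np = np.random.uniform(size=(4, 5)).astype("float32")
    b_nd = tvm.nd.array(np.zeros(4, dtype="float32"))
    f(tvm.nd.array(a_np), b_nd)
    np.testing.assert_allclose(b_nd.asnumpy(), a_np.sum(axis=1), rtol=1e-5)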
diff --git a/docs/_downloads/5b32f1dc3e9e2fc5ac5be0918758b967/deploy_quantized.py b/docs/_downloads/5b32f1dc3e9e2fc5ac5be0918758b967/deploy_quantized.py
index 2586318..093bd73 100644
--- a/docs/_downloads/5b32f1dc3e9e2fc5ac5be0918758b967/deploy_quantized.py
+++ b/docs/_downloads/5b32f1dc3e9e2fc5ac5be0918758b967/deploy_quantized.py
@@ -38,7 +38,7 @@ import os
 
 batch_size = 1
 model_name = "resnet18_v1"
-target = 'cuda'
+target = "cuda"
 ctx = tvm.context(target)
 
 ###############################################################################
@@ -47,8 +47,10 @@ ctx = tvm.context(target)
 # We will demonstrate how to prepare the calibration dataset for quantization.
 # We first download the validation set of ImageNet and pre-process the dataset.
 calibration_rec = download_testdata(
-    'http://data.mxnet.io.s3-website-us-west-1.amazonaws.com/data/val_256_q90.rec',
-    'val_256_q90.rec')
+    "http://data.mxnet.io.s3-website-us-west-1.amazonaws.com/data/val_256_q90.rec",
+    "val_256_q90.rec",
+)
+
 
 def get_val_data(num_workers=4):
     mean_rgb = [123.68, 116.779, 103.939]
@@ -57,7 +59,7 @@ def get_val_data(num_workers=4):
     def batch_fn(batch):
         return batch.data[0].asnumpy(), batch.label[0].asnumpy()
 
-    img_size = 299 if model_name == 'inceptionv3' else 224
+    img_size = 299 if model_name == "inceptionv3" else 224
     val_data = mx.io.ImageRecordIter(
         path_imgrec=calibration_rec,
         preprocess_threads=num_workers,
@@ -82,6 +84,7 @@ def get_val_data(num_workers=4):
 
 calibration_samples = 10
 
+
 def calibrate_dataset():
     val_data, batch_fn = get_val_data()
     val_data.reset()
@@ -89,7 +92,7 @@ def calibrate_dataset():
         if i * batch_size >= calibration_samples:
             break
         data, _ = batch_fn(batch)
-        yield {'data': data}
+        yield {"data": data}
 
 
 ###############################################################################
@@ -98,7 +101,7 @@ def calibrate_dataset():
 # We use the Relay MXNet frontend to import a model from the Gluon model zoo.
 def get_model():
     gluon_model = gluon.model_zoo.vision.get_model(model_name, pretrained=True)
-    img_size = 299 if model_name == 'inceptionv3' else 224
+    img_size = 299 if model_name == "inceptionv3" else 224
     data_shape = (batch_size, 3, img_size, img_size)
     mod, params = relay.frontend.from_mxnet(gluon_model, {"data": data_shape})
     return mod, params
@@ -127,12 +130,13 @@ def get_model():
 # Alternatively, we can also use pre-defined global scales. This saves the
 # calibration time, but the accuracy might be impacted.
 
+
 def quantize(mod, params, data_aware):
     if data_aware:
-        with relay.quantize.qconfig(calibrate_mode='kl_divergence', weight_scale='max'):
+        with relay.quantize.qconfig(calibrate_mode="kl_divergence", weight_scale="max"):
             mod = relay.quantize.quantize(mod, params, dataset=calibrate_dataset())
     else:
-        with relay.quantize.qconfig(calibrate_mode='global_scale', global_scale=8.0):
+        with relay.quantize.qconfig(calibrate_mode="global_scale", global_scale=8.0):
             mod = relay.quantize.quantize(mod, params)
     return mod
 
@@ -142,7 +146,7 @@ def quantize(mod, params, data_aware):
 # -------------
 # We create a Relay VM to build and execute the model.
 def run_inference(mod):
-    executor = relay.create_executor('vm', mod, ctx, target)
+    executor = relay.create_executor("vm", mod, ctx, target)
     val_data, batch_fn = get_val_data()
     for i, batch in enumerate(val_data):
         data, label = batch_fn(batch)
@@ -150,10 +154,12 @@ def run_inference(mod):
         if i > 10:  # only run inference on a few samples in this tutorial
             break
 
+
 def main():
     mod, params = get_model()
     mod = quantize(mod, params, data_aware=True)
     run_inference(mod)
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
diff --git a/docs/_downloads/5bd1bb9c6505ea40407fa19f01579414/reduction.py b/docs/_downloads/5bd1bb9c6505ea40407fa19f01579414/reduction.py
index cdfc94e..ecefc28 100644
--- a/docs/_downloads/5bd1bb9c6505ea40407fa19f01579414/reduction.py
+++ b/docs/_downloads/5bd1bb9c6505ea40407fa19f01579414/reduction.py
@@ -56,7 +56,7 @@ import numpy as np
 #
 n = te.var("n")
 m = te.var("m")
-A = te.placeholder((n, m), name='A')
+A = te.placeholder((n, m), name="A")
 k = te.reduce_axis((0, m), "k")
 B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
 
@@ -136,12 +136,11 @@ print(fcuda.imported_modules[0].get_source())
 # Verify the correctness of result kernel by comparing it to numpy.
 #
 nn = 128
-ctx  = tvm.gpu(0)
+ctx = tvm.gpu(0)
 a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), ctx)
 b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), ctx)
 fcuda(a, b)
-tvm.testing.assert_allclose(
-    b.asnumpy(),  np.sum(a.asnumpy(), axis=1), rtol=1e-4)
+tvm.testing.assert_allclose(b.asnumpy(), np.sum(a.asnumpy(), axis=1), rtol=1e-4)
 
 ######################################################################
 # Describe Convolution via 2D Reduction
@@ -149,15 +148,16 @@ tvm.testing.assert_allclose(
 # In TVM, we can describe convolution via 2D reduction in a simple way.
 # Here is an example of 2D convolution with filter size = [3, 3] and strides = [1, 1].
 #
-n = te.var('n')
-Input = te.placeholder((n, n), name='Input')
-Filter = te.placeholder((3, 3), name='Filter')
-di = te.reduce_axis((0, 3), name='di')
-dj = te.reduce_axis((0, 3), name='dj')
+n = te.var("n")
+Input = te.placeholder((n, n), name="Input")
+Filter = te.placeholder((3, 3), name="Filter")
+di = te.reduce_axis((0, 3), name="di")
+dj = te.reduce_axis((0, 3), name="dj")
 Output = te.compute(
     (n - 2, n - 2),
     lambda i, j: te.sum(Input[i + di, j + dj] * Filter[di, dj], axis=[di, dj]),
-    name='Output')
+    name="Output",
+)
 s = te.create_schedule(Output.op)
 print(tvm.lower(s, [Input, Filter, Output], simple_mode=True))
 
@@ -171,13 +171,12 @@ print(tvm.lower(s, [Input, Filter, Output], simple_mode=True))
 # commutative reduction operation by :any:`te.comm_reducer`.
 #
 
-n = te.var('n')
-m = te.var('m')
-product = te.comm_reducer(lambda x, y: x*y,
-    lambda t: tvm.tir.const(1, dtype=t), name="product")
-A = te.placeholder((n, m), name='A')
-k = te.reduce_axis((0, m), name='k')
-B = te.compute((n,), lambda i: product(A[i, k], axis=k), name='B')
+n = te.var("n")
+m = te.var("m")
+product = te.comm_reducer(lambda x, y: x * y, lambda t: tvm.tir.const(1, dtype=t), name="product")
+A = te.placeholder((n, m), name="A")
+k = te.reduce_axis((0, m), name="k")
+B = te.compute((n,), lambda i: product(A[i, k], axis=k), name="B")
 
 ######################################################################
 # .. note::
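The comm_reducer defined above builds and runs like any other reduction; a minimal CPU sketch with fixed sizes (4, 5) chosen for illustration:

    import numpy as np
    import tvm
    from tvm import te

    n, m = 4, 5
    product = te.comm_reducer(lambda x, y: x * y, lambda t: tvm.tir.const(1, dtype=t), name="product")
    A = te.placeholder((n, m), name="A")
    k = te.reduce_axis((0, m), name="k")
    B = te.compute((n,), lambda i: product(A[i, k], axis=k), name="B")

    s = te.create_schedule(B.op)
    f = tvm.build(s, [A, B], "llvm")
    a_np = np.random.uniform(1, 2, size=(n, m)).astype("float32")
    b_nd = tvm.nd.array(np.zeros(n, dtype="float32"))
    f(tvm.nd.array(a_np), b_nd)
    np.testing.assert_allclose(b_nd.asnumpy(), a_np.prod(axis=1), rtol=1e-4)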
diff --git a/docs/_downloads/5c443f88ea44ce77c5ccade429af6e74/deploy_prequantized_tflite.py b/docs/_downloads/5c443f88ea44ce77c5ccade429af6e74/deploy_prequantized_tflite.py
index 0e5f9af..52321b1 100644
--- a/docs/_downloads/5c443f88ea44ce77c5ccade429af6e74/deploy_prequantized_tflite.py
+++ b/docs/_downloads/5c443f88ea44ce77c5ccade429af6e74/deploy_prequantized_tflite.py
@@ -61,12 +61,15 @@ from tvm import relay
 # Download mobilenet V2 TFLite model provided by Google
 from tvm.contrib.download import download_testdata
 
-model_url = "https://storage.googleapis.com/download.tensorflow.org/models/" \
-             "tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz"
+model_url = (
+    "https://storage.googleapis.com/download.tensorflow.org/models/"
+    "tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz"
+)
 
 # Download model tar file and extract it to get mobilenet_v2_1.0_224.tflite
-model_path = download_testdata(model_url, "mobilenet_v2_1.0_224_quant.tgz",
-                               module=['tf', 'official'])
+model_path = download_testdata(
+    model_url, "mobilenet_v2_1.0_224_quant.tgz", module=["tf", "official"]
+)
 model_dir = os.path.dirname(model_path)
 
 
@@ -75,13 +78,15 @@ model_dir = os.path.dirname(model_path)
 # ----------------------------------------------
 def extract(path):
     import tarfile
+
     if path.endswith("tgz") or path.endswith("gz"):
         dir_path = os.path.dirname(path)
         tar = tarfile.open(path)
         tar.extractall(path=dir_path)
         tar.close()
     else:
-        raise RuntimeError('Could not decompress the file: ' + path)
+        raise RuntimeError("Could not decompress the file: " + path)
+
 
 extract(model_path)
 
@@ -95,15 +100,17 @@ extract(model_path)
 # --------------------------------
 def get_real_image(im_height, im_width):
     from PIL import Image
-    repo_base = 'https://github.com/dmlc/web-data/raw/master/tensorflow/models/InceptionV1/'
-    img_name = 'elephant-299.jpg'
+
+    repo_base = "https://github.com/dmlc/web-data/raw/master/tensorflow/models/InceptionV1/"
+    img_name = "elephant-299.jpg"
     image_url = os.path.join(repo_base, img_name)
-    img_path = download_testdata(image_url, img_name, module='data')
+    img_path = download_testdata(image_url, img_name, module="data")
     image = Image.open(img_path).resize((im_height, im_width))
-    x = np.array(image).astype('uint8')
+    x = np.array(image).astype("uint8")
     data = np.reshape(x, (1, im_height, im_width, 3))
     return data
 
+
 data = get_real_image(224, 224)
 
 ######################################################################
@@ -118,9 +125,11 @@ tflite_model_buf = open(tflite_model_file, "rb").read()
 # Get TFLite model from buffer
 try:
     import tflite
+
     tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
 except AttributeError:
     import tflite.Model
+
     tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
 
 ###############################################################################
@@ -143,7 +152,7 @@ def run_tflite_model(tflite_model_buf, input_data):
     # set input
     assert len(input_data) == len(input_details)
     for i in range(len(input_details)):
-        interpreter.set_tensor(input_details[i]['index'], input_data[i])
+        interpreter.set_tensor(input_details[i]["index"], input_data[i])
 
     # Run
     interpreter.invoke()
@@ -151,16 +160,18 @@ def run_tflite_model(tflite_model_buf, input_data):
     # get output
     tflite_output = list()
     for i in range(len(output_details)):
-        tflite_output.append(interpreter.get_tensor(output_details[i]['index']))
+        tflite_output.append(interpreter.get_tensor(output_details[i]["index"]))
 
     return tflite_output
 
+
 ###############################################################################
 # Let's run the TVM-compiled pre-quantized model inference and get the TVM prediction.
 def run_tvm(lib):
     from tvm.contrib import graph_runtime
-    rt_mod = graph_runtime.GraphModule(lib['default'](tvm.cpu(0)))
-    rt_mod.set_input('input', data)
+
+    rt_mod = graph_runtime.GraphModule(lib["default"](tvm.cpu(0)))
+    rt_mod.set_input("input", data)
     rt_mod.run()
     tvm_res = rt_mod.get_output(0).asnumpy()
     tvm_pred = np.squeeze(tvm_res).argsort()[-5:][::-1]
@@ -185,18 +196,16 @@ tflite_pred = np.squeeze(tflite_res).argsort()[-5:][::-1]
 # frontend parser call for a pre-quantized model is exactly the same as the call for an FP32
 # model. We encourage you to remove the comment from print(mod) and inspect the Relay module. You
 # will see many QNN operators, such as Requantize, Quantize and QNN Conv2D.
-dtype_dict = {'input': data.dtype.name}
-shape_dict = {'input': data.shape}
+dtype_dict = {"input": data.dtype.name}
+shape_dict = {"input": data.shape}
 
-mod, params = relay.frontend.from_tflite(tflite_model,
-                                         shape_dict=shape_dict,
-                                         dtype_dict=dtype_dict)
+mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict)
 # print(mod)
 
 ###############################################################################
 # Let's now compile the Relay module. We use the "llvm" target here. Please replace it with the
 # target platform that you are interested in.
-target = 'llvm'
+target = "llvm"
 with tvm.transform.PassContext(opt_level=3):
     lib = relay.build_module.build(mod, target=target, params=params)
 
diff --git a/docs/_downloads/5df1a8bfe653027789c10728e74a65c0/intrin_math.ipynb b/docs/_downloads/5df1a8bfe653027789c10728e74a65c0/intrin_math.ipynb
index 19c1b72..15d5c32 100644
--- a/docs/_downloads/5df1a8bfe653027789c10728e74a65c0/intrin_math.ipynb
+++ b/docs/_downloads/5df1a8bfe653027789c10728e74a65c0/intrin_math.ipynb
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "n = te.var(\"n\")\nA = te.placeholder((n,), name='A')\nB = te.compute(A.shape,\n                lambda i: tvm.tir.call_pure_extern(\"float32\", \"__expf\", A[i]),\n                name=\"B\")\ns = te.create_schedule(B.op)\nnum_thread = 64\nbx, tx = s[B].split(B.op.axis[0], factor=num_thread)\ns[B].bind(bx, te.thread_axis(\"blockIdx.x\"))\ns[B].bind(tx, te.thread_axis(\"threadIdx.x\"))\nf = tvm.build(s, [A, B], \"cuda\", name=\"myexp\")\nprint(f.imported_modules[0].get_source())"
+        "n = te.var(\"n\")\nA = te.placeholder((n,), name=\"A\")\nB = te.compute(A.shape, lambda i: tvm.tir.call_pure_extern(\"float32\", \"__expf\", A[i]), name=\"B\")\ns = te.create_schedule(B.op)\nnum_thread = 64\nbx, tx = s[B].split(B.op.axis[0], factor=num_thread)\ns[B].bind(bx, te.thread_axis(\"blockIdx.x\"))\ns[B].bind(tx, te.thread_axis(\"threadIdx.x\"))\nf = tvm.build(s, [A, B], \"cuda\", name=\"myexp\")\nprint(f.imported_modules[0].get_source())"
       ]
     },
     {
@@ -62,7 +62,7 @@
       },
       "outputs": [],
       "source": [
-        "n = te.var(\"n\")\nA = te.placeholder((n,), name='A')\nB = te.compute(A.shape, lambda i: te.exp(A[i]), name=\"B\")\ns = te.create_schedule(B.op)\nnum_thread = 64\nbx, tx = s[B].split(B.op.axis[0], factor=num_thread)\ns[B].bind(bx, te.thread_axis(\"blockIdx.x\"))\ns[B].bind(tx, te.thread_axis(\"threadIdx.x\"))\nfcuda = tvm.build(s, [A, B], \"cuda\", name=\"myexp\")\nprint(fcuda.imported_modules[0].get_source())"
+        "n = te.var(\"n\")\nA = te.placeholder((n,), name=\"A\")\nB = te.compute(A.shape, lambda i: te.exp(A[i]), name=\"B\")\ns = te.create_schedule(B.op)\nnum_thread = 64\nbx, tx = s[B].split(B.op.axis[0], factor=num_thread)\ns[B].bind(bx, te.thread_axis(\"blockIdx.x\"))\ns[B].bind(tx, te.thread_axis(\"threadIdx.x\"))\nfcuda = tvm.build(s, [A, B], \"cuda\", name=\"myexp\")\nprint(fcuda.imported_modules[0].get_source())"
       ]
     },
     {
@@ -134,7 +134,7 @@
       },
       "outputs": [],
       "source": [
-        "def mylog(x):\n    \"\"\"customized log intrinsic function\"\"\"\n    return tvm.tir.call_intrin(x.dtype, \"tir.mylog\", x)\n\n\ndef my_cuda_mylog_rule(op):\n    \"\"\"CUDA lowering rule for log\"\"\"\n    if op.dtype == \"float32\":\n        return tvm.tir.call_pure_extern(\"float32\", \"logf\", op.args[0])\n    elif op.dtype == \"float64\":\n        return tvm.tir.call_pure_extern(\"float64\", \"log\", op.args[0])\n    else:\n        return op\n\n# new op registration is trigg [...]
+        "def mylog(x):\n    \"\"\"customized log intrinsic function\"\"\"\n    return tvm.tir.call_intrin(x.dtype, \"tir.mylog\", x)\n\n\ndef my_cuda_mylog_rule(op):\n    \"\"\"CUDA lowering rule for log\"\"\"\n    if op.dtype == \"float32\":\n        return tvm.tir.call_pure_extern(\"float32\", \"logf\", op.args[0])\n    elif op.dtype == \"float64\":\n        return tvm.tir.call_pure_extern(\"float64\", \"log\", op.args[0])\n    else:\n        return op\n\n\n# new op registration is tri [...]
       ]
     },
     {
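The cells above bind the exp kernels to CUDA threads; the same te.exp intrinsic also lowers on CPU without any thread binding, as a minimal sketch:

    import numpy as np
    import tvm
    from tvm import te

    n = 1024
    A = te.placeholder((n,), name="A")
    B = te.compute(A.shape, lambda i: te.exp(A[i]), name="B")
    s = te.create_schedule(B.op)
    f = tvm.build(s, [A, B], "llvm")  # te.exp dispatches to the target's exp lowering rule
    a_np = np.random.uniform(size=n).astype("float32")
    b_nd = tvm.nd.array(np.zeros(n, dtype="float32"))
    f(tvm.nd.array(a_np), b_nd)
    np.testing.assert_allclose(b_nd.asnumpy(), np.exp(a_np), rtol=1e-5)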
diff --git a/docs/_downloads/612f9e42b0247df5c8ab277534e2af65/tune_relay_vta.py b/docs/_downloads/612f9e42b0247df5c8ab277534e2af65/tune_relay_vta.py
index 1184006..41fd04e 100644
--- a/docs/_downloads/612f9e42b0247df5c8ab277534e2af65/tune_relay_vta.py
+++ b/docs/_downloads/612f9e42b0247df5c8ab277534e2af65/tune_relay_vta.py
@@ -79,7 +79,7 @@ from vta.top import graph_pack
 def compile_network(env, target, model, start_pack, stop_pack):
 
     # Populate the shape and data type dictionary
-    dtype_dict = {"data": 'float32'}
+    dtype_dict = {"data": "float32"}
     shape_dict = {"data": (env.BATCH, 3, 224, 224)}
 
     # Get off the shelf gluon model, and convert to relay
@@ -99,12 +99,14 @@ def compile_network(env, target, model, start_pack, stop_pack):
     # Perform graph packing and constant folding for VTA target
     if target.device_name == "vta":
         assert env.BLOCK_IN == env.BLOCK_OUT
-        relay_prog = graph_pack(mod["main"],
-                                env.BATCH,
-                                env.BLOCK_OUT,
-                                env.WGT_WIDTH,
-                                start_name=start_pack,
-                                stop_name=stop_pack)
+        relay_prog = graph_pack(
+            mod["main"],
+            env.BATCH,
+            env.BLOCK_OUT,
+            env.WGT_WIDTH,
+            start_name=start_pack,
+            stop_name=stop_pack,
+        )
 
     return relay_prog, params
 
@@ -178,7 +180,7 @@ def compile_network(env, target, model, start_pack, stop_pack):
 # Here we use a Pynq-Z1 board as an example.
 
 # Tracker host and port can be set by your environment
-tracker_host = os.environ.get("TVM_TRACKER_HOST", '0.0.0.0')
+tracker_host = os.environ.get("TVM_TRACKER_HOST", "0.0.0.0")
 tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))
 
 # Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file
@@ -201,20 +203,20 @@ stop_pack = "nn.global_avg_pool2d"
 # Tuning option
 log_file = "%s.%s.log" % (device, network)
 tuning_option = {
-    'log_filename': log_file,
-
-    'tuner': 'random',
-    'n_trial': 1000,
-    'early_stopping': None,
-
-    'measure_option': autotvm.measure_option(
+    "log_filename": log_file,
+    "tuner": "random",
+    "n_trial": 1000,
+    "early_stopping": None,
+    "measure_option": autotvm.measure_option(
         builder=autotvm.LocalBuilder(),
-        runner=autotvm.RPCRunner(env.TARGET,
-                                 host=tracker_host,
-                                 port=tracker_port,
-                                 number=5,
-                                 timeout=60,
-                                 check_correctness=True),
+        runner=autotvm.RPCRunner(
+            env.TARGET,
+            host=tracker_host,
+            port=tracker_port,
+            number=5,
+            timeout=60,
+            check_correctness=True,
+        ),
     ),
 }
 
@@ -242,13 +244,15 @@ tuning_option = {
 
 
 # You can skip the implementation of this function for this tutorial.
-def tune_tasks(tasks,
-               measure_option,
-               tuner='xgb',
-               n_trial=1000,
-               early_stopping=None,
-               log_filename='tuning.log',
-               use_transfer_learning=True):
+def tune_tasks(
+    tasks,
+    measure_option,
+    tuner="xgb",
+    n_trial=1000,
+    early_stopping=None,
+    log_filename="tuning.log",
+    use_transfer_learning=True,
+):
 
     # create tmp log file
     tmp_log_file = log_filename + ".tmp"
@@ -259,15 +263,15 @@ def tune_tasks(tasks,
         prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
 
         # create tuner
-        if tuner == 'xgb' or tuner == 'xgb-rank':
-            tuner_obj = XGBTuner(tsk, loss_type='rank')
-        elif tuner == 'xgb_knob':
-            tuner_obj = XGBTuner(tsk, loss_type='rank', feature_type='knob')
-        elif tuner == 'ga':
+        if tuner == "xgb" or tuner == "xgb-rank":
+            tuner_obj = XGBTuner(tsk, loss_type="rank")
+        elif tuner == "xgb_knob":
+            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="knob")
+        elif tuner == "ga":
             tuner_obj = GATuner(tsk, pop_size=50)
-        elif tuner == 'random':
+        elif tuner == "random":
             tuner_obj = RandomTuner(tsk)
-        elif tuner == 'gridsearch':
+        elif tuner == "gridsearch":
             tuner_obj = GridSearchTuner(tsk)
         else:
             raise ValueError("Invalid tuner: " + tuner)
@@ -278,13 +282,15 @@ def tune_tasks(tasks,
 
         # do tuning
         tsk_trial = min(n_trial, len(tsk.config_space))
-        tuner_obj.tune(n_trial=tsk_trial,
-                       early_stopping=early_stopping,
-                       measure_option=measure_option,
-                       callbacks=[
-                           autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
-                           autotvm.callback.log_to_file(tmp_log_file)
-                       ])
+        tuner_obj.tune(
+            n_trial=tsk_trial,
+            early_stopping=early_stopping,
+            measure_option=measure_option,
+            callbacks=[
+                autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
+                autotvm.callback.log_to_file(tmp_log_file),
+            ],
+        )
 
     # pick best records to a cache file
     autotvm.record.pick_best(tmp_log_file, log_filename)
@@ -321,7 +327,7 @@ def register_vta_tuning_tasks():
             res = my_clip(res, 0, 127)
             res = topi.cast(res, "int8")
 
-        if tvm.target.Target.current().device_name == 'vta':
+        if tvm.target.Target.current().device_name == "vta":
             s = vta.top.schedule_conv2d_packed([res])
         else:
             s = te.create_schedule([res.op])
@@ -336,10 +342,9 @@ def tune_and_evaluate(tuning_opt):
 
     if env.TARGET != "sim":
         # Get remote from fleet node
-        remote = autotvm.measure.request_remote(env.TARGET,
-                                                tracker_host,
-                                                tracker_port,
-                                                timeout=10000)
+        remote = autotvm.measure.request_remote(
+            env.TARGET, tracker_host, tracker_port, timeout=10000
+        )
         # Reconfigure the JIT runtime and FPGA.
         vta.reconfig_runtime(remote)
         vta.program_fpga(remote, bitstream=None)
@@ -354,11 +359,13 @@ def tune_and_evaluate(tuning_opt):
     print("Extract tasks...")
     relay_prog, params = compile_network(env, target, network, start_pack, stop_pack)
     mod = tvm.IRModule.from_expr(relay_prog)
-    tasks = autotvm.task.extract_from_program(mod,
-                                              params=params,
-                                              ops=(relay.op.get("nn.conv2d"),),
-                                              target=target,
-                                              target_host=env.target_host)
+    tasks = autotvm.task.extract_from_program(
+        mod,
+        params=params,
+        ops=(relay.op.get("nn.conv2d"),),
+        target=target,
+        target_host=env.target_host,
+    )
 
     # filter out non-packed conv2d tasks
     tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks))
@@ -376,9 +383,21 @@ def tune_and_evaluate(tuning_opt):
         hkernel, wkernel = wgt[2], wgt[3]
         hstride, wstride = tsk.args[2][0], tsk.args[2][1]
         hpad, wpad = tsk.args[3][0], tsk.args[3][1]
-        print("({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})".format(
-            batch, height, width, in_filter, out_filter, hkernel, wkernel,
-            hpad, wpad, hstride, wstride))
+        print(
+            "({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})".format(
+                batch,
+                height,
+                width,
+                in_filter,
+                out_filter,
+                hkernel,
+                wkernel,
+                hpad,
+                wpad,
+                hstride,
+                wstride,
+            )
+        )
 
     # We do not run the tuning on our web server since it takes too long.
     # Comment the following line to run it by yourself.
@@ -394,17 +413,14 @@ def tune_and_evaluate(tuning_opt):
         print("Compile...")
         if target.device_name != "vta":
             with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
-                graph, lib, params = relay.build(relay_prog,
-                                                target=target,
-                                                params=params,
-                                                target_host=env.target_host)
+                lib = relay.build(
+                    relay_prog, target=target, params=params, target_host=env.target_host
+                )
         else:
             with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
-                graph, lib, params = relay.build(
-                    relay_prog,
-                    target=target,
-                    params=params,
-                    target_host=env.target_host)
+                lib = relay.build(
+                    relay_prog, target=target, params=params, target_host=env.target_host
+                )
 
         # Export library
         print("Upload...")
@@ -415,21 +431,21 @@ def tune_and_evaluate(tuning_opt):
 
         # Generate the graph runtime
         ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
-        m = graph_runtime.create(graph, lib, ctx)
+        m = graph_runtime.GraphModule(lib["default"](ctx))
 
         # upload parameters to device
-        image = tvm.nd.array(
-            (np.random.uniform(size=(1, 3, 224, 224))).astype('float32'))
-        m.set_input(**params)
-        m.set_input('data', image)
+        image = tvm.nd.array((np.random.uniform(size=(1, 3, 224, 224))).astype("float32"))
+        m.set_input("data", image)
 
         # evaluate
         print("Evaluate inference time cost...")
         timer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
         tcost = timer()
         prof_res = np.array(tcost.results) * 1000  # convert to millisecond
-        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
-              (np.mean(prof_res), np.std(prof_res)))
+        print(
+            "Mean inference time (std dev): %.2f ms (%.2f ms)"
+            % (np.mean(prof_res), np.std(prof_res))
+        )
 
 
 # Run the tuning and evaluate the results
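After tuning, the standard autotvm idiom is to compile under the best records from the log; a sketch under the assumption that a log file from the run above exists (the toy conv2d module and the log name are illustrative, not from the tutorial):

    import numpy as np
    import tvm
    from tvm import autotvm, relay

    data = relay.var("data", shape=(1, 3, 224, 224))
    weight = relay.var("weight", shape=(16, 3, 3, 3))
    func = relay.Function([data, weight], relay.nn.conv2d(data, weight, padding=(1, 1)))
    mod = tvm.IRModule.from_expr(func)
    params = {"weight": tvm.nd.array(np.zeros((16, 3, 3, 3), dtype="float32"))}

    log_file = "vta.resnet18_v1.log"  # hypothetical: produced by the tuning step above
    with autotvm.apply_history_best(log_file):
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build(mod, target="llvm", params=params)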
diff --git a/docs/_downloads/64a7765a4ac55f228cf82b8462944a61/schedule_primitives.ipynb b/docs/_downloads/64a7765a4ac55f228cf82b8462944a61/schedule_primitives.ipynb
index 771c9ba..2386da3 100644
--- a/docs/_downloads/64a7765a4ac55f228cf82b8462944a61/schedule_primitives.ipynb
+++ b/docs/_downloads/64a7765a4ac55f228cf82b8462944a61/schedule_primitives.ipynb
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "# declare some variables for use later\nn = te.var('n')\nm = te.var('m')"
+        "# declare some variables for use later\nn = te.var(\"n\")\nm = te.var(\"m\")"
       ]
     },
     {
@@ -62,7 +62,7 @@
       },
       "outputs": [],
       "source": [
-        "# declare a matrix element-wise multiply\nA = te.placeholder((m, n), name='A')\nB = te.placeholder((m, n), name='B')\nC = te.compute((m, n), lambda i, j: A[i, j] * B[i, j], name='C')\n\ns = te.create_schedule([C.op])\n# lower will transform the computation from definition to the real\n# callable function. With argument `simple_mode=True`, it will\n# return you a readable C like statement, we use it here to print the\n# schedule result.\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
+        "# declare a matrix element-wise multiply\nA = te.placeholder((m, n), name=\"A\")\nB = te.placeholder((m, n), name=\"B\")\nC = te.compute((m, n), lambda i, j: A[i, j] * B[i, j], name=\"C\")\n\ns = te.create_schedule([C.op])\n# lower will transform the computation from definition to the real\n# callable function. With argument `simple_mode=True`, it will\n# return you a readable C like statement, we use it here to print the\n# schedule result.\nprint(tvm.lower(s, [A, B, C], simple [...]
       ]
     },
     {
@@ -87,7 +87,7 @@
       },
       "outputs": [],
       "source": [
-        "A = te.placeholder((m,), name='A')\nB = te.compute((m,), lambda i: A[i]*2, name='B')\n\ns = te.create_schedule(B.op)\nxo, xi = s[B].split(B.op.axis[0], factor=32)\nprint(tvm.lower(s, [A, B], simple_mode=True))"
+        "A = te.placeholder((m,), name=\"A\")\nB = te.compute((m,), lambda i: A[i] * 2, name=\"B\")\n\ns = te.create_schedule(B.op)\nxo, xi = s[B].split(B.op.axis[0], factor=32)\nprint(tvm.lower(s, [A, B], simple_mode=True))"
       ]
     },
     {
@@ -105,7 +105,7 @@
       },
       "outputs": [],
       "source": [
-        "A = te.placeholder((m,), name='A')\nB = te.compute((m,), lambda i: A[i], name='B')\n\ns = te.create_schedule(B.op)\nbx, tx = s[B].split(B.op.axis[0], nparts=32)\nprint(tvm.lower(s, [A, B], simple_mode=True))"
+        "A = te.placeholder((m,), name=\"A\")\nB = te.compute((m,), lambda i: A[i], name=\"B\")\n\ns = te.create_schedule(B.op)\nbx, tx = s[B].split(B.op.axis[0], nparts=32)\nprint(tvm.lower(s, [A, B], simple_mode=True))"
       ]
     },
     {
@@ -123,7 +123,7 @@
       },
       "outputs": [],
       "source": [
-        "A = te.placeholder((m, n), name='A')\nB = te.compute((m, n), lambda i, j: A[i, j], name='B')\n\ns = te.create_schedule(B.op)\nxo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)\nprint(tvm.lower(s, [A, B], simple_mode=True))"
+        "A = te.placeholder((m, n), name=\"A\")\nB = te.compute((m, n), lambda i, j: A[i, j], name=\"B\")\n\ns = te.create_schedule(B.op)\nxo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)\nprint(tvm.lower(s, [A, B], simple_mode=True))"
       ]
     },
     {
@@ -141,7 +141,7 @@
       },
       "outputs": [],
       "source": [
-        "A = te.placeholder((m, n), name='A')\nB = te.compute((m, n), lambda i, j: A[i, j], name='B')\n\ns = te.create_schedule(B.op)\n# tile to four axises first: (i.outer, j.outer, i.inner, j.inner)\nxo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)\n# then fuse (i.inner, j.inner) into one axis: (i.inner.j.inner.fused)\nfused = s[B].fuse(xi, yi)\nprint(tvm.lower(s, [A, B], simple_mode=True))"
+        "A = te.placeholder((m, n), name=\"A\")\nB = te.compute((m, n), lambda i, j: A[i, j], name=\"B\")\n\ns = te.create_schedule(B.op)\n# tile to four axises first: (i.outer, j.outer, i.inner, j.inner)\nxo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)\n# then fuse (i.inner, j.inner) into one axis: (i.inner.j.inner.fused)\nfused = s[B].fuse(xi, yi)\nprint(tvm.lower(s, [A, B], simple_mode=True))"
       ]
     },
     {
@@ -159,7 +159,7 @@
       },
       "outputs": [],
       "source": [
-        "A = te.placeholder((m, n), name='A')\nB = te.compute((m, n), lambda i, j: A[i, j], name='B')\n\ns = te.create_schedule(B.op)\n# tile to four axises first: (i.outer, j.outer, i.inner, j.inner)\nxo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)\n# then reorder the axises: (i.inner, j.outer, i.outer, j.inner)\ns[B].reorder(xi, yo, xo, yi)\nprint(tvm.lower(s, [A, B], simple_mode=True))"
+        "A = te.placeholder((m, n), name=\"A\")\nB = te.compute((m, n), lambda i, j: A[i, j], name=\"B\")\n\ns = te.create_schedule(B.op)\n# tile to four axises first: (i.outer, j.outer, i.inner, j.inner)\nxo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)\n# then reorder the axises: (i.inner, j.outer, i.outer, j.inner)\ns[B].reorder(xi, yo, xo, yi)\nprint(tvm.lower(s, [A, B], simple_mode=True))"
       ]
     },
     {
@@ -177,7 +177,7 @@
       },
       "outputs": [],
       "source": [
-        "A = te.placeholder((n,), name='A')\nB = te.compute(A.shape, lambda i: A[i] * 2, name='B')\n\ns = te.create_schedule(B.op)\nbx, tx = s[B].split(B.op.axis[0], factor=64)\ns[B].bind(bx, te.thread_axis(\"blockIdx.x\"))\ns[B].bind(tx, te.thread_axis(\"threadIdx.x\"))\nprint(tvm.lower(s, [A, B], simple_mode=True))"
+        "A = te.placeholder((n,), name=\"A\")\nB = te.compute(A.shape, lambda i: A[i] * 2, name=\"B\")\n\ns = te.create_schedule(B.op)\nbx, tx = s[B].split(B.op.axis[0], factor=64)\ns[B].bind(bx, te.thread_axis(\"blockIdx.x\"))\ns[B].bind(tx, te.thread_axis(\"threadIdx.x\"))\nprint(tvm.lower(s, [A, B], simple_mode=True))"
       ]
     },
     {
@@ -195,7 +195,7 @@
       },
       "outputs": [],
       "source": [
-        "A = te.placeholder((m,), name='A')\nB = te.compute((m,), lambda i: A[i]+1, name='B')\nC = te.compute((m,), lambda i: B[i]*2, name='C')\n\ns = te.create_schedule(C.op)\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
+        "A = te.placeholder((m,), name=\"A\")\nB = te.compute((m,), lambda i: A[i] + 1, name=\"B\")\nC = te.compute((m,), lambda i: B[i] * 2, name=\"C\")\n\ns = te.create_schedule(C.op)\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
       ]
     },
     {
@@ -213,7 +213,7 @@
       },
       "outputs": [],
       "source": [
-        "A = te.placeholder((m,), name='A')\nB = te.compute((m,), lambda i: A[i]+1, name='B')\nC = te.compute((m,), lambda i: B[i]*2, name='C')\n\ns = te.create_schedule(C.op)\ns[B].compute_at(s[C], C.op.axis[0])\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
+        "A = te.placeholder((m,), name=\"A\")\nB = te.compute((m,), lambda i: A[i] + 1, name=\"B\")\nC = te.compute((m,), lambda i: B[i] * 2, name=\"C\")\n\ns = te.create_schedule(C.op)\ns[B].compute_at(s[C], C.op.axis[0])\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
       ]
     },
     {
@@ -231,7 +231,7 @@
       },
       "outputs": [],
       "source": [
-        "A = te.placeholder((m,), name='A')\nB = te.compute((m,), lambda i: A[i]+1, name='B')\nC = te.compute((m,), lambda i: B[i]*2, name='C')\n\ns = te.create_schedule(C.op)\ns[B].compute_inline()\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
+        "A = te.placeholder((m,), name=\"A\")\nB = te.compute((m,), lambda i: A[i] + 1, name=\"B\")\nC = te.compute((m,), lambda i: B[i] * 2, name=\"C\")\n\ns = te.create_schedule(C.op)\ns[B].compute_inline()\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
       ]
     },
     {
@@ -249,7 +249,7 @@
       },
       "outputs": [],
       "source": [
-        "A = te.placeholder((m,), name='A')\nB = te.compute((m,), lambda i: A[i]+1, name='B')\nC = te.compute((m,), lambda i: B[i]*2, name='C')\n\ns = te.create_schedule(C.op)\ns[B].compute_at(s[C], C.op.axis[0])\ns[B].compute_root()\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
+        "A = te.placeholder((m,), name=\"A\")\nB = te.compute((m,), lambda i: A[i] + 1, name=\"B\")\nC = te.compute((m,), lambda i: B[i] * 2, name=\"C\")\n\ns = te.create_schedule(C.op)\ns[B].compute_at(s[C], C.op.axis[0])\ns[B].compute_root()\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
       ]
     },
     {
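The compute_at cells above only print the lowered IR; the same schedule also builds and runs, as a minimal CPU sketch with a fixed length of 64:

    import numpy as np
    import tvm
    from tvm import te

    m = 64
    A = te.placeholder((m,), name="A")
    B = te.compute((m,), lambda i: A[i] + 1, name="B")
    C = te.compute((m,), lambda i: B[i] * 2, name="C")

    s = te.create_schedule(C.op)
    s[B].compute_at(s[C], C.op.axis[0])  # B is computed inside C's loop
    f = tvm.build(s, [A, C], "llvm")     # B becomes a local intermediate, not an argument
    a_np = np.random.uniform(size=m).astype("float32")
    c_nd = tvm.nd.array(np.zeros(m, dtype="float32"))
    f(tvm.nd.array(a_np), c_nd)
    np.testing.assert_allclose(c_nd.asnumpy(), (a_np + 1) * 2, rtol=1e-5)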
diff --git a/docs/_downloads/65bd9927a152de6eed3444185b24287f/tensorize.ipynb b/docs/_downloads/65bd9927a152de6eed3444185b24287f/tensorize.ipynb
index 494c364..87eb568 100644
--- a/docs/_downloads/65bd9927a152de6eed3444185b24287f/tensorize.ipynb
+++ b/docs/_downloads/65bd9927a152de6eed3444185b24287f/tensorize.ipynb
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "N, M, L = 1024, 512, 64\nA = te.placeholder((N, L), name='A')\nB = te.placeholder((M, L), name='B')\nk = te.reduce_axis((0, L), name='k')\nC = te.compute((N, M), lambda i, j:\n                te.sum(A[i, k] * B[j, k], axis=k), name='C')\ns = te.create_schedule(C.op)\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
+        "N, M, L = 1024, 512, 64\nA = te.placeholder((N, L), name=\"A\")\nB = te.placeholder((M, L), name=\"B\")\nk = te.reduce_axis((0, L), name=\"k\")\nC = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[j, k], axis=k), name=\"C\")\ns = te.create_schedule(C.op)\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
       ]
     },
     {
@@ -62,7 +62,7 @@
       },
       "outputs": [],
       "source": [
-        "factor = 16\nx, y = C.op.axis\nz, = C.op.reduce_axis\nyo, yi = s[C].split(y, factor=factor)\ns[C].reorder(x, yo, yi, z)\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
+        "factor = 16\nx, y = C.op.axis\n(z,) = C.op.reduce_axis\nyo, yi = s[C].split(y, factor=factor)\ns[C].reorder(x, yo, yi, z)\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
       ]
     },
     {
@@ -80,7 +80,7 @@
       },
       "outputs": [],
       "source": [
-        "def intrin_gemv(m, l):\n    a = te.placeholder((l,), name='a')\n    b = te.placeholder((m, l), name='b')\n    k = te.reduce_axis((0, l), name='k')\n    c = te.compute((m,), lambda i: te.sum(a[k] * b[i, k], axis=k), name='c')\n    Ab = tvm.tir.decl_buffer(a.shape, a.dtype,\n                         name=\"A\",\n                         offset_factor=1,\n                         strides=[1])\n    Bb = tvm.tir.decl_buffer(b.shape, b.dtype,\n                         name=\"B\",\n    [...]
+        "def intrin_gemv(m, l):\n    a = te.placeholder((l,), name=\"a\")\n    b = te.placeholder((m, l), name=\"b\")\n    k = te.reduce_axis((0, l), name=\"k\")\n    c = te.compute((m,), lambda i: te.sum(a[k] * b[i, k], axis=k), name=\"c\")\n    Ab = tvm.tir.decl_buffer(a.shape, a.dtype, name=\"A\", offset_factor=1, strides=[1])\n    Bb = tvm.tir.decl_buffer(b.shape, b.dtype, name=\"B\", offset_factor=1, strides=[te.var(\"s1\"), 1])\n    Cb = tvm.tir.decl_buffer(c.shape, c.dtype, name=\ [...]
       ]
     },
     {
@@ -116,7 +116,7 @@
       },
       "outputs": [],
       "source": [
-        "def gemv_impl():\n    cc_code = \"\"\"\n      extern \"C\" int gemv_update(float *cc, float *aa, float *bb, int m, int l, int stride) {\n        for (int i = 0; i < m; ++i) {\n            for (int j = 0; j < l; ++j) {\n                cc[i] += aa[j] * bb[i * stride + j];\n            }\n        }\n        return 0;\n      }\n    \"\"\"\n    from tvm.contrib import util, clang\n    temp = util.tempdir()\n    ll_path = temp.relpath(\"temp.ll\")\n    # Create LLVM ir from c source  [...]
+        "def gemv_impl():\n    cc_code = \"\"\"\n      extern \"C\" int gemv_update(float *cc, float *aa, float *bb, int m, int l, int stride) {\n        for (int i = 0; i < m; ++i) {\n            for (int j = 0; j < l; ++j) {\n                cc[i] += aa[j] * bb[i * stride + j];\n            }\n        }\n        return 0;\n      }\n    \"\"\"\n    from tvm.contrib import util, clang\n\n    temp = util.tempdir()\n    ll_path = temp.relpath(\"temp.ll\")\n    # Create LLVM ir from c sourc [...]
       ]
     },
     {
@@ -152,7 +152,7 @@
       },
       "outputs": [],
       "source": [
-        "func = tvm.build(s, [A, B, C], target=\"llvm\", name=\"gemv\")\n\nfrom tvm.topi.util import get_const_tuple\ndtype = A.dtype\nctx = tvm.context(\"cpu\", 0)\na = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype)\nb = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype)\nc = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx)\nfunc(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c)\ntvm.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3)"
+        "func = tvm.build(s, [A, B, C], target=\"llvm\", name=\"gemv\")\n\nfrom tvm.topi.util import get_const_tuple\n\ndtype = A.dtype\nctx = tvm.context(\"cpu\", 0)\na = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype)\nb = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype)\nc = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx)\nfunc(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c)\ntvm.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3)"
       ]
     },
     {
@@ -188,7 +188,7 @@
       },
       "outputs": [],
       "source": [
-        "def gemv_impl():\n    cc_code = \"\"\"\n      extern \"C\" int gemv_update(float *cc, float *aa, float *bb, int m, int l, int stride) {\n        for (int i = 0; i < m; ++i) {\n            for (int j = 0; j < l; ++j) {\n                cc[i] += aa[j] * bb[i * stride + j];\n            }\n        }\n        return 0;\n      }\n      extern \"C\" int gemv_reset(float *cc, int m) {\n        for (int i = 0; i < m; ++i) {\n            cc[i] = 0.0;\n        }\n        return 0;\n       [...]
+        "def gemv_impl():\n    cc_code = \"\"\"\n      extern \"C\" int gemv_update(float *cc, float *aa, float *bb, int m, int l, int stride) {\n        for (int i = 0; i < m; ++i) {\n            for (int j = 0; j < l; ++j) {\n                cc[i] += aa[j] * bb[i * stride + j];\n            }\n        }\n        return 0;\n      }\n      extern \"C\" int gemv_reset(float *cc, int m) {\n        for (int i = 0; i < m; ++i) {\n            cc[i] = 0.0;\n        }\n        return 0;\n       [...]
       ]
     },
     {
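
The gemv_update kernel tensorized above has a simple contract: c[i] += sum_j a[j] * b[i, j], with b addressed through an explicit row stride. A minimal plain-numpy restatement of that contract (a sketch; the names below are illustrative, not part of the tutorial):

import numpy as np

def gemv_update_ref(c, a, b):
    # the same update the C kernel performs: cc[i] += aa[j] * bb[i * stride + j]
    return c + b.dot(a)

m, l = 4, 8
a = np.random.rand(l).astype("float32")
b = np.random.rand(m, l).astype("float32")
c = np.zeros(m, dtype="float32")
np.testing.assert_allclose(gemv_update_ref(c, a, b), b.dot(a), rtol=1e-5)
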
diff --git a/docs/_downloads/6748a8fb7e82692825b259c20af8372a/opt_conv_cuda.ipynb b/docs/_downloads/6748a8fb7e82692825b259c20af8372a/opt_conv_cuda.ipynb
index fc71dcf..f720e0a 100644
--- a/docs/_downloads/6748a8fb7e82692825b259c20af8372a/opt_conv_cuda.ipynb
+++ b/docs/_downloads/6748a8fb7e82692825b259c20af8372a/opt_conv_cuda.ipynb
@@ -33,7 +33,7 @@
       },
       "outputs": [],
       "source": [
-        "import numpy as np\nimport tvm\nfrom tvm import te\n\n# The sizes of inputs and filters\nbatch = 256\nin_channel = 256\nout_channel = 512\nin_size = 14\nkernel = 3\npad = 1\nstride = 1\n\n# Algorithm\nA = te.placeholder((in_size, in_size, in_channel, batch), name='A')\nW = te.placeholder((kernel, kernel, in_channel, out_channel), name='W')\nout_size = (in_size - kernel + 2*pad) // stride + 1\n# Pad input\nApad = te.compute(\n    (in_size + 2*pad, in_size + 2*pad, in_channel, bat [...]
+        "import numpy as np\nimport tvm\nfrom tvm import te\n\n# The sizes of inputs and filters\nbatch = 256\nin_channel = 256\nout_channel = 512\nin_size = 14\nkernel = 3\npad = 1\nstride = 1\n\n# Algorithm\nA = te.placeholder((in_size, in_size, in_channel, batch), name=\"A\")\nW = te.placeholder((kernel, kernel, in_channel, out_channel), name=\"W\")\nout_size = (in_size - kernel + 2 * pad) // stride + 1\n# Pad input\nApad = te.compute(\n    (in_size + 2 * pad, in_size + 2 * pad, in_ch [...]
       ]
     },
     {
@@ -51,7 +51,7 @@
       },
       "outputs": [],
       "source": [
-        "# Designate the memory hierarchy\ns = te.create_schedule(B.op)\ns[Apad].compute_inline() # compute Apad inline\nAA = s.cache_read(Apad, 'shared', [B])\nWW = s.cache_read(W, \"shared\", [B])\nAL = s.cache_read(AA, \"local\", [B])\nWL = s.cache_read(WW, \"local\", [B])\nBL = s.cache_write(B, \"local\")"
+        "# Designate the memory hierarchy\ns = te.create_schedule(B.op)\ns[Apad].compute_inline()  # compute Apad inline\nAA = s.cache_read(Apad, \"shared\", [B])\nWW = s.cache_read(W, \"shared\", [B])\nAL = s.cache_read(AA, \"local\", [B])\nWL = s.cache_read(WW, \"local\", [B])\nBL = s.cache_write(B, \"local\")"
       ]
     },
     {
@@ -123,7 +123,7 @@
       },
       "outputs": [],
       "source": [
-        "func = tvm.build(s, [A, W, B], 'cuda')\nctx = tvm.gpu(0)\na_np = np.random.uniform(size=(in_size, in_size, in_channel, batch)).astype(A.dtype)\nw_np = np.random.uniform(size=(kernel, kernel, in_channel, out_channel)).astype(W.dtype)\na = tvm.nd.array(a_np, ctx)\nw = tvm.nd.array(w_np, ctx)\nb = tvm.nd.array(np.zeros((out_size, out_size, out_channel, batch), dtype=B.dtype), ctx)\nfunc(a, w, b)\nevaluator = func.time_evaluator(func.entry_name, ctx, number=1)\nprint('Convolution: % [...]
+        "func = tvm.build(s, [A, W, B], \"cuda\")\nctx = tvm.gpu(0)\na_np = np.random.uniform(size=(in_size, in_size, in_channel, batch)).astype(A.dtype)\nw_np = np.random.uniform(size=(kernel, kernel, in_channel, out_channel)).astype(W.dtype)\na = tvm.nd.array(a_np, ctx)\nw = tvm.nd.array(w_np, ctx)\nb = tvm.nd.array(np.zeros((out_size, out_size, out_channel, batch), dtype=B.dtype), ctx)\nfunc(a, w, b)\nevaluator = func.time_evaluator(func.entry_name, ctx, number=1)\nprint(\"Convolution [...]
       ]
     }
   ],
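
A quick sanity check of the out_size formula used in this notebook: with the tutorial's sizes, a 3x3 kernel with padding 1 and stride 1 preserves the 14x14 spatial extent.

in_size, kernel, pad, stride = 14, 3, 1, 1
out_size = (in_size - kernel + 2 * pad) // stride + 1
assert out_size == 14  # "same" convolution: spatial size is preserved
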
diff --git a/docs/_downloads/67c18c78b0f12c3be5dc41b22637d719/matrix_multiply_opt.py b/docs/_downloads/67c18c78b0f12c3be5dc41b22637d719/matrix_multiply_opt.py
index 77b0381..28600d4 100644
--- a/docs/_downloads/67c18c78b0f12c3be5dc41b22637d719/matrix_multiply_opt.py
+++ b/docs/_downloads/67c18c78b0f12c3be5dc41b22637d719/matrix_multiply_opt.py
@@ -105,62 +105,49 @@ assert in_channels % env.BLOCK_IN == 0
 assert out_channels % env.BLOCK_OUT == 0
 
 # Let's derive the tiled input tensor shapes
-data_shape = (batch_size // env.BATCH,
-              in_channels // env.BLOCK_IN,
-              env.BATCH,
-              env.BLOCK_IN)
-weight_shape = (out_channels // env.BLOCK_OUT,
-                in_channels // env.BLOCK_IN,
-                env.BLOCK_OUT,
-                env.BLOCK_IN)
-output_shape = (batch_size // env.BATCH,
-                out_channels // env.BLOCK_OUT,
-                env.BATCH,
-                env.BLOCK_OUT)
+data_shape = (batch_size // env.BATCH, in_channels // env.BLOCK_IN, env.BATCH, env.BLOCK_IN)
+weight_shape = (
+    out_channels // env.BLOCK_OUT,
+    in_channels // env.BLOCK_IN,
+    env.BLOCK_OUT,
+    env.BLOCK_IN,
+)
+output_shape = (batch_size // env.BATCH, out_channels // env.BLOCK_OUT, env.BATCH, env.BLOCK_OUT)
 num_ops = in_channels * out_channels * batch_size * 2
 
 # Reduction axes
-ic = te.reduce_axis((0, in_channels // env.BLOCK_IN), name='ic')
-ic_tns = te.reduce_axis((0, env.BLOCK_IN), name='ic_tns')
+ic = te.reduce_axis((0, in_channels // env.BLOCK_IN), name="ic")
+ic_tns = te.reduce_axis((0, env.BLOCK_IN), name="ic_tns")
 
 # Input placeholder tensors
 data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
 weight = te.placeholder(weight_shape, name="weight", dtype=env.wgt_dtype)
 
 # Copy buffers
-data_buf = te.compute(data_shape,
-                       lambda *i: data(*i),
-                       "data_buf")
-weight_buf = te.compute(weight_shape,
-                         lambda *i: weight(*i),
-                         "weight_buf")
+data_buf = te.compute(data_shape, lambda *i: data(*i), "data_buf")
+weight_buf = te.compute(weight_shape, lambda *i: weight(*i), "weight_buf")
 
 # Declare matrix multiply computation
-res_gemm = te.compute(output_shape,
-                       lambda bo, co, bi, ci: te.sum(
-                            data_buf[bo, ic, bi, ic_tns].astype(env.acc_dtype) *
-                            weight_buf[co, ic, ci, ic_tns].astype(env.acc_dtype),
-                            axis=[ic, ic_tns]),
-                       name="res_gem")
+res_gemm = te.compute(
+    output_shape,
+    lambda bo, co, bi, ci: te.sum(
+        data_buf[bo, ic, bi, ic_tns].astype(env.acc_dtype)
+        * weight_buf[co, ic, ci, ic_tns].astype(env.acc_dtype),
+        axis=[ic, ic_tns],
+    ),
+    name="res_gem",
+)
 
 # Add shift stage for fixed-point normalization
-res_shr = te.compute(output_shape,
-                      lambda *i: res_gemm(*i) >> env.INP_WIDTH,
-                      name="res_shr")
+res_shr = te.compute(output_shape, lambda *i: res_gemm(*i) >> env.INP_WIDTH, name="res_shr")
 
 # Apply clipping to the range (0, input max value)
-inp_max = (1<<(env.INP_WIDTH-1))-1
-res_max = te.compute(output_shape,
-                      lambda *i: tvm.te.max(res_shr(*i), 0),
-                      "res_max")
-res_min = te.compute(output_shape,
-                      lambda *i: tvm.te.min(res_max(*i), inp_max),
-                      "res_min")
+inp_max = (1 << (env.INP_WIDTH - 1)) - 1
+res_max = te.compute(output_shape, lambda *i: tvm.te.max(res_shr(*i), 0), "res_max")
+res_min = te.compute(output_shape, lambda *i: tvm.te.min(res_max(*i), inp_max), "res_min")
 
 # Apply typecast to input data type before sending results back
-res = te.compute(output_shape,
-                  lambda *i: res_min(*i).astype(env.inp_dtype),
-                  name="res")
+res = te.compute(output_shape, lambda *i: res_min(*i).astype(env.inp_dtype), name="res")
 
 ######################################################################
 # Scheduling the Computation
@@ -333,20 +320,16 @@ f = remote.load_module("gemm.o")
 ctx = remote.ext_dev(0)
 
 # Initialize the data and weight arrays randomly in the int range of (-128, 128]
-data_np = np.random.randint(
-    -128, 128, size=(batch_size, in_channels)).astype(data.dtype)
-weight_np = np.random.randint(
-    -128, 128, size=(out_channels, in_channels)).astype(weight.dtype)
+data_np = np.random.randint(-128, 128, size=(batch_size, in_channels)).astype(data.dtype)
+weight_np = np.random.randint(-128, 128, size=(out_channels, in_channels)).astype(weight.dtype)
 
 # Apply packing to the data and weight arrays from a 2D to a 4D packed layout
-data_packed = data_np.reshape(batch_size // env.BATCH,
-                              env.BATCH,
-                              in_channels // env.BLOCK_IN,
-                              env.BLOCK_IN).transpose((0, 2, 1, 3))
-weight_packed = weight_np.reshape(out_channels // env.BLOCK_OUT,
-                                  env.BLOCK_OUT,
-                                  in_channels // env.BLOCK_IN,
-                                  env.BLOCK_IN).transpose((0, 2, 1, 3))
+data_packed = data_np.reshape(
+    batch_size // env.BATCH, env.BATCH, in_channels // env.BLOCK_IN, env.BLOCK_IN
+).transpose((0, 2, 1, 3))
+weight_packed = weight_np.reshape(
+    out_channels // env.BLOCK_OUT, env.BLOCK_OUT, in_channels // env.BLOCK_IN, env.BLOCK_IN
+).transpose((0, 2, 1, 3))
 
 # Format the input/output arrays with tvm.nd.array to the DLPack standard
 data_nd = tvm.nd.array(data_packed, ctx)
@@ -361,15 +344,13 @@ if env.TARGET in ["sim", "tsim"]:
 f(data_nd, weight_nd, res_nd)
 
 # Verify against numpy implementation
-res_ref = np.dot(data_np.astype(env.acc_dtype),
-                 weight_np.T.astype(env.acc_dtype))
+res_ref = np.dot(data_np.astype(env.acc_dtype), weight_np.T.astype(env.acc_dtype))
 res_ref = res_ref >> env.INP_WIDTH
 res_ref = np.clip(res_ref, 0, inp_max)
 res_ref = res_ref.astype(res.dtype)
-res_ref = res_ref.reshape(batch_size // env.BATCH,
-                          env.BATCH,
-                          out_channels // env.BLOCK_OUT,
-                          env.BLOCK_OUT).transpose((0, 2, 1, 3))
+res_ref = res_ref.reshape(
+    batch_size // env.BATCH, env.BATCH, out_channels // env.BLOCK_OUT, env.BLOCK_OUT
+).transpose((0, 2, 1, 3))
 np.testing.assert_equal(res_ref, res_nd.asnumpy())
 
 # Print stats
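
The reshape/transpose packing used above is easier to see on a toy example. A plain-numpy sketch with illustrative tile sizes (BATCH=2 and BLOCK_IN=4 are stand-ins here, not VTA's actual parameters):

import numpy as np

BATCH, BLOCK_IN = 2, 4
batch_size, in_channels = 4, 8
data_np = np.arange(batch_size * in_channels).reshape(batch_size, in_channels)
data_packed = data_np.reshape(
    batch_size // BATCH, BATCH, in_channels // BLOCK_IN, BLOCK_IN
).transpose((0, 2, 1, 3))
# element (b, c) lands in tile (b // BATCH, c // BLOCK_IN),
# at intra-tile offset (b % BATCH, c % BLOCK_IN)
b, c = 3, 5
assert data_packed[b // BATCH, c // BLOCK_IN, b % BATCH, c % BLOCK_IN] == data_np[b, c]
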
diff --git a/docs/_downloads/67c1d621743dc9152a29366a6b72ae71/tuple_inputs.py b/docs/_downloads/67c1d621743dc9152a29366a6b72ae71/tuple_inputs.py
index 828797a..73db7b9 100644
--- a/docs/_downloads/67c1d621743dc9152a29366a6b72ae71/tuple_inputs.py
+++ b/docs/_downloads/67c1d621743dc9152a29366a6b72ae71/tuple_inputs.py
@@ -40,9 +40,9 @@ import numpy as np
 #
 n = te.var("n")
 m = te.var("m")
-A0 = te.placeholder((m, n), name='A0')
-A1 = te.placeholder((m, n), name='A1')
-B0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] + 2, A1[i, j] * 3), name='B')
+A0 = te.placeholder((m, n), name="A0")
+A1 = te.placeholder((m, n), name="A1")
+B0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] + 2, A1[i, j] * 3), name="B")
 
 # The generated IR code would be:
 s = te.create_schedule(B0.op)
@@ -66,20 +66,22 @@ def fcombine(x, y):
     rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])
     return lhs, rhs
 
+
 # our identity element also needs to be a tuple, so `fidentity` accepts
 # two types as inputs.
 def fidentity(t0, t1):
     return tvm.tir.const(-1, t0), tvm.te.min_value(t1)
 
-argmax = te.comm_reducer(fcombine, fidentity, name='argmax')
+
+argmax = te.comm_reducer(fcombine, fidentity, name="argmax")
 
 # describe the reduction computation
-m = te.var('m')
-n = te.var('n')
-idx = te.placeholder((m, n), name='idx', dtype='int32')
-val = te.placeholder((m, n), name='val', dtype='int32')
-k = te.reduce_axis((0, n), 'k')
-T0, T1 = te.compute((m, ), lambda i: argmax((idx[i, k], val[i, k]), axis=k), name='T')
+m = te.var("m")
+n = te.var("n")
+idx = te.placeholder((m, n), name="idx", dtype="int32")
+val = te.placeholder((m, n), name="val", dtype="int32")
+k = te.reduce_axis((0, n), "k")
+T0, T1 = te.compute((m,), lambda i: argmax((idx[i, k], val[i, k]), axis=k), name="T")
 
 # the generated IR code would be:
 s = te.create_schedule(T0.op)
@@ -100,10 +102,10 @@ print(tvm.lower(s, [idx, val, T0, T1], simple_mode=True))
 
 n = te.var("n")
 m = te.var("m")
-A0 = te.placeholder((m, n), name='A0')
-B0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] + 2, A0[i, j] * 3), name='B')
-A1 = te.placeholder((m, n), name='A1')
-C = te.compute((m, n), lambda i, j: A1[i, j] + B0[i, j], name='C')
+A0 = te.placeholder((m, n), name="A0")
+B0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] + 2, A0[i, j] * 3), name="B")
+A1 = te.placeholder((m, n), name="A1")
+C = te.compute((m, n), lambda i, j: A1[i, j] + B0[i, j], name="C")
 
 s = te.create_schedule(C.op)
 s[B0].compute_at(s[C], C.op.axis[0])
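
What the argmax reducer above computes, restated per row in plain numpy (a reference sketch; the TVM version runs inside the generated kernel):

import numpy as np

idx_np = np.array([[0, 1, 2], [0, 1, 2]], dtype="int32")
val_np = np.array([[3, 9, 5], [7, 2, 1]], dtype="int32")
T1_ref = val_np.max(axis=1)                           # the running maximum value
T0_ref = idx_np[np.arange(2), val_np.argmax(axis=1)]  # the index carried with it
assert T0_ref.tolist() == [1, 0] and T1_ref.tolist() == [9, 7]
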
diff --git a/docs/_downloads/696dd37904ef92773435ca321ff41bfb/from_onnx.py b/docs/_downloads/696dd37904ef92773435ca321ff41bfb/from_onnx.py
index 9973a08..e68a398 100644
--- a/docs/_downloads/696dd37904ef92773435ca321ff41bfb/from_onnx.py
+++ b/docs/_downloads/696dd37904ef92773435ca321ff41bfb/from_onnx.py
@@ -45,11 +45,15 @@ from tvm.contrib.download import download_testdata
 # The example super-resolution model used here is exactly the same model as in the ONNX tutorial
 # http://pytorch.org/tutorials/advanced/super_resolution_with_caffe2.html
 # We skip the PyTorch model construction part and download the saved ONNX model.
-model_url = ''.join(['https://gist.github.com/zhreshold/',
-                     'bcda4716699ac97ea44f791c24310193/raw/',
-                     '93672b029103648953c4e5ad3ac3aadf346a4cdc/',
-                     'super_resolution_0.2.onnx'])
-model_path = download_testdata(model_url, 'super_resolution.onnx', module='onnx')
+model_url = "".join(
+    [
+        "https://gist.github.com/zhreshold/",
+        "bcda4716699ac97ea44f791c24310193/raw/",
+        "93672b029103648953c4e5ad3ac3aadf346a4cdc/",
+        "super_resolution_0.2.onnx",
+    ]
+)
+model_path = download_testdata(model_url, "super_resolution.onnx", module="onnx")
 # now you have super_resolution.onnx on disk
 onnx_model = onnx.load(model_path)
 
@@ -58,8 +62,9 @@ onnx_model = onnx.load(model_path)
 # ---------------------------------------------
 # A single cat dominates the examples!
 from PIL import Image
-img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
-img_path = download_testdata(img_url, 'cat.png', module='data')
+
+img_url = "https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true"
+img_path = download_testdata(img_url, "cat.png", module="data")
 img = Image.open(img_path).resize((224, 224))
 img_ycbcr = img.convert("YCbCr")  # convert to YCbCr
 img_y, img_cb, img_cr = img_ycbcr.split()
@@ -68,19 +73,19 @@ x = np.array(img_y)[np.newaxis, np.newaxis, :, :]
 ######################################################################
 # Compile the model with relay
 # ---------------------------------------------
-target = 'llvm'
+target = "llvm"
 
-input_name = '1'
+input_name = "1"
 shape_dict = {input_name: x.shape}
 mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)
 
 with tvm.transform.PassContext(opt_level=1):
-    intrp = relay.build_module.create_executor('graph', mod, tvm.cpu(0), target)
+    intrp = relay.build_module.create_executor("graph", mod, tvm.cpu(0), target)
 
 ######################################################################
 # Execute on TVM
 # ---------------------------------------------
-dtype = 'float32'
+dtype = "float32"
 tvm_output = intrp.evaluate()(tvm.nd.array(x.astype(dtype)), **params).asnumpy()
 
 ######################################################################
@@ -88,11 +93,12 @@ tvm_output = intrp.evaluate()(tvm.nd.array(x.astype(dtype)), **params).asnumpy()
 # ---------------------------------------------
 # We put the input and output images side by side
 from matplotlib import pyplot as plt
-out_y = Image.fromarray(np.uint8((tvm_output[0, 0]).clip(0, 255)), mode='L')
+
+out_y = Image.fromarray(np.uint8((tvm_output[0, 0]).clip(0, 255)), mode="L")
 out_cb = img_cb.resize(out_y.size, Image.BICUBIC)
 out_cr = img_cr.resize(out_y.size, Image.BICUBIC)
-result = Image.merge('YCbCr', [out_y, out_cb, out_cr]).convert('RGB')
-canvas = np.full((672, 672*2, 3), 255)
+result = Image.merge("YCbCr", [out_y, out_cb, out_cr]).convert("RGB")
+canvas = np.full((672, 672 * 2, 3), 255)
 canvas[0:224, 0:224, :] = np.asarray(img)
 canvas[:, 672:, :] = np.asarray(result)
 plt.imshow(canvas.astype(np.uint8))
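
The super-resolution model runs on the luminance channel only, so the input prepared above is a single-channel NCHW tensor. A quick shape check (the zero array stands in for the PIL Y channel):

import numpy as np

img_y = np.zeros((224, 224), dtype="uint8")        # stand-in for the PIL Y channel
x = np.array(img_y)[np.newaxis, np.newaxis, :, :]  # add batch and channel dims
assert x.shape == (1, 1, 224, 224)
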
diff --git a/docs/_downloads/6a91d98d4242322072303282a1f2de9c/relay_quick_start.py b/docs/_downloads/6a91d98d4242322072303282a1f2de9c/relay_quick_start.py
index e52a99a..5c7f933 100644
--- a/docs/_downloads/6a91d98d4242322072303282a1f2de9c/relay_quick_start.py
+++ b/docs/_downloads/6a91d98d4242322072303282a1f2de9c/relay_quick_start.py
@@ -66,7 +66,8 @@ data_shape = (batch_size,) + image_shape
 out_shape = (batch_size, num_class)
 
 mod, params = relay.testing.resnet.get_workload(
-    num_layers=18, batch_size=batch_size, image_shape=image_shape)
+    num_layers=18, batch_size=batch_size, image_shape=image_shape
+)
 
 # set show_meta_data=True if you want to show meta data
 print(mod.astext(show_meta_data=False))
@@ -97,7 +98,7 @@ print(mod.astext(show_meta_data=False))
 opt_level = 3
 target = tvm.target.cuda()
 with tvm.transform.PassContext(opt_level=opt_level):
-    graph, lib, params = relay.build(mod, target, params=params)
+    lib = relay.build(mod, target, params=params)
 
 #####################################################################
 # Run the generated library
@@ -108,10 +109,9 @@ with tvm.transform.PassContext(opt_level=opt_level):
 ctx = tvm.gpu()
 data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
 # create module
-module = graph_runtime.create(graph, lib, ctx)
+module = graph_runtime.GraphModule(lib["default"](ctx))
 # set input
 module.set_input("data", data)
-module.set_input(**params)
 # run
 module.run()
 # get output
@@ -134,22 +134,15 @@ from tvm.contrib import util
 temp = util.tempdir()
 path_lib = temp.relpath("deploy_lib.tar")
 lib.export_library(path_lib)
-with open(temp.relpath("deploy_graph.json"), "w") as fo:
-    fo.write(graph)
-with open(temp.relpath("deploy_param.params"), "wb") as fo:
-    fo.write(relay.save_param_dict(params))
 print(temp.listdir())
 
 ####################################################
 
 # load the module back.
-loaded_json = open(temp.relpath("deploy_graph.json")).read()
 loaded_lib = tvm.runtime.load_module(path_lib)
-loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read())
 input_data = tvm.nd.array(np.random.uniform(size=data_shape).astype("float32"))
 
-module = graph_runtime.create(loaded_json, loaded_lib, ctx)
-module.load_params(loaded_params)
+module = graph_runtime.GraphModule(loaded_lib["default"](ctx))
 module.run(data=input_data)
 out_deploy = module.get_output(0).asnumpy()
 
diff --git a/docs/_downloads/6be1519353297beeea03fe17712dc16f/using_external_lib.ipynb b/docs/_downloads/6be1519353297beeea03fe17712dc16f/using_external_lib.ipynb
index fe5e4f9..0806646 100644
--- a/docs/_downloads/6be1519353297beeea03fe17712dc16f/using_external_lib.ipynb
+++ b/docs/_downloads/6be1519353297beeea03fe17712dc16f/using_external_lib.ipynb
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "out_channels = 16\nbatch_size = 1\n\ndata = relay.var(\"data\", relay.TensorType((batch_size, 3, 224, 224), \"float32\"))\nweight = relay.var(\"weight\")\nbn_gamma = relay.var(\"bn_gamma\")\nbn_beta = relay.var(\"bn_beta\")\nbn_mmean = relay.var(\"bn_mean\")\nbn_mvar = relay.var(\"bn_var\")\n\nsimple_net = relay.nn.conv2d(data=data, weight=weight, kernel_size=(3,3), channels=out_channels, padding=(1, 1))\nsimple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean,  [...]
+        "out_channels = 16\nbatch_size = 1\n\ndata = relay.var(\"data\", relay.TensorType((batch_size, 3, 224, 224), \"float32\"))\nweight = relay.var(\"weight\")\nbn_gamma = relay.var(\"bn_gamma\")\nbn_beta = relay.var(\"bn_beta\")\nbn_mmean = relay.var(\"bn_mean\")\nbn_mvar = relay.var(\"bn_var\")\n\nsimple_net = relay.nn.conv2d(\n    data=data, weight=weight, kernel_size=(3, 3), channels=out_channels, padding=(1, 1)\n)\nsimple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, b [...]
       ]
     },
     {
@@ -62,7 +62,7 @@
       },
       "outputs": [],
       "source": [
-        "import logging\nlogging.basicConfig(level=logging.DEBUG) # to dump TVM IR after fusion\n\ntarget = \"cuda\"\nlib = relay.build_module.build(net, target, params=params)\n\nctx = tvm.context(target, 0)\ndata = np.random.uniform(-1, 1, size=data_shape).astype(\"float32\")\nmodule = runtime.GraphModule(lib['default'](ctx))\nmodule.set_input(\"data\", data)\nmodule.run()\nout_shape = (batch_size, out_channels, 224, 224)\nout = module.get_output(0, tvm.nd.empty(out_shape))\nout_cuda = [...]
+        "import logging\n\nlogging.basicConfig(level=logging.DEBUG)  # to dump TVM IR after fusion\n\ntarget = \"cuda\"\nlib = relay.build_module.build(net, target, params=params)\n\nctx = tvm.context(target, 0)\ndata = np.random.uniform(-1, 1, size=data_shape).astype(\"float32\")\nmodule = runtime.GraphModule(lib[\"default\"](ctx))\nmodule.set_input(\"data\", data)\nmodule.run()\nout_shape = (batch_size, out_channels, 224, 224)\nout = module.get_output(0, tvm.nd.empty(out_shape))\nout_c [...]
       ]
     },
     {
@@ -87,7 +87,7 @@
       },
       "outputs": [],
       "source": [
-        "net, params = testing.create_workload(simple_net)\ntarget = \"cuda -libs=cudnn\" # use cudnn for convolution\nlib = relay.build_module.build(net, target, params=params)\n\nctx = tvm.context(target, 0)\ndata = np.random.uniform(-1, 1, size=data_shape).astype(\"float32\")\nmodule = runtime.GraphModule(lib['default'](ctx))\nmodule.set_input(\"data\", data)\nmodule.run()\nout_shape = (batch_size, out_channels, 224, 224)\nout = module.get_output(0, tvm.nd.empty(out_shape))\nout_cudnn [...]
+        "net, params = testing.create_workload(simple_net)\ntarget = \"cuda -libs=cudnn\"  # use cudnn for convolution\nlib = relay.build_module.build(net, target, params=params)\n\nctx = tvm.context(target, 0)\ndata = np.random.uniform(-1, 1, size=data_shape).astype(\"float32\")\nmodule = runtime.GraphModule(lib[\"default\"](ctx))\nmodule.set_input(\"data\", data)\nmodule.run()\nout_shape = (batch_size, out_channels, 224, 224)\nout = module.get_output(0, tvm.nd.empty(out_shape))\nout_cu [...]
       ]
     },
     {
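
The switch to cuDNN in this notebook happens entirely through the target string; the tutorial notes that several external libraries can be listed comma-separated. For illustration:

target = "cuda"                     # plain TVM codegen
target = "cuda -libs=cudnn"         # offload supported ops (e.g. conv2d) to cuDNN
target = "cuda -libs=cudnn,cublas"  # multiple libraries, separated by commas
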
diff --git a/docs/_downloads/6c8a9d3bc4c689f8680a968349965ee5/from_pytorch.ipynb b/docs/_downloads/6c8a9d3bc4c689f8680a968349965ee5/from_pytorch.ipynb
index f333463..3023e84 100644
--- a/docs/_downloads/6c8a9d3bc4c689f8680a968349965ee5/from_pytorch.ipynb
+++ b/docs/_downloads/6c8a9d3bc4c689f8680a968349965ee5/from_pytorch.ipynb
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "model_name = 'resnet18'\nmodel = getattr(torchvision.models, model_name)(pretrained=True)\nmodel = model.eval()\n\n# We grab the TorchScripted model via tracing\ninput_shape = [1, 3, 224, 224]\ninput_data = torch.randn(input_shape)\nscripted_model = torch.jit.trace(model, input_data).eval()"
+        "model_name = \"resnet18\"\nmodel = getattr(torchvision.models, model_name)(pretrained=True)\nmodel = model.eval()\n\n# We grab the TorchScripted model via tracing\ninput_shape = [1, 3, 224, 224]\ninput_data = torch.randn(input_shape)\nscripted_model = torch.jit.trace(model, input_data).eval()"
       ]
     },
     {
@@ -62,7 +62,7 @@
       },
       "outputs": [],
       "source": [
-        "from PIL import Image\nimg_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'\nimg_path = download_testdata(img_url, 'cat.png', module='data')\nimg = Image.open(img_path).resize((224, 224))\n\n# Preprocess the image and convert to tensor\nfrom torchvision import transforms\nmy_preprocess = transforms.Compose([\n    transforms.Resize(256),\n    transforms.CenterCrop(224),\n    transforms.ToTensor(),\n    transforms.Normalize(mean=[0.485, 0.456, 0.406],\n   [...]
+        "from PIL import Image\n\nimg_url = \"https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true\"\nimg_path = download_testdata(img_url, \"cat.png\", module=\"data\")\nimg = Image.open(img_path).resize((224, 224))\n\n# Preprocess the image and convert to tensor\nfrom torchvision import transforms\n\nmy_preprocess = transforms.Compose(\n    [\n        transforms.Resize(256),\n        transforms.CenterCrop(224),\n        transforms.ToTensor(),\n        transforms.Normalize [...]
       ]
     },
     {
@@ -80,7 +80,7 @@
       },
       "outputs": [],
       "source": [
-        "input_name = 'input0'\nshape_list = [(input_name, img.shape)]\nmod, params = relay.frontend.from_pytorch(scripted_model,\n                                          shape_list)"
+        "input_name = \"input0\"\nshape_list = [(input_name, img.shape)]\nmod, params = relay.frontend.from_pytorch(scripted_model, shape_list)"
       ]
     },
     {
@@ -98,7 +98,7 @@
       },
       "outputs": [],
       "source": [
-        "target = 'llvm'\ntarget_host = 'llvm'\nctx = tvm.cpu(0)\nwith tvm.transform.PassContext(opt_level=3):\n    lib = relay.build(mod, target=target, target_host=target_host, params=params)"
+        "target = \"llvm\"\ntarget_host = \"llvm\"\nctx = tvm.cpu(0)\nwith tvm.transform.PassContext(opt_level=3):\n    lib = relay.build(mod, target=target, target_host=target_host, params=params)"
       ]
     },
     {
@@ -116,7 +116,7 @@
       },
       "outputs": [],
       "source": [
-        "from tvm.contrib import graph_runtime\ndtype = 'float32'\nm = graph_runtime.GraphModule(lib['default'](ctx))\n# Set inputs\nm.set_input(input_name, tvm.nd.array(img.astype(dtype)))\n# Execute\nm.run()\n# Get outputs\ntvm_output = m.get_output(0)"
+        "from tvm.contrib import graph_runtime\n\ndtype = \"float32\"\nm = graph_runtime.GraphModule(lib[\"default\"](ctx))\n# Set inputs\nm.set_input(input_name, tvm.nd.array(img.astype(dtype)))\n# Execute\nm.run()\n# Get outputs\ntvm_output = m.get_output(0)"
       ]
     },
     {
@@ -134,7 +134,7 @@
       },
       "outputs": [],
       "source": [
-        "synset_url = ''.join(['https://raw.githubusercontent.com/Cadene/',\n                      'pretrained-models.pytorch/master/data/',\n                      'imagenet_synsets.txt'])\nsynset_name = 'imagenet_synsets.txt'\nsynset_path = download_testdata(synset_url, synset_name, module='data')\nwith open(synset_path) as f:\n    synsets = f.readlines()\n\nsynsets = [x.strip() for x in synsets]\nsplits = [line.split(' ') for line in synsets]\nkey_to_classname = {spl[0]:' '.join(spl[1: [...]
+        "synset_url = \"\".join(\n    [\n        \"https://raw.githubusercontent.com/Cadene/\",\n        \"pretrained-models.pytorch/master/data/\",\n        \"imagenet_synsets.txt\",\n    ]\n)\nsynset_name = \"imagenet_synsets.txt\"\nsynset_path = download_testdata(synset_url, synset_name, module=\"data\")\nwith open(synset_path) as f:\n    synsets = f.readlines()\n\nsynsets = [x.strip() for x in synsets]\nsplits = [line.split(\" \") for line in synsets]\nkey_to_classname = {spl[0]: \"  [...]
       ]
     }
   ],
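
The synset post-processing at the end of this notebook splits each line on spaces, taking the first token as the WordNet id and rejoining the rest as the class name. On one illustrative entry:

line = "n02123045 tabby, tabby cat"  # illustrative synset line
spl = line.split(" ")
key, classname = spl[0], " ".join(spl[1:])
assert key == "n02123045" and classname == "tabby, tabby cat"
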
diff --git a/docs/_downloads/70a0767409e81bb5aaa9ce4e7a151dec/tensor_expr_get_started.ipynb b/docs/_downloads/70a0767409e81bb5aaa9ce4e7a151dec/tensor_expr_get_started.ipynb
index 691532a..03e14a2 100644
--- a/docs/_downloads/70a0767409e81bb5aaa9ce4e7a151dec/tensor_expr_get_started.ipynb
+++ b/docs/_downloads/70a0767409e81bb5aaa9ce4e7a151dec/tensor_expr_get_started.ipynb
@@ -26,7 +26,7 @@
       },
       "outputs": [],
       "source": [
-        "from __future__ import absolute_import, print_function\n\nimport tvm\nfrom tvm import te\nimport numpy as np\n\n# Global declarations of environment.\n\ntgt_host=\"llvm\"\n# Change it to respective GPU if gpu is enabled Ex: cuda, opencl, rocm\ntgt=\"cuda\""
+        "from __future__ import absolute_import, print_function\n\nimport tvm\nfrom tvm import te\nimport numpy as np\n\n# Global declarations of environment.\n\ntgt_host = \"llvm\"\n# Change it to respective GPU if gpu is enabled Ex: cuda, opencl, rocm\ntgt = \"cuda\""
       ]
     },
     {
@@ -51,7 +51,7 @@
       },
       "outputs": [],
       "source": [
-        "n = te.var(\"n\")\nA = te.placeholder((n,), name='A')\nB = te.placeholder((n,), name='B')\nC = te.compute(A.shape, lambda i: A[i] + B[i], name=\"C\")\nprint(type(C))"
+        "n = te.var(\"n\")\nA = te.placeholder((n,), name=\"A\")\nB = te.placeholder((n,), name=\"B\")\nC = te.compute(A.shape, lambda i: A[i] + B[i], name=\"C\")\nprint(type(C))"
       ]
     },
     {
@@ -105,7 +105,7 @@
       },
       "outputs": [],
       "source": [
-        "if tgt == \"cuda\" or tgt == \"rocm\" or tgt.startswith('opencl'):\n  s[C].bind(bx, te.thread_axis(\"blockIdx.x\"))\n  s[C].bind(tx, te.thread_axis(\"threadIdx.x\"))"
+        "if tgt == \"cuda\" or tgt == \"rocm\" or tgt.startswith(\"opencl\"):\n    s[C].bind(bx, te.thread_axis(\"blockIdx.x\"))\n    s[C].bind(tx, te.thread_axis(\"threadIdx.x\"))"
       ]
     },
     {
@@ -159,7 +159,7 @@
       },
       "outputs": [],
       "source": [
-        "if tgt == \"cuda\" or tgt == \"rocm\" or tgt.startswith('opencl'):\n    dev_module = fadd.imported_modules[0]\n    print(\"-----GPU code-----\")\n    print(dev_module.get_source())\nelse:\n    print(fadd.get_source())"
+        "if tgt == \"cuda\" or tgt == \"rocm\" or tgt.startswith(\"opencl\"):\n    dev_module = fadd.imported_modules[0]\n    print(\"-----GPU code-----\")\n    print(dev_module.get_source())\nelse:\n    print(fadd.get_source())"
       ]
     },
     {
@@ -184,7 +184,7 @@
       },
       "outputs": [],
       "source": [
-        "from tvm.contrib import cc\nfrom tvm.contrib import util\n\ntemp = util.tempdir()\nfadd.save(temp.relpath(\"myadd.o\"))\nif tgt == \"cuda\":\n    fadd.imported_modules[0].save(temp.relpath(\"myadd.ptx\"))\nif tgt == \"rocm\":\n    fadd.imported_modules[0].save(temp.relpath(\"myadd.hsaco\"))\nif tgt.startswith('opencl'):\n    fadd.imported_modules[0].save(temp.relpath(\"myadd.cl\"))\ncc.create_shared(temp.relpath(\"myadd.so\"), [temp.relpath(\"myadd.o\")])\nprint(temp.listdir())"
+        "from tvm.contrib import cc\nfrom tvm.contrib import util\n\ntemp = util.tempdir()\nfadd.save(temp.relpath(\"myadd.o\"))\nif tgt == \"cuda\":\n    fadd.imported_modules[0].save(temp.relpath(\"myadd.ptx\"))\nif tgt == \"rocm\":\n    fadd.imported_modules[0].save(temp.relpath(\"myadd.hsaco\"))\nif tgt.startswith(\"opencl\"):\n    fadd.imported_modules[0].save(temp.relpath(\"myadd.cl\"))\ncc.create_shared(temp.relpath(\"myadd.so\"), [temp.relpath(\"myadd.o\")])\nprint(temp.listdir())"
       ]
     },
     {
@@ -209,7 +209,7 @@
       },
       "outputs": [],
       "source": [
-        "fadd1 = tvm.runtime.load_module(temp.relpath(\"myadd.so\"))\nif tgt == \"cuda\":\n    fadd1_dev = tvm.runtime.load_module(temp.relpath(\"myadd.ptx\"))\n    fadd1.import_module(fadd1_dev)\n\nif tgt == \"rocm\":\n    fadd1_dev = tvm.runtime.load_module(temp.relpath(\"myadd.hsaco\"))\n    fadd1.import_module(fadd1_dev)\n\nif tgt.startswith('opencl'):\n    fadd1_dev = tvm.runtime.load_module(temp.relpath(\"myadd.cl\"))\n    fadd1.import_module(fadd1_dev)\n\nfadd1(a, b, c)\ntvm.testi [...]
+        "fadd1 = tvm.runtime.load_module(temp.relpath(\"myadd.so\"))\nif tgt == \"cuda\":\n    fadd1_dev = tvm.runtime.load_module(temp.relpath(\"myadd.ptx\"))\n    fadd1.import_module(fadd1_dev)\n\nif tgt == \"rocm\":\n    fadd1_dev = tvm.runtime.load_module(temp.relpath(\"myadd.hsaco\"))\n    fadd1.import_module(fadd1_dev)\n\nif tgt.startswith(\"opencl\"):\n    fadd1_dev = tvm.runtime.load_module(temp.relpath(\"myadd.cl\"))\n    fadd1.import_module(fadd1_dev)\n\nfadd1(a, b, c)\ntvm.tes [...]
       ]
     },
     {
@@ -252,7 +252,7 @@
       },
       "outputs": [],
       "source": [
-        "if tgt.startswith('opencl'):\n    fadd_cl = tvm.build(s, [A, B, C], tgt, name=\"myadd\")\n    print(\"------opencl code------\")\n    print(fadd_cl.imported_modules[0].get_source())\n    ctx = tvm.cl(0)\n    n = 1024\n    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)\n    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)\n    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)\n    fadd_cl(a, b, c)\n    tvm.testing.assert_allclose(c.asnumpy(), a.a [...]
+        "if tgt.startswith(\"opencl\"):\n    fadd_cl = tvm.build(s, [A, B, C], tgt, name=\"myadd\")\n    print(\"------opencl code------\")\n    print(fadd_cl.imported_modules[0].get_source())\n    ctx = tvm.cl(0)\n    n = 1024\n    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)\n    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)\n    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)\n    fadd_cl(a, b, c)\n    tvm.testing.assert_allclose(c.asnumpy(), a [...]
       ]
     },
     {
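
The bind calls in this notebook only make sense after the schedule splits the loop. Conceptually (a sketch, assuming a split factor of 64 for illustration):

# s[C].split(C.op.axis[0], factor=64) turns
#     for i in range(n): C[i] = A[i] + B[i]
# into
#     for bx in range(ceil(n / 64)):
#         for tx in range(64):
#             i = bx * 64 + tx
#             if i < n: C[i] = A[i] + B[i]
# and binding bx to blockIdx.x and tx to threadIdx.x maps those two loops onto
# the GPU grid, so each CUDA thread computes one element.
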
diff --git a/docs/_downloads/70d345c5409f99cb5de9dc44f147ff6f/build_gcn.py b/docs/_downloads/70d345c5409f99cb5de9dc44f147ff6f/build_gcn.py
index 3f4efeb..5c571ef 100644
--- a/docs/_downloads/70d345c5409f99cb5de9dc44f147ff6f/build_gcn.py
+++ b/docs/_downloads/70d345c5409f99cb5de9dc44f147ff6f/build_gcn.py
@@ -46,14 +46,9 @@ import dgl
 import networkx as nx
 from dgl.nn.pytorch import GraphConv
 
+
 class GCN(nn.Module):
-    def __init__(self,
-                 g,
-                 n_infeat,
-                 n_hidden,
-                 n_classes,
-                 n_layers,
-                 activation):
+    def __init__(self, g, n_infeat, n_hidden, n_classes, n_layers, activation):
         super(GCN, self).__init__()
         self.g = g
         self.layers = nn.ModuleList()
@@ -66,7 +61,7 @@ class GCN(nn.Module):
         h = features
         for i, layer in enumerate(self.layers):
             # handle api changes for different DGL versions
-            if dgl.__version__ > '0.3':
+            if dgl.__version__ > "0.3":
                 h = layer(self.g, h)
             else:
                 h = layer(h, self.g)
@@ -80,6 +75,7 @@ class GCN(nn.Module):
 from dgl.data import load_data
 from collections import namedtuple
 
+
 def load_dataset(dataset="cora"):
     args = namedtuple("args", ["dataset"])
     data = load_data(args(dataset))
@@ -93,7 +89,7 @@ def load_dataset(dataset="cora"):
 
 
 def evaluate(data, logits):
-    test_mask = data.test_mask # the test set which isn't included in the training phase
+    test_mask = data.test_mask  # the test set which isn't included in the training phase
 
     pred = logits.argmax(axis=1)
     acc = ((pred == data.labels) * test_mask).sum() / test_mask.sum()
@@ -142,16 +138,11 @@ from dgl import DGLGraph
 features = torch.FloatTensor(data.features)
 dgl_g = DGLGraph(g)
 
-torch_model = GCN(dgl_g,
-                  infeat_dim,
-                  num_hidden,
-                  num_classes,
-                  num_layers,
-                  F.relu)
+torch_model = GCN(dgl_g, infeat_dim, num_hidden, num_classes, num_layers, F.relu)
 
 # Download the pretrained weights
-model_url = "https://homes.cs.washington.edu/~cyulin/media/gnn_model/gcn_%s.torch"%(dataset)
-model_path = download_testdata(model_url, "gcn_%s.pickle"%(dataset), module='gcn_model')
+model_url = "https://homes.cs.washington.edu/~cyulin/media/gnn_model/gcn_%s.torch" % (dataset)
+model_path = download_testdata(model_url, "gcn_%s.pickle" % (dataset), module="gcn_model")
 
 # Load the weights into the model
 torch_model.load_state_dict(torch.load(model_path))
@@ -173,7 +164,7 @@ print("Test accuracy of DGL results: {:.2%}".format(acc))
 # Define Graph Convolution Layer in Relay
 # ---------------------------------------
 # To run a GCN on TVM, we first need to implement the Graph Convolution Layer.
-# You may refer to https://github.com/dmlc/dgl/blob/master/python/dgl/nn/mxnet/conv.py for a GraphConv Layer implemented in DGL with MXNet Backend
+# You may refer to https://github.com/dmlc/dgl/blob/master/python/dgl/nn/mxnet/conv/graphconv.py for a GraphConv Layer implemented in DGL with MXNet Backend
 #
 # The layer is defined with the operations below. Note that we apply two transposes to keep the adjacency matrix on the right-hand side of the sparse_dense operator;
 # this method is temporary and will be updated in the next few weeks, once we have sparse matrix transpose and support for a left-hand sparse operand.
@@ -188,14 +179,8 @@ from tvm.contrib import graph_runtime
 import tvm
 from tvm import te
 
-def GraphConv(layer_name,
-              input_dim,
-              output_dim,
-              adj,
-              input,
-              norm=None,
-              bias=True,
-              activation=None):
+
+def GraphConv(layer_name, input_dim, output_dim, adj, input, norm=None, bias=True, activation=None):
     """
     Parameters
     ----------
@@ -246,6 +231,7 @@ def GraphConv(layer_name,
         output_t = activation(output_t)
     return output_t
 
+
 ######################################################################
 # Prepare the parameters needed in the GraphConv layers
 # -----------------------------------------------------
@@ -253,29 +239,33 @@ def GraphConv(layer_name,
 import numpy as np
 import networkx as nx
 
+
 def prepare_params(g, data):
     params = {}
-    params['infeats'] = data.features.astype('float32') # Only support float32 as feature for now
+    params["infeats"] = data.features.astype("float32")  # Only support float32 as feature for now
 
     # Generate adjacency matrix
     adjacency = nx.to_scipy_sparse_matrix(g)
-    params['g_data'] = adjacency.data.astype('float32')
-    params['indices'] = adjacency.indices.astype('int32')
-    params['indptr'] = adjacency.indptr.astype('int32')
+    params["g_data"] = adjacency.data.astype("float32")
+    params["indices"] = adjacency.indices.astype("int32")
+    params["indptr"] = adjacency.indptr.astype("int32")
 
     # Normalization w.r.t. node degrees
     degs = [g.in_degree[i] for i in range(g.number_of_nodes())]
-    params['norm'] = np.power(degs, -0.5).astype('float32')
-    params['norm'] = params['norm'].reshape((params['norm'].shape[0], 1))
+    params["norm"] = np.power(degs, -0.5).astype("float32")
+    params["norm"] = params["norm"].reshape((params["norm"].shape[0], 1))
 
     return params
 
+
 params = prepare_params(g, data)
 
 # Check shape of features and the validity of adjacency matrix
-assert len(params['infeats'].shape) == 2
-assert params['g_data'] is not None and params['indices'] is not None and params['indptr'] is not None
-assert params['infeats'].shape[0] == params['indptr'].shape[0] - 1
+assert len(params["infeats"].shape) == 2
+assert (
+    params["g_data"] is not None and params["indices"] is not None and params["indptr"] is not None
+)
+assert params["infeats"].shape[0] == params["indptr"].shape[0] - 1
 
 ######################################################################
 # Put layers together
@@ -283,34 +273,38 @@ assert params['infeats'].shape[0] == params['indptr'].shape[0] - 1
 
 # Define input features, norms, adjacency matrix in Relay
 infeats = relay.var("infeats", shape=data.features.shape)
-norm = relay.Constant(tvm.nd.array(params['norm']))
-g_data = relay.Constant(tvm.nd.array(params['g_data']))
-indices = relay.Constant(tvm.nd.array(params['indices']))
-indptr = relay.Constant(tvm.nd.array(params['indptr']))
+norm = relay.Constant(tvm.nd.array(params["norm"]))
+g_data = relay.Constant(tvm.nd.array(params["g_data"]))
+indices = relay.Constant(tvm.nd.array(params["indices"]))
+indptr = relay.Constant(tvm.nd.array(params["indptr"]))
 
-Adjacency = namedtuple('Adjacency', ['data', 'indices', 'indptr'])
+Adjacency = namedtuple("Adjacency", ["data", "indices", "indptr"])
 adj = Adjacency(g_data, indices, indptr)
 
 # Construct the 2-layer GCN
 layers = []
-layers.append(GraphConv(
-    layer_name="layers.0",
-    input_dim=infeat_dim,
-    output_dim=num_hidden,
-    adj=adj,
-    input=infeats,
-    norm=norm,
-    activation=relay.nn.relu
-))
-layers.append(GraphConv(
-    layer_name="layers.1",
-    input_dim=num_hidden,
-    output_dim=num_classes,
-    adj=adj,
-    input=layers[-1],
-    norm=norm,
-    activation=None
-))
+layers.append(
+    GraphConv(
+        layer_name="layers.0",
+        input_dim=infeat_dim,
+        output_dim=num_hidden,
+        adj=adj,
+        input=infeats,
+        norm=norm,
+        activation=relay.nn.relu,
+    )
+)
+layers.append(
+    GraphConv(
+        layer_name="layers.1",
+        input_dim=num_hidden,
+        output_dim=num_classes,
+        adj=adj,
+        input=layers[-1],
+        norm=norm,
+        activation=None,
+    )
+)
 
 # Analyze free variables and generate Relay function
 output = layers[-1]
@@ -324,24 +318,24 @@ model_params = {}
 for param_tensor in torch_model.state_dict():
     model_params[param_tensor] = torch_model.state_dict()[param_tensor].numpy()
 
-for i in range(num_layers+1):
-    params["layers.%d.weight"%(i)] = model_params["layers.%d.weight"%(i)]
-    params["layers.%d.bias"%(i)] = model_params["layers.%d.bias"%(i)]
+for i in range(num_layers + 1):
+    params["layers.%d.weight" % (i)] = model_params["layers.%d.weight" % (i)]
+    params["layers.%d.bias" % (i)] = model_params["layers.%d.bias" % (i)]
 
 # Set the TVM build target
-target = 'llvm' # Currently only support `llvm` as target
+target = "llvm"  # Currently only support `llvm` as target
 
 func = relay.Function(relay.analysis.free_vars(output), output)
 func = relay.build_module.bind_params_by_name(func, params)
 mod = tvm.IRModule()
 mod["main"] = func
 # Build with Relay
-with tvm.transform.PassContext(opt_level=0): # Currently only support opt_level=0
+with tvm.transform.PassContext(opt_level=0):  # Currently only support opt_level=0
     lib = relay.build(mod, target, params=params)
 
 # Generate graph runtime
 ctx = tvm.context(target, 0)
-m = graph_runtime.GraphModule(lib['default'](ctx))
+m = graph_runtime.GraphModule(lib["default"](ctx))
 
 ######################################################################
 # Run the TVM model, test for accuracy and verify with DGL
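
A dense-numpy restatement of the propagation rule the GraphConv layer implements, H' = D^-1/2 * A * D^-1/2 * H * W (a sketch with illustrative shapes; the Relay version keeps A sparse):

import numpy as np

num_nodes, in_dim, out_dim = 5, 3, 2
A = (np.random.rand(num_nodes, num_nodes) > 0.5).astype("float32")  # adjacency
H = np.random.rand(num_nodes, in_dim).astype("float32")             # node features
W = np.random.rand(in_dim, out_dim).astype("float32")               # layer weight
norm = np.power(np.maximum(A.sum(axis=0), 1), -0.5).reshape(-1, 1)  # D^-1/2 per node

out = norm * (A @ (norm * (H @ W)))
assert out.shape == (num_nodes, out_dim)
# the two transposes in the Relay layer only reposition operands so the sparse
# adjacency sits where sparse_dense expects it; the math is the same as above.
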
diff --git a/docs/_downloads/72871483681951fd0400ddc905113f11/from_caffe2.py b/docs/_downloads/72871483681951fd0400ddc905113f11/from_caffe2.py
index 66ea0bb..4f6f647 100644
--- a/docs/_downloads/72871483681951fd0400ddc905113f11/from_caffe2.py
+++ b/docs/_downloads/72871483681951fd0400ddc905113f11/from_caffe2.py
@@ -41,13 +41,16 @@ https://caffe2.ai/docs/getting-started.html
 # ----------------------------
 # We load a pretrained resnet50 classification model provided by Caffe2.
 from caffe2.python.models.download import ModelDownloader
+
 mf = ModelDownloader()
 
+
 class Model:
     def __init__(self, model_name):
         self.init_net, self.predict_net, self.value_info = mf.get_c2_model(model_name)
 
-resnet50 = Model('resnet50')
+
+resnet50 = Model("resnet50")
 
 ######################################################################
 # Load a test image
@@ -57,19 +60,21 @@ from tvm.contrib.download import download_testdata
 from PIL import Image
 from matplotlib import pyplot as plt
 import numpy as np
-img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
-img_path = download_testdata(img_url, 'cat.png', module='data')
+
+img_url = "https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true"
+img_path = download_testdata(img_url, "cat.png", module="data")
 img = Image.open(img_path).resize((224, 224))
 plt.imshow(img)
 plt.show()
 # input preprocessing
 def transform_image(image):
-    image = np.array(image) - np.array([123., 117., 104.])
+    image = np.array(image) - np.array([123.0, 117.0, 104.0])
     image /= np.array([58.395, 57.12, 57.375])
     image = image.transpose((2, 0, 1))
-    image = image[np.newaxis, :].astype('float32')
+    image = image[np.newaxis, :].astype("float32")
     return image
 
+
 data = transform_image(img)
 
 ######################################################################
@@ -83,11 +88,14 @@ dtype_dict = {input_name: data.dtype}
 
 # parse Caffe2 model and convert into Relay computation graph
 from tvm import relay, transform
-mod, params = relay.frontend.from_caffe2(resnet50.init_net, resnet50.predict_net, shape_dict, dtype_dict)
+
+mod, params = relay.frontend.from_caffe2(
+    resnet50.init_net, resnet50.predict_net, shape_dict, dtype_dict
+)
 
 # compile the model
 # target x86 CPU
-target = 'llvm'
+target = "llvm"
 with transform.PassContext(opt_level=3):
     lib = relay.build(mod, target, params=params)
 
@@ -98,12 +106,13 @@ with transform.PassContext(opt_level=3):
 import tvm
 from tvm import te
 from tvm.contrib import graph_runtime
+
 # context x86 CPU, use tvm.gpu(0) if you run on GPU
 ctx = tvm.cpu(0)
 # create a runtime executor module
-m = graph_runtime.GraphModule(lib['default'](ctx))
+m = graph_runtime.GraphModule(lib["default"](ctx))
 # set inputs
-m.set_input(input_name, tvm.nd.array(data.astype('float32')))
+m.set_input(input_name, tvm.nd.array(data.astype("float32")))
 # execute
 m.run()
 # get outputs
@@ -115,17 +124,22 @@ top1_tvm = np.argmax(tvm_out.asnumpy()[0])
 # -------------------
 # Look up prediction top 1 index in 1000 class synset.
 from caffe2.python import workspace
-synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
-                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',
-                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',
-                      'imagenet1000_clsid_to_human.txt'])
-synset_name = 'imagenet1000_clsid_to_human.txt'
-synset_path = download_testdata(synset_url, synset_name, module='data')
+
+synset_url = "".join(
+    [
+        "https://gist.githubusercontent.com/zhreshold/",
+        "4d0b62f3d01426887599d4f7ede23ee5/raw/",
+        "596b27d23537e5a1b5751d2b0481ef172f58b539/",
+        "imagenet1000_clsid_to_human.txt",
+    ]
+)
+synset_name = "imagenet1000_clsid_to_human.txt"
+synset_path = download_testdata(synset_url, synset_name, module="data")
 with open(synset_path) as f:
     synset = eval(f.read())
-print('Relay top-1 id: {}, class name: {}'.format(top1_tvm, synset[top1_tvm]))
+print("Relay top-1 id: {}, class name: {}".format(top1_tvm, synset[top1_tvm]))
 # confirm correctness with caffe2 output
 p = workspace.Predictor(resnet50.init_net, resnet50.predict_net)
 caffe2_out = p.run({input_name: data})
 top1_caffe2 = np.argmax(caffe2_out)
-print('Caffe2 top-1 id: {}, class name: {}'.format(top1_caffe2, synset[top1_caffe2]))
+print("Caffe2 top-1 id: {}, class name: {}".format(top1_caffe2, synset[top1_caffe2]))
diff --git a/docs/_downloads/739deb9ab034a5315ce6ba6bf7e5ff44/tune_relay_cuda.ipynb b/docs/_downloads/739deb9ab034a5315ce6ba6bf7e5ff44/tune_relay_cuda.ipynb
index 2ddcec4..59311bb 100644
--- a/docs/_downloads/739deb9ab034a5315ce6ba6bf7e5ff44/tune_relay_cuda.ipynb
+++ b/docs/_downloads/739deb9ab034a5315ce6ba6bf7e5ff44/tune_relay_cuda.ipynb
@@ -51,7 +51,7 @@
       },
       "outputs": [],
       "source": [
-        "def get_network(name, batch_size):\n    \"\"\"Get the symbol definition and random weight of a network\"\"\"\n    input_shape = (batch_size, 3, 224, 224)\n    output_shape = (batch_size, 1000)\n\n    if \"resnet\" in name:\n        n_layer = int(name.split('-')[1])\n        mod, params = relay.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)\n    elif \"vgg\" in name:\n        n_layer = int(name.split('-')[1])\n        mod, params = relay.testi [...]
+        "def get_network(name, batch_size):\n    \"\"\"Get the symbol definition and random weight of a network\"\"\"\n    input_shape = (batch_size, 3, 224, 224)\n    output_shape = (batch_size, 1000)\n\n    if \"resnet\" in name:\n        n_layer = int(name.split(\"-\")[1])\n        mod, params = relay.testing.resnet.get_workload(\n            num_layers=n_layer, batch_size=batch_size, dtype=dtype\n        )\n    elif \"vgg\" in name:\n        n_layer = int(name.split(\"-\")[1])\n      [...]
       ]
     },
     {
@@ -69,7 +69,7 @@
       },
       "outputs": [],
       "source": [
-        "#### DEVICE CONFIG ####\ntarget = tvm.target.cuda()\n\n#### TUNING OPTION ####\nnetwork = 'resnet-18'\nlog_file = \"%s.log\" % network\ndtype = 'float32'\n\ntuning_option = {\n    'log_filename': log_file,\n\n    'tuner': 'xgb',\n    'n_trial': 2000,\n    'early_stopping': 600,\n\n    'measure_option': autotvm.measure_option(\n        builder=autotvm.LocalBuilder(timeout=10),\n        runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150),\n    ),\n}"
+        "#### DEVICE CONFIG ####\ntarget = tvm.target.cuda()\n\n#### TUNING OPTION ####\nnetwork = \"resnet-18\"\nlog_file = \"%s.log\" % network\ndtype = \"float32\"\n\ntuning_option = {\n    \"log_filename\": log_file,\n    \"tuner\": \"xgb\",\n    \"n_trial\": 2000,\n    \"early_stopping\": 600,\n    \"measure_option\": autotvm.measure_option(\n        builder=autotvm.LocalBuilder(timeout=10),\n        runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150),\n    ),\n}"
       ]
     },
     {
@@ -94,7 +94,7 @@
       },
       "outputs": [],
       "source": [
-        "# You can skip the implementation of this function for this tutorial.\ndef tune_tasks(tasks,\n               measure_option,\n               tuner='xgb',\n               n_trial=1000,\n               early_stopping=None,\n               log_filename='tuning.log',\n               use_transfer_learning=True):\n    # create tmp log file\n    tmp_log_file = log_filename + \".tmp\"\n    if os.path.exists(tmp_log_file):\n        os.remove(tmp_log_file)\n\n    for i, tsk in enumerate(r [...]
+        "# You can skip the implementation of this function for this tutorial.\ndef tune_tasks(\n    tasks,\n    measure_option,\n    tuner=\"xgb\",\n    n_trial=1000,\n    early_stopping=None,\n    log_filename=\"tuning.log\",\n    use_transfer_learning=True,\n):\n    # create tmp log file\n    tmp_log_file = log_filename + \".tmp\"\n    if os.path.exists(tmp_log_file):\n        os.remove(tmp_log_file)\n\n    for i, tsk in enumerate(reversed(tasks)):\n        prefix = \"[Task %2d/%2d] \ [...]
       ]
     },
     {
@@ -112,7 +112,7 @@
       },
       "outputs": [],
       "source": [
-        "def tune_and_evaluate(tuning_opt):\n    # extract workloads from relay program\n    print(\"Extract tasks...\")\n    mod, params, input_shape, out_shape = get_network(network, batch_size=1)\n    tasks = autotvm.task.extract_from_program(mod[\"main\"], target=target,\n                                              params=params,\n                                              ops=(relay.op.get(\"nn.conv2d\"),))\n\n    # run tuning tasks\n    print(\"Tuning...\")\n    tune_tasks(tas [...]
+        "def tune_and_evaluate(tuning_opt):\n    # extract workloads from relay program\n    print(\"Extract tasks...\")\n    mod, params, input_shape, out_shape = get_network(network, batch_size=1)\n    tasks = autotvm.task.extract_from_program(\n        mod[\"main\"], target=target, params=params, ops=(relay.op.get(\"nn.conv2d\"),)\n    )\n\n    # run tuning tasks\n    print(\"Tuning...\")\n    tune_tasks(tasks, **tuning_opt)\n\n    # compile kernels with history best records\n    with [...]
       ]
     },
     {
@@ -144,7 +144,7 @@
       },
       "outputs": [],
       "source": [
-        "tuning_option = {\n    'log_filename': log_file,\n\n    'tuner': 'xgb',\n    'n_trial': 2000,\n    'early_stopping': 600,\n\n    'measure_option': autotvm.measure_option(\n        builder=autotvm.LocalBuilder(timeout=10),\n        runner=autotvm.RPCRunner(\n            '1080ti',  # change the device key to your key\n            '0.0.0.0', 9190,\n            number=20, repeat=3, timeout=4, min_repeat_ms=150),\n    ),\n}"
+        "tuning_option = {\n    \"log_filename\": log_file,\n    \"tuner\": \"xgb\",\n    \"n_trial\": 2000,\n    \"early_stopping\": 600,\n    \"measure_option\": autotvm.measure_option(\n        builder=autotvm.LocalBuilder(timeout=10),\n        runner=autotvm.RPCRunner(\n            \"1080ti\",  # change the device key to your key\n            \"0.0.0.0\",\n            9190,\n            number=20,\n            repeat=3,\n            timeout=4,\n            min_repeat_ms=150,\n        [...]
       ]
     }
   ],
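
A summary of what the main knobs in tuning_option control:

# n_trial        - maximum number of schedule configurations measured per task
# early_stopping - abandon a task if no better config appears for this many trials
# LocalBuilder(timeout=10)
#                - compiles candidate kernels locally, killing builds after 10 s
# LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150)
#                - each measurement runs the kernel 20 times, repeated 3 times,
#                  padding runs so one repeat lasts at least 150 ms
# RPCRunner("1080ti", "0.0.0.0", 9190, ...)
#                - the same measurements, dispatched to devices registered under
#                  key "1080ti" at the RPC tracker listening on port 9190
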
diff --git a/docs/_downloads/78da213eae381b8ff94cc356ee7c5423/deploy_prequantized.ipynb b/docs/_downloads/78da213eae381b8ff94cc356ee7c5423/deploy_prequantized.ipynb
index 0244980..c508906 100644
--- a/docs/_downloads/78da213eae381b8ff94cc356ee7c5423/deploy_prequantized.ipynb
+++ b/docs/_downloads/78da213eae381b8ff94cc356ee7c5423/deploy_prequantized.ipynb
@@ -51,7 +51,7 @@
       },
       "outputs": [],
       "source": [
-        "def get_transform():\n    import torchvision.transforms as transforms\n    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],\n                                     std=[0.229, 0.224, 0.225])\n    return transforms.Compose([\n            transforms.Resize(256),\n            transforms.CenterCrop(224),\n            transforms.ToTensor(),\n            normalize,\n        ])\n\n\ndef get_real_image(im_height, im_width):\n    img_url = 'https://github.com/dmlc/mxnet.js/blob [...]
+        "def get_transform():\n    import torchvision.transforms as transforms\n\n    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])\n    return transforms.Compose(\n        [\n            transforms.Resize(256),\n            transforms.CenterCrop(224),\n            transforms.ToTensor(),\n            normalize,\n        ]\n    )\n\n\ndef get_real_image(im_height, im_width):\n    img_url = \"https://github.com/dmlc/mxnet.js/blob/master/data/cat.pn [...]
       ]
     },
     {
@@ -105,7 +105,7 @@
       },
       "outputs": [],
       "source": [
-        "def quantize_model(model, inp):\n    model.fuse_model()\n    model.qconfig = torch.quantization.get_default_qconfig('fbgemm')\n    torch.quantization.prepare(model, inplace=True)\n    # Dummy calibration\n    model(inp)\n    torch.quantization.convert(model, inplace=True)"
+        "def quantize_model(model, inp):\n    model.fuse_model()\n    model.qconfig = torch.quantization.get_default_qconfig(\"fbgemm\")\n    torch.quantization.prepare(model, inplace=True)\n    # Dummy calibration\n    model(inp)\n    torch.quantization.convert(model, inplace=True)"
       ]
     },
     {
@@ -231,7 +231,7 @@
       },
       "outputs": [],
       "source": [
-        "n_repeat = 100  # should be bigger to make the measurement more accurate\nctx = tvm.cpu(0)\nftimer = rt_mod.module.time_evaluator(\"run\", ctx, number=1,\n                                      repeat=n_repeat)\nprof_res = np.array(ftimer().results) * 1e3\nprint(\"Elapsed average ms:\", np.mean(prof_res))"
+        "n_repeat = 100  # should be bigger to make the measurement more accurate\nctx = tvm.cpu(0)\nftimer = rt_mod.module.time_evaluator(\"run\", ctx, number=1, repeat=n_repeat)\nprof_res = np.array(ftimer().results) * 1e3\nprint(\"Elapsed average ms:\", np.mean(prof_res))"
       ]
     },
     {
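
One unit detail worth noting in the benchmark cell above: time_evaluator returns per-repeat mean times in seconds, so the * 1e3 converts to milliseconds. The same pattern with a standard deviation report (assuming the notebook's rt_mod and ctx):

ftimer = rt_mod.module.time_evaluator("run", ctx, number=1, repeat=100)
ms = np.array(ftimer().results) * 1e3  # seconds -> milliseconds
print("mean %.2f ms, std %.2f ms" % (ms.mean(), ms.std()))
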
diff --git a/docs/_downloads/7ece74acc230c7d55086182cc8884b09/extern_op.py b/docs/_downloads/7ece74acc230c7d55086182cc8884b09/extern_op.py
index 64e9880..454237a 100644
--- a/docs/_downloads/7ece74acc230c7d55086182cc8884b09/extern_op.py
+++ b/docs/_downloads/7ece74acc230c7d55086182cc8884b09/extern_op.py
@@ -36,6 +36,9 @@ from tvm import te
 import numpy as np
 from tvm.contrib import cblas
 
+if not tvm.get_global_func("tvm.contrib.cblas.matmul", allow_missing=True):
+    raise Exception("Not compiled with cblas support; can't build this tutorial")
+
 ######################################################################
 # Use Extern Tensor Function
 # --------------------------
@@ -54,14 +57,18 @@ from tvm.contrib import cblas
 n = 1024
 l = 128
 m = 235
-bias = te.var('bias', dtype="float32")
-A = te.placeholder((n, l), name='A')
-B = te.placeholder((l, m), name='B')
-C = te.extern((n, m), [A, B],
-               lambda ins, outs: tvm.tir.call_packed(
-                   "tvm.contrib.cblas.matmul",
-                   ins[0], ins[1], outs[0], False, False), name="C")
-D = te.compute(C.shape, lambda i, j: C[i,j] + bias, name="D")
+bias = te.var("bias", dtype="float32")
+A = te.placeholder((n, l), name="A")
+B = te.placeholder((l, m), name="B")
+C = te.extern(
+    (n, m),
+    [A, B],
+    lambda ins, outs: tvm.tir.call_packed(
+        "tvm.contrib.cblas.matmul", ins[0], ins[1], outs[0], False, False
+    ),
+    name="C",
+)
+D = te.compute(C.shape, lambda i, j: C[i, j] + bias, name="D")
 s = te.create_schedule(D.op)
 
 ######################################################################
@@ -76,8 +83,7 @@ b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
 d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
 bb = 10.0
 f(a, b, d, bb)
-tvm.testing.assert_allclose(
-    d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 10, rtol=1e-5)
+tvm.testing.assert_allclose(d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 10, rtol=1e-5)
 
 ######################################################################
 # Extern Contrib Wrappers
@@ -86,8 +92,9 @@ tvm.testing.assert_allclose(
 # the following line is equivalent to the previous example.
 #
 from tvm.contrib import cblas
+
 C = cblas.matmul(A, B)
-D = te.compute(C.shape, lambda i, j: C[i,j] + bias, name="D")
+D = te.compute(C.shape, lambda i, j: C[i, j] + bias, name="D")
 s = te.create_schedule(D.op)
 
 ######################################################################
@@ -107,9 +114,14 @@ def my_tvm_addone(x, y):
     print("my_tvm_addone signatures: %s, %s" % (type(x), type(y)))
     tvm.nd.array(x.asnumpy() + 1).copyto(y)
 
-A = te.placeholder((n,), name='A')
-B = te.extern(A.shape, [A], lambda ins, outs: tvm.tir.call_packed(
-    "tvm.contrib.my_tvm_addone", ins[0], outs[0]), name="C")
+
+A = te.placeholder((n,), name="A")
+B = te.extern(
+    A.shape,
+    [A],
+    lambda ins, outs: tvm.tir.call_packed("tvm.contrib.my_tvm_addone", ins[0], outs[0]),
+    name="C",
+)
 s = te.create_schedule(B.op)
 f = tvm.build(s, [A, B], "llvm")
 a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
diff --git a/docs/_downloads/8246644805c8dfcb0b33ca356cc1fafc/deploy_ssd_gluoncv.ipynb b/docs/_downloads/8246644805c8dfcb0b33ca356cc1fafc/deploy_ssd_gluoncv.ipynb
index 00b35bf..17b24fd 100644
--- a/docs/_downloads/8246644805c8dfcb0b33ca356cc1fafc/deploy_ssd_gluoncv.ipynb
+++ b/docs/_downloads/8246644805c8dfcb0b33ca356cc1fafc/deploy_ssd_gluoncv.ipynb
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "supported_model = [\n    'ssd_512_resnet50_v1_voc',\n    'ssd_512_resnet50_v1_coco',\n    'ssd_512_resnet101_v2_voc',\n    'ssd_512_mobilenet1.0_voc',\n    'ssd_512_mobilenet1.0_coco',\n    'ssd_300_vgg16_atrous_voc'\n    'ssd_512_vgg16_atrous_coco',\n]\n\nmodel_name = supported_model[0]\ndshape = (1, 3, 512, 512)"
+        "supported_model = [\n    \"ssd_512_resnet50_v1_voc\",\n    \"ssd_512_resnet50_v1_coco\",\n    \"ssd_512_resnet101_v2_voc\",\n    \"ssd_512_mobilenet1.0_voc\",\n    \"ssd_512_mobilenet1.0_coco\",\n    \"ssd_300_vgg16_atrous_voc\" \"ssd_512_vgg16_atrous_coco\",\n]\n\nmodel_name = supported_model[0]\ndshape = (1, 3, 512, 512)"
       ]
     },
     {
@@ -62,7 +62,7 @@
       },
       "outputs": [],
       "source": [
-        "im_fname = download_testdata('https://github.com/dmlc/web-data/blob/master/' +\n                             'gluoncv/detection/street_small.jpg?raw=true',\n                             'street_small.jpg', module='data')\nx, img = data.transforms.presets.ssd.load_test(im_fname, short=512)"
+        "im_fname = download_testdata(\n    \"https://github.com/dmlc/web-data/blob/master/\" + \"gluoncv/detection/street_small.jpg?raw=true\",\n    \"street_small.jpg\",\n    module=\"data\",\n)\nx, img = data.transforms.presets.ssd.load_test(im_fname, short=512)"
       ]
     },
     {
@@ -80,7 +80,7 @@
       },
       "outputs": [],
       "source": [
-        "block = model_zoo.get_model(model_name, pretrained=True)\n\ndef build(target):\n    mod, params = relay.frontend.from_mxnet(block, {\"data\": dshape})\n    with tvm.transform.PassContext(opt_level=3):\n        lib = relay.build(mod, target, params=params)\n    return lib"
+        "block = model_zoo.get_model(model_name, pretrained=True)\n\n\ndef build(target):\n    mod, params = relay.frontend.from_mxnet(block, {\"data\": dshape})\n    with tvm.transform.PassContext(opt_level=3):\n        lib = relay.build(mod, target, params=params)\n    return lib"
       ]
     },
     {
@@ -98,7 +98,7 @@
       },
       "outputs": [],
       "source": [
-        "def run(lib, ctx):\n    # Build TVM runtime\n    m = graph_runtime.GraphModule(lib['default'](ctx))\n    tvm_input = tvm.nd.array(x.asnumpy(), ctx=ctx)\n    m.set_input('data', tvm_input)\n    # execute\n    m.run()\n    # get outputs\n    class_IDs, scores, bounding_boxs = m.get_output(0), m.get_output(1), m.get_output(2)\n    return class_IDs, scores, bounding_boxs\n\nfor target in [\"llvm\", \"cuda\"]:\n    ctx = tvm.context(target, 0)\n    if ctx.exist:\n        lib = build( [...]
+        "def run(lib, ctx):\n    # Build TVM runtime\n    m = graph_runtime.GraphModule(lib[\"default\"](ctx))\n    tvm_input = tvm.nd.array(x.asnumpy(), ctx=ctx)\n    m.set_input(\"data\", tvm_input)\n    # execute\n    m.run()\n    # get outputs\n    class_IDs, scores, bounding_boxs = m.get_output(0), m.get_output(1), m.get_output(2)\n    return class_IDs, scores, bounding_boxs\n\n\nfor target in [\"llvm\", \"cuda\"]:\n    ctx = tvm.context(target, 0)\n    if ctx.exist:\n        lib =  [...]
       ]
     },
     {
@@ -116,7 +116,7 @@
       },
       "outputs": [],
       "source": [
-        "ax = utils.viz.plot_bbox(img, bounding_boxs.asnumpy()[0], scores.asnumpy()[0],\n                         class_IDs.asnumpy()[0], class_names=block.classes)\nplt.show()"
+        "ax = utils.viz.plot_bbox(\n    img,\n    bounding_boxs.asnumpy()[0],\n    scores.asnumpy()[0],\n    class_IDs.asnumpy()[0],\n    class_names=block.classes,\n)\nplt.show()"
       ]
     }
   ],
diff --git a/docs/_downloads/835a4def1e256b7a1f711621fc031418/from_darknet.ipynb b/docs/_downloads/835a4def1e256b7a1f711621fc031418/from_darknet.ipynb
index 9353b28..8e154ef 100644
--- a/docs/_downloads/835a4def1e256b7a1f711621fc031418/from_darknet.ipynb
+++ b/docs/_downloads/835a4def1e256b7a1f711621fc031418/from_darknet.ipynb
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "# Model name\nMODEL_NAME = 'yolov3'"
+        "# Model name\nMODEL_NAME = \"yolov3\""
       ]
     },
     {
@@ -62,7 +62,7 @@
       },
       "outputs": [],
       "source": [
-        "CFG_NAME = MODEL_NAME + '.cfg'\nWEIGHTS_NAME = MODEL_NAME + '.weights'\nREPO_URL = 'https://github.com/dmlc/web-data/blob/master/darknet/'\nCFG_URL = REPO_URL + 'cfg/' + CFG_NAME + '?raw=true'\nWEIGHTS_URL = 'https://pjreddie.com/media/files/' + WEIGHTS_NAME\n\ncfg_path = download_testdata(CFG_URL, CFG_NAME, module=\"darknet\")\nweights_path = download_testdata(WEIGHTS_URL, WEIGHTS_NAME, module=\"darknet\")\n\n# Download and Load darknet library\nif sys.platform in ['linux', 'li [...]
+        "CFG_NAME = MODEL_NAME + \".cfg\"\nWEIGHTS_NAME = MODEL_NAME + \".weights\"\nREPO_URL = \"https://github.com/dmlc/web-data/blob/master/darknet/\"\nCFG_URL = REPO_URL + \"cfg/\" + CFG_NAME + \"?raw=true\"\nWEIGHTS_URL = \"https://pjreddie.com/media/files/\" + WEIGHTS_NAME\n\ncfg_path = download_testdata(CFG_URL, CFG_NAME, module=\"darknet\")\nweights_path = download_testdata(WEIGHTS_URL, WEIGHTS_NAME, module=\"darknet\")\n\n# Download and Load darknet library\nif sys.platform in [ [...]
       ]
     },
     {
@@ -80,7 +80,7 @@
       },
       "outputs": [],
       "source": [
-        "target = 'llvm'\ntarget_host = 'llvm'\nctx = tvm.cpu(0)\ndata = np.empty([batch_size, net.c, net.h, net.w], dtype)\nshape = {'data': data.shape}\nprint(\"Compiling the model...\")\nwith tvm.transform.PassContext(opt_level=3):\n    lib = relay.build(mod, target=target, target_host=target_host, params=params)\n\n[neth, netw] = shape['data'][2:] # Current image shape is 608x608"
+        "target = \"llvm\"\ntarget_host = \"llvm\"\nctx = tvm.cpu(0)\ndata = np.empty([batch_size, net.c, net.h, net.w], dtype)\nshape = {\"data\": data.shape}\nprint(\"Compiling the model...\")\nwith tvm.transform.PassContext(opt_level=3):\n    lib = relay.build(mod, target=target, target_host=target_host, params=params)\n\n[neth, netw] = shape[\"data\"][2:]  # Current image shape is 608x608"
       ]
     },
     {
@@ -98,7 +98,7 @@
       },
       "outputs": [],
       "source": [
-        "test_image = 'dog.jpg'\nprint(\"Loading the test image...\")\nimg_url = REPO_URL + 'data/' + test_image + '?raw=true'\nimg_path = download_testdata(img_url, test_image, \"data\")\n\ndata = tvm.relay.testing.darknet.load_image(img_path, netw, neth)"
+        "test_image = \"dog.jpg\"\nprint(\"Loading the test image...\")\nimg_url = REPO_URL + \"data/\" + test_image + \"?raw=true\"\nimg_path = download_testdata(img_url, test_image, \"data\")\n\ndata = tvm.relay.testing.darknet.load_image(img_path, netw, neth)"
       ]
     },
     {
@@ -116,7 +116,7 @@
       },
       "outputs": [],
       "source": [
-        "from tvm.contrib import graph_runtime\n\nm = graph_runtime.GraphModule(lib['default'](ctx))\n\n# set inputs\nm.set_input('data', tvm.nd.array(data.astype(dtype)))\n# execute\nprint(\"Running the test image...\")\n\n# detection\n# thresholds\nthresh = 0.5\nnms_thresh = 0.45\n\nm.run()\n# get outputs\ntvm_out = []\nif MODEL_NAME == 'yolov2':\n    layer_out = {}\n    layer_out['type'] = 'Region'\n    # Get the region layer attributes (n, out_c, out_h, out_w, classes, coords, backgr [...]
+        "from tvm.contrib import graph_runtime\n\nm = graph_runtime.GraphModule(lib[\"default\"](ctx))\n\n# set inputs\nm.set_input(\"data\", tvm.nd.array(data.astype(dtype)))\n# execute\nprint(\"Running the test image...\")\n\n# detection\n# thresholds\nthresh = 0.5\nnms_thresh = 0.45\n\nm.run()\n# get outputs\ntvm_out = []\nif MODEL_NAME == \"yolov2\":\n    layer_out = {}\n    layer_out[\"type\"] = \"Region\"\n    # Get the region layer attributes (n, out_c, out_h, out_w, classes, coor [...]
       ]
     }
   ],
diff --git a/docs/_downloads/836dc3852acf09662e9eb37c4c5e1e1b/opt_gemm.py b/docs/_downloads/836dc3852acf09662e9eb37c4c5e1e1b/opt_gemm.py
index daca89b..ead6660 100644
--- a/docs/_downloads/836dc3852acf09662e9eb37c4c5e1e1b/opt_gemm.py
+++ b/docs/_downloads/836dc3852acf09662e9eb37c4c5e1e1b/opt_gemm.py
@@ -56,6 +56,7 @@ Intel i7-4770HQ CPU. The cache line size should be 64 bytes for all the x86 CPUs
 # Then we write a baseline implementation, the simplest way to write a matrix multiplication in TVM.
 
 import tvm
+import tvm.testing
 from tvm import te
 import numpy
 import timeit
@@ -73,7 +74,7 @@ dtype = "float32"
 # using Intel AVX2 (Advanced Vector Extensions) ISA for SIMD
 # To get the best performance, please change the following line
 # to llvm -mcpu=core-avx2, or the specific type of CPU you use
-target = 'llvm'
+target = "llvm"
 ctx = tvm.context(target, 0)
 
 # Random generated tensor for testing
@@ -81,31 +82,30 @@ a = tvm.nd.array(numpy.random.rand(M, K).astype(dtype), ctx)
 b = tvm.nd.array(numpy.random.rand(K, N).astype(dtype), ctx)
 
 np_repeat = 100
-np_runing_time = timeit.timeit(setup='import numpy\n'
-                                     'M = ' + str(M) + '\n'
-                                     'K = ' + str(K) + '\n'
-                                     'N = ' + str(N) + '\n'
-                                     'dtype = "float32"\n'
-                                     'a = numpy.random.rand(M, K).astype(dtype)\n'
-                                     'b = numpy.random.rand(K, N).astype(dtype)\n',
-                               stmt='answer = numpy.dot(a, b)',
-                               number=np_repeat)
+np_running_time = timeit.timeit(
+    setup="import numpy\n"
+    "M = " + str(M) + "\n"
+    "K = " + str(K) + "\n"
+    "N = " + str(N) + "\n"
+    'dtype = "float32"\n'
+    "a = numpy.random.rand(M, K).astype(dtype)\n"
+    "b = numpy.random.rand(K, N).astype(dtype)\n",
+    stmt="answer = numpy.dot(a, b)",
+    number=np_repeat,
+)
 print("Numpy running time: %f" % (np_runing_time / np_repeat))
 
 answer = numpy.dot(a.asnumpy(), b.asnumpy())
 
 # Algorithm
-k = te.reduce_axis((0, K), 'k')
-A = te.placeholder((M, K), name='A')
-B = te.placeholder((K, N), name='B')
-C = te.compute(
-           (M, N),
-           lambda x, y: te.sum(A[x, k] * B[k, y], axis=k),
-           name='C')
+k = te.reduce_axis((0, K), "k")
+A = te.placeholder((M, K), name="A")
+B = te.placeholder((K, N), name="B")
+C = te.compute((M, N), lambda x, y: te.sum(A[x, k] * B[k, y], axis=k), name="C")
 
 # Default schedule
 s = te.create_schedule(C.op)
-func = tvm.build(s, [A, B, C], target=target, name='mmult')
+func = tvm.build(s, [A, B, C], target=target, name="mmult")
 assert func
 
 c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
@@ -113,7 +113,7 @@ func(a, b, c)
 tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 evaluator = func.time_evaluator(func.entry_name, ctx, number=1)
-print('Baseline: %f' % evaluator(a, b, c).mean)
+print("Baseline: %f" % evaluator(a, b, c).mean)
 
 ################################################################################################
 # In TVM, we can always inspect lower level IR to debug or optimize our schedule.
@@ -134,23 +134,23 @@ s = te.create_schedule(C.op)
 
 # Blocking by loop tiling
 xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-k, = s[C].op.reduce_axis
+(k,) = s[C].op.reduce_axis
 ko, ki = s[C].split(k, factor=4)
 
 # Hoist reduction domain outside the blocking loop
 s[C].reorder(xo, yo, ko, ki, xi, yi)
 
-func = tvm.build(s, [A, B, C], target=target, name='mmult')
+func = tvm.build(s, [A, B, C], target=target, name="mmult")
 assert func
 
-c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
+c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
 func(a, b, c)
 tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 # By simply tiling the loop 32x32, and hoisting ko, ki outside the blocking loops,
 # we can see a big speedup compared with the baseline.
 evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
-print('Opt1: %f' % evaluator(a, b, c).mean)
+print("Opt1: %f" % evaluator(a, b, c).mean)
 
 ################################################################################################
 # Here is the generated IR after blocking.
@@ -168,7 +168,7 @@ print(tvm.lower(s, [A, B, C], simple_mode=True))
 
 s = te.create_schedule(C.op)
 xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-k, = s[C].op.reduce_axis
+(k,) = s[C].op.reduce_axis
 ko, ki = s[C].split(k, factor=4)
 
 s[C].reorder(xo, yo, ko, ki, xi, yi)
@@ -176,15 +176,15 @@ s[C].reorder(xo, yo, ko, ki, xi, yi)
 # Vectorization
 s[C].vectorize(yi)
 
-func = tvm.build(s, [A, B, C], target=target, name='mmult')
+func = tvm.build(s, [A, B, C], target=target, name="mmult")
 assert func
 
-c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
+c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
 func(a, b, c)
 tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
-print('Opt2: %f' % evaluator(a, b, c).mean)
+print("Opt2: %f" % evaluator(a, b, c).mean)
 
 ################################################################################################
 # Here is the generated IR after vectorization.
@@ -202,22 +202,22 @@ print(tvm.lower(s, [A, B, C], simple_mode=True))
 
 s = te.create_schedule(C.op)
 xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-k, = s[C].op.reduce_axis
+(k,) = s[C].op.reduce_axis
 ko, ki = s[C].split(k, factor=4)
 
 # re-ordering
 s[C].reorder(xo, yo, ko, xi, ki, yi)
 s[C].vectorize(yi)
 
-func = tvm.build(s, [A, B, C], target=target, name='mmult')
+func = tvm.build(s, [A, B, C], target=target, name="mmult")
 assert func
 
-c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
+c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
 func(a, b, c)
 tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
-print('Opt3: %f' % evaluator(a, b, c).mean)
+print("Opt3: %f" % evaluator(a, b, c).mean)
 
 ################################################################################################
 # Here is the generated IR after loop permutation.
@@ -245,15 +245,17 @@ print(tvm.lower(s, [A, B, C], simple_mode=True))
 #
 
 # We have to re-write the algorithm slightly.
-packedB = te.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name='packedB')
-C = te.compute((M, N),
-                lambda x, y: te.sum(A[x, k] * packedB[y // bn, k, tvm.tir.indexmod(y, bn)], axis=k),
-                name = 'C')
+packedB = te.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name="packedB")
+C = te.compute(
+    (M, N),
+    lambda x, y: te.sum(A[x, k] * packedB[y // bn, k, tvm.tir.indexmod(y, bn)], axis=k),
+    name="C",
+)
 
 s = te.create_schedule(C.op)
 
 xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-k, = s[C].op.reduce_axis
+(k,) = s[C].op.reduce_axis
 ko, ki = s[C].split(k, factor=4)
 
 s[C].reorder(xo, yo, ko, xi, ki, yi)
@@ -263,15 +265,15 @@ x, y, z = s[packedB].op.axis
 s[packedB].vectorize(z)
 s[packedB].parallel(x)
 
-func = tvm.build(s, [A, B, C], target=target, name='mmult')
+func = tvm.build(s, [A, B, C], target=target, name="mmult")
 assert func
 
-c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
+c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
 func(a, b, c)
 tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
-print('Opt4: %f' % evaluator(a, b, c).mean)
+print("Opt4: %f" % evaluator(a, b, c).mean)
 
 ################################################################################################
 # Here is the generated IR after array packing.
@@ -289,7 +291,7 @@ print(tvm.lower(s, [A, B, C], simple_mode=True))
 s = te.create_schedule(C.op)
 
 # Allocate write cache
-CC = s.cache_write(C, 'global')
+CC = s.cache_write(C, "global")
 
 xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
 
@@ -299,7 +301,7 @@ s[CC].compute_at(s[C], yo)
 # New inner axes
 xc, yc = s[CC].op.axis
 
-k, = s[CC].op.reduce_axis
+(k,) = s[CC].op.reduce_axis
 ko, ki = s[CC].split(k, factor=4)
 s[CC].reorder(ko, xc, ki, yc)
 s[CC].unroll(ki)
@@ -309,15 +311,15 @@ x, y, z = s[packedB].op.axis
 s[packedB].vectorize(z)
 s[packedB].parallel(x)
 
-func = tvm.build(s, [A, B, C], target=target, name='mmult')
+func = tvm.build(s, [A, B, C], target=target, name="mmult")
 assert func
 
-c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
+c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
 func(a, b, c)
 tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
-print('Opt5: %f' % evaluator(a, b, c).mean)
+print("Opt5: %f" % evaluator(a, b, c).mean)
 
 ################################################################################################
 # Here is the generated IR after blocking.
@@ -331,7 +333,7 @@ print(tvm.lower(s, [A, B, C], simple_mode=True))
 
 s = te.create_schedule(C.op)
 
-CC = s.cache_write(C, 'global')
+CC = s.cache_write(C, "global")
 
 xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
 
@@ -339,7 +341,7 @@ s[CC].compute_at(s[C], yo)
 
 xc, yc = s[CC].op.axis
 
-k, = s[CC].op.reduce_axis
+(k,) = s[CC].op.reduce_axis
 ko, ki = s[CC].split(k, factor=4)
 s[CC].reorder(ko, xc, ki, yc)
 s[CC].unroll(ki)
@@ -352,16 +354,16 @@ x, y, z = s[packedB].op.axis
 s[packedB].vectorize(z)
 s[packedB].parallel(x)
 
-func = tvm.build(s, [A, B, C], target=target, name = 'mmult')
+func = tvm.build(s, [A, B, C], target=target, name="mmult")
 assert func
 
-c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
+c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
 func(a, b, c)
 tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 evaluator = func.time_evaluator(func.entry_name, ctx, number=50)
 opt6_time = evaluator(a, b, c).mean
-print('Opt6: %f' % opt6_time)
+print("Opt6: %f" % opt6_time)
 
 ################################################################################################
 # Here is the generated IR after parallelization.
diff --git a/docs/_downloads/83dedc6352b4016772e17480ef01345d/deploy_model_on_rasp.py b/docs/_downloads/83dedc6352b4016772e17480ef01345d/deploy_model_on_rasp.py
index c9174ad..c6e2d8f 100644
--- a/docs/_downloads/83dedc6352b4016772e17480ef01345d/deploy_model_on_rasp.py
+++ b/docs/_downloads/83dedc6352b4016772e17480ef01345d/deploy_model_on_rasp.py
@@ -104,34 +104,40 @@ from PIL import Image
 import numpy as np
 
 # one line to get the model
-block = get_model('resnet18_v1', pretrained=True)
+block = get_model("resnet18_v1", pretrained=True)
 
 ######################################################################
 # In order to test our model, here we download an image of a cat and
 # transform its format.
-img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
-img_name = 'cat.png'
-img_path = download_testdata(img_url, img_name, module='data')
+img_url = "https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true"
+img_name = "cat.png"
+img_path = download_testdata(img_url, img_name, module="data")
 image = Image.open(img_path).resize((224, 224))
 
+
 def transform_image(image):
-    image = np.array(image) - np.array([123., 117., 104.])
+    image = np.array(image) - np.array([123.0, 117.0, 104.0])
     image /= np.array([58.395, 57.12, 57.375])
     image = image.transpose((2, 0, 1))
     image = image[np.newaxis, :]
     return image
 
+
 x = transform_image(image)
 
 ######################################################################
 # synset is used to transform the label from the number of an ImageNet class to
 # a word humans can understand.
-synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
-                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',
-                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',
-                      'imagenet1000_clsid_to_human.txt'])
-synset_name = 'imagenet1000_clsid_to_human.txt'
-synset_path = download_testdata(synset_url, synset_name, module='data')
+synset_url = "".join(
+    [
+        "https://gist.githubusercontent.com/zhreshold/",
+        "4d0b62f3d01426887599d4f7ede23ee5/raw/",
+        "596b27d23537e5a1b5751d2b0481ef172f58b539/",
+        "imagenet1000_clsid_to_human.txt",
+    ]
+)
+synset_name = "imagenet1000_clsid_to_human.txt"
+synset_path = download_testdata(synset_url, synset_name, module="data")
 with open(synset_path) as f:
     synset = eval(f.read())
 
@@ -140,7 +146,7 @@ with open(synset_path) as f:
 # It's as easy as several lines.
 
 # We support MXNet static graph (symbol) and HybridBlock in mxnet.gluon
-shape_dict = {'data': x.shape}
+shape_dict = {"data": x.shape}
 mod, params = relay.frontend.from_mxnet(block, shape_dict)
 # we want a probability so add a softmax operator
 func = mod["main"]
@@ -173,11 +179,11 @@ data_shape = (batch_size,) + image_shape
 local_demo = True
 
 if local_demo:
-    target = tvm.target.create('llvm')
+    target = tvm.target.Target("llvm")
 else:
-    target = tvm.target.arm_cpu('rasp3b')
+    target = tvm.target.arm_cpu("rasp3b")
     # The above line is a simple form of
-    # target = tvm.target.create('llvm -device=arm_cpu -model=bcm2837 -mtriple=armv7l-linux-gnueabihf -mattr=+neon')
+    # target = tvm.target.Target('llvm -device=arm_cpu -model=bcm2837 -mtriple=armv7l-linux-gnueabihf -mattr=+neon')
 
 with tvm.transform.PassContext(opt_level=3):
     lib = relay.build(func, target, params=params)
@@ -188,7 +194,7 @@ with tvm.transform.PassContext(opt_level=3):
 
 # Save the library at local temporary directory.
 tmp = util.tempdir()
-lib_fname = tmp.relpath('net.tar')
+lib_fname = tmp.relpath("net.tar")
 lib.export_library(lib_fname)
 
 ######################################################################
@@ -202,23 +208,23 @@ if local_demo:
     remote = rpc.LocalSession()
 else:
     # The following is my environment, change this to the IP address of your target device
-    host = '10.77.1.162'
+    host = "10.77.1.162"
     port = 9090
     remote = rpc.connect(host, port)
 
 # upload the library to remote device and load it
 remote.upload(lib_fname)
-rlib = remote.load_module('net.tar')
+rlib = remote.load_module("net.tar")
 
 # create the remote runtime module
 ctx = remote.cpu(0)
-module = runtime.GraphModule(rlib['default'](ctx))
+module = runtime.GraphModule(rlib["default"](ctx))
 # set input data
-module.set_input('data', tvm.nd.array(x.astype('float32')))
+module.set_input("data", tvm.nd.array(x.astype("float32")))
 # run
 module.run()
 # get output
 out = module.get_output(0)
 # get top1 result
 top1 = np.argmax(out.asnumpy())
-print('TVM prediction top-1: {}'.format(synset[top1]))
+print("TVM prediction top-1: {}".format(synset[top1]))
diff --git a/docs/_downloads/85ba00b8ada85b8c5367f37b526a8caa/tune_relay_x86.py b/docs/_downloads/85ba00b8ada85b8c5367f37b526a8caa/tune_relay_x86.py
index 92fdafb..1dd947f 100644
--- a/docs/_downloads/85ba00b8ada85b8c5367f37b526a8caa/tune_relay_x86.py
+++ b/docs/_downloads/85ba00b8ada85b8c5367f37b526a8caa/tune_relay_x86.py
@@ -53,25 +53,34 @@ def get_network(name, batch_size):
     output_shape = (batch_size, 1000)
 
     if "resnet" in name:
-        n_layer = int(name.split('-')[1])
-        mod, params = relay.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
+        n_layer = int(name.split("-")[1])
+        mod, params = relay.testing.resnet.get_workload(
+            num_layers=n_layer, batch_size=batch_size, dtype=dtype
+        )
     elif "vgg" in name:
-        n_layer = int(name.split('-')[1])
-        mod, params = relay.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
-    elif name == 'mobilenet':
+        n_layer = int(name.split("-")[1])
+        mod, params = relay.testing.vgg.get_workload(
+            num_layers=n_layer, batch_size=batch_size, dtype=dtype
+        )
+    elif name == "mobilenet":
         mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype)
-    elif name == 'squeezenet_v1.1':
-        mod, params = relay.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1', dtype=dtype)
-    elif name == 'inception_v3':
+    elif name == "squeezenet_v1.1":
+        mod, params = relay.testing.squeezenet.get_workload(
+            batch_size=batch_size, version="1.1", dtype=dtype
+        )
+    elif name == "inception_v3":
         input_shape = (1, 3, 299, 299)
         mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
-    elif name == 'mxnet':
+    elif name == "mxnet":
         # an example of an mxnet model
         from mxnet.gluon.model_zoo.vision import get_model
-        block = get_model('resnet18_v1', pretrained=True)
+
+        block = get_model("resnet18_v1", pretrained=True)
         mod, params = relay.frontend.from_mxnet(block, shape={input_name: input_shape}, dtype=dtype)
         net = mod["main"]
-        net = relay.Function(net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs)
+        net = relay.Function(
+            net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs
+        )
         mod = tvm.IRModule.from_expr(net)
     else:
         raise ValueError("Unsupported network: " + name)
@@ -121,55 +130,57 @@ os.environ["TVM_NUM_THREADS"] = str(num_threads)
 # latency of one operator closer to its actual latency during end-to-end inference.
 
 tuning_option = {
-    'log_filename': log_file,
-    'tuner': 'random',
-    'early_stopping': None,
-
-    'measure_option': autotvm.measure_option(
+    "log_filename": log_file,
+    "tuner": "random",
+    "early_stopping": None,
+    "measure_option": autotvm.measure_option(
         builder=autotvm.LocalBuilder(),
-        runner=autotvm.LocalRunner(number=1, repeat=10,
-                                   min_repeat_ms=0,
-                                   enable_cpu_cache_flush=True),
+        runner=autotvm.LocalRunner(
+            number=1, repeat=10, min_repeat_ms=0, enable_cpu_cache_flush=True
+        ),
     ),
 }
 
 
 # You can skip the implementation of this function for this tutorial.
-def tune_kernels(tasks,
-                 measure_option,
-                 tuner='gridsearch',
-                 early_stopping=None,
-                 log_filename='tuning.log'):
+def tune_kernels(
+    tasks, measure_option, tuner="gridsearch", early_stopping=None, log_filename="tuning.log"
+):
 
     for i, task in enumerate(tasks):
-        prefix = "[Task %2d/%2d] " % (i+1, len(tasks))
+        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
 
         # create tuner
-        if tuner == 'xgb' or tuner == 'xgb-rank':
-            tuner_obj = XGBTuner(task, loss_type='rank')
-        elif tuner == 'ga':
+        if tuner == "xgb" or tuner == "xgb-rank":
+            tuner_obj = XGBTuner(task, loss_type="rank")
+        elif tuner == "ga":
             tuner_obj = GATuner(task, pop_size=50)
-        elif tuner == 'random':
+        elif tuner == "random":
             tuner_obj = RandomTuner(task)
-        elif tuner == 'gridsearch':
+        elif tuner == "gridsearch":
             tuner_obj = GridSearchTuner(task)
         else:
             raise ValueError("Invalid tuner: " + tuner)
 
         # do tuning
-        n_trial=len(task.config_space)
-        tuner_obj.tune(n_trial=n_trial,
-                       early_stopping=early_stopping,
-                       measure_option=measure_option,
-                       callbacks=[
-                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
-                           autotvm.callback.log_to_file(log_filename)])
+        n_trial = len(task.config_space)
+        tuner_obj.tune(
+            n_trial=n_trial,
+            early_stopping=early_stopping,
+            measure_option=measure_option,
+            callbacks=[
+                autotvm.callback.progress_bar(n_trial, prefix=prefix),
+                autotvm.callback.log_to_file(log_filename),
+            ],
+        )
 
 
 # Use graph tuner to achieve graph level optimal schedules
 # Set use_DP=False if it takes too long to finish.
 def tune_graph(graph, dshape, records, opt_sch_file, use_DP=True):
-    target_op = [relay.op.get("nn.conv2d"),]
+    target_op = [
+        relay.op.get("nn.conv2d"),
+    ]
     Tuner = DPTuner if use_DP else PBQPTuner
     executor = Tuner(graph, {input_name: dshape}, records, target_op, target)
     executor.benchmark_layout_transform(min_exec_num=2000)
@@ -180,13 +191,14 @@ def tune_graph(graph, dshape, records, opt_sch_file, use_DP=True):
 ########################################################################
 # Finally, we launch tuning jobs and evaluate the end-to-end performance.
 
+
 def tune_and_evaluate(tuning_opt):
     # extract workloads from relay program
     print("Extract tasks...")
     mod, params, data_shape, out_shape = get_network(model_name, batch_size)
-    tasks = autotvm.task.extract_from_program(mod["main"], target=target,
-                                              params=params,
-                                              ops=(relay.op.get("nn.conv2d"),))
+    tasks = autotvm.task.extract_from_program(
+        mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"),)
+    )
 
     # run tuning tasks
     tune_kernels(tasks, **tuning_opt)
@@ -196,22 +208,23 @@ def tune_and_evaluate(tuning_opt):
     with autotvm.apply_graph_best(graph_opt_sch_file):
         print("Compile...")
         with tvm.transform.PassContext(opt_level=3):
-            graph, lib, params = relay.build_module.build(
-                mod, target=target, params=params)
+            lib = relay.build_module.build(mod, target=target, params=params)
 
         # upload parameters to device
         ctx = tvm.cpu()
         data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype))
-        module = runtime.create(graph, lib, ctx)
+        module = runtime.GraphModule(lib["default"](ctx))
         module.set_input(input_name, data_tvm)
-        module.set_input(**params)
 
         # evaluate
         print("Evaluate inference time cost...")
         ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3)
         prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
-        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
-              (np.mean(prof_res), np.std(prof_res)))
+        print(
+            "Mean inference time (std dev): %.2f ms (%.2f ms)"
+            % (np.mean(prof_res), np.std(prof_res))
+        )
+
 
 # We do not run the tuning on our web server since it takes too long.
 # Uncomment the following line to run it by yourself.
diff --git a/docs/_downloads/8631d5082613ab80110d8237562cd480/extern_op.ipynb b/docs/_downloads/8631d5082613ab80110d8237562cd480/extern_op.ipynb
index 2c6705c..ee19072 100644
--- a/docs/_downloads/8631d5082613ab80110d8237562cd480/extern_op.ipynb
+++ b/docs/_downloads/8631d5082613ab80110d8237562cd480/extern_op.ipynb
@@ -26,7 +26,7 @@
       },
       "outputs": [],
       "source": [
-        "from __future__ import absolute_import, print_function\n\nimport tvm\nfrom tvm import te\nimport numpy as np\nfrom tvm.contrib import cblas"
+        "from __future__ import absolute_import, print_function\n\nimport tvm\nfrom tvm import te\nimport numpy as np\nfrom tvm.contrib import cblas\n\nif not tvm.get_global_func(\"tvm.contrib.cblas.matmul\", allow_missing=True):\n    raise Exception(\"Not compiled with cblas support; can't build this tutorial\")"
       ]
     },
     {
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "n = 1024\nl = 128\nm = 235\nbias = te.var('bias', dtype=\"float32\")\nA = te.placeholder((n, l), name='A')\nB = te.placeholder((l, m), name='B')\nC = te.extern((n, m), [A, B],\n               lambda ins, outs: tvm.tir.call_packed(\n                   \"tvm.contrib.cblas.matmul\",\n                   ins[0], ins[1], outs[0], False, False), name=\"C\")\nD = te.compute(C.shape, lambda i, j: C[i,j] + bias, name=\"D\")\ns = te.create_schedule(D.op)"
+        "n = 1024\nl = 128\nm = 235\nbias = te.var(\"bias\", dtype=\"float32\")\nA = te.placeholder((n, l), name=\"A\")\nB = te.placeholder((l, m), name=\"B\")\nC = te.extern(\n    (n, m),\n    [A, B],\n    lambda ins, outs: tvm.tir.call_packed(\n        \"tvm.contrib.cblas.matmul\", ins[0], ins[1], outs[0], False, False\n    ),\n    name=\"C\",\n)\nD = te.compute(C.shape, lambda i, j: C[i, j] + bias, name=\"D\")\ns = te.create_schedule(D.op)"
       ]
     },
     {
@@ -62,7 +62,7 @@
       },
       "outputs": [],
       "source": [
-        "ctx = tvm.cpu(0)\nf = tvm.build(s, [A, B, D, bias], \"llvm\")\na = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)\nb = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)\nd = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)\nbb = 10.0\nf(a, b, d, bb)\ntvm.testing.assert_allclose(\n    d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 10, rtol=1e-5)"
+        "ctx = tvm.cpu(0)\nf = tvm.build(s, [A, B, D, bias], \"llvm\")\na = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)\nb = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)\nd = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)\nbb = 10.0\nf(a, b, d, bb)\ntvm.testing.assert_allclose(d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 10, rtol=1e-5)"
       ]
     },
     {
@@ -80,7 +80,7 @@
       },
       "outputs": [],
       "source": [
-        "from tvm.contrib import cblas\nC = cblas.matmul(A, B)\nD = te.compute(C.shape, lambda i, j: C[i,j] + bias, name=\"D\")\ns = te.create_schedule(D.op)"
+        "from tvm.contrib import cblas\n\nC = cblas.matmul(A, B)\nD = te.compute(C.shape, lambda i, j: C[i, j] + bias, name=\"D\")\ns = te.create_schedule(D.op)"
       ]
     },
     {
@@ -98,7 +98,7 @@
       },
       "outputs": [],
       "source": [
-        "@tvm.register_func(\"tvm.contrib.my_tvm_addone\")\ndef my_tvm_addone(x, y):\n    print(\"my_tvm_addone signatures: %s, %s\" % (type(x), type(y)))\n    tvm.nd.array(x.asnumpy() + 1).copyto(y)\n\nA = te.placeholder((n,), name='A')\nB = te.extern(A.shape, [A], lambda ins, outs: tvm.tir.call_packed(\n    \"tvm.contrib.my_tvm_addone\", ins[0], outs[0]), name=\"C\")\ns = te.create_schedule(B.op)\nf = tvm.build(s, [A, B], \"llvm\")\na = tvm.nd.array(np.random.uniform(size=(n,)).astype( [...]
+        "@tvm.register_func(\"tvm.contrib.my_tvm_addone\")\ndef my_tvm_addone(x, y):\n    print(\"my_tvm_addone signatures: %s, %s\" % (type(x), type(y)))\n    tvm.nd.array(x.asnumpy() + 1).copyto(y)\n\n\nA = te.placeholder((n,), name=\"A\")\nB = te.extern(\n    A.shape,\n    [A],\n    lambda ins, outs: tvm.tir.call_packed(\"tvm.contrib.my_tvm_addone\", ins[0], outs[0]),\n    name=\"C\",\n)\ns = te.create_schedule(B.op)\nf = tvm.build(s, [A, B], \"llvm\")\na = tvm.nd.array(np.random.unif [...]
       ]
     },
     {
diff --git a/docs/_downloads/870680567a5bf1e4697356b416e302b4/opt_matmul_auto_tensorcore.ipynb b/docs/_downloads/870680567a5bf1e4697356b416e302b4/opt_matmul_auto_tensorcore.ipynb
index 6eb4e22..72fdd6f 100644
--- a/docs/_downloads/870680567a5bf1e4697356b416e302b4/opt_matmul_auto_tensorcore.ipynb
+++ b/docs/_downloads/870680567a5bf1e4697356b416e302b4/opt_matmul_auto_tensorcore.ipynb
@@ -33,14 +33,14 @@
       },
       "outputs": [],
       "source": [
-        "import logging\nimport sys\n\nimport numpy as np\nimport tvm\nfrom tvm import te\n\nfrom tvm import autotvm\nfrom tvm.contrib import nvcc\n\ndef matmul_nn(A, B, L, dtype='float16', layout='NN'):\n    k = te.reduce_axis((0, L), name='k')\n    if dtype == 'float16':\n      out_type = 'float'\n    elif dtype == 'int8':\n      out_type = 'int'\n    elif dtype == 'int4' or dtype == 'int1':\n      out_type = 'int'\n    if (layout == 'NN'):\n      return te.compute((N, M), lambda i, j: [...]
+        "import logging\nimport sys\n\nimport numpy as np\nimport tvm\nfrom tvm import te\n\nfrom tvm import autotvm\nfrom tvm.contrib import nvcc\n\n\ndef matmul_nn(A, B, L, dtype=\"float16\", layout=\"NN\"):\n    k = te.reduce_axis((0, L), name=\"k\")\n    if dtype == \"float16\":\n        out_type = \"float\"\n    elif dtype == \"int8\":\n        out_type = \"int\"\n    elif dtype == \"int4\" or dtype == \"int1\":\n        out_type = \"int\"\n    if layout == \"NN\":\n        return t [...]
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Scheduling the Computation\n--------------------------\nThis schedule is no different than a non-tensorcore matmul schedule on GPU.\nPlease refer to `opt-gemm` tutorial for basics of optimizing matmul schedule.\nWhen the \"tensor_core\" pragma is set, the \"rewrite for tensorcore\" ir pass\nwill automatically transform the schedule for tensorcore codegen,\notherwise normal CUDA code, with lower performance but equal functionality, will be generated.\n\n<div class=\"alert alert-i [...]
+        "Scheduling the Computation\n--------------------------\nThis schedule is no different than a non-tensorcore matmul schedule on GPU.\nPlease refer to `opt-gemm` tutorial for basics of optimizing matmul schedule.\nWhen the \"tensor_core\" pragma is set, the \"rewrite for tensorcore\" ir pass\nwill automatically transform the schedule for tensorcore codegen,\notherwise normal CUDA code, with lower performance but equal functionality, will be generated.\n\n<div class=\"alert alert-i [...]
       ]
     },
     {
@@ -51,7 +51,7 @@
       },
       "outputs": [],
       "source": [
-        "@autotvm.template(\"tutorial/auto_tensorcore/test_gemm\")\ndef test_gemm(N, L, M, dtype, layout):\n    if (layout == \"NN\"):\n      shape_a = (N, L)\n      shape_b = (L, M)\n    elif (layout == \"NT\"):\n      shape_a = (L, N)\n      shape_b = (L, M)\n    elif (layout == \"TN\"):\n      shape_a = (N, L)\n      shape_b = (M, L)\n    elif (layout == \"TT\"):\n      shape_a = (L, N)\n      shape_b = (M, L)\n    else:\n      print (\"Unsupported layout:\", layout)\n      sys.exit(1 [...]
+        "@autotvm.template(\"tutorial/auto_tensorcore/test_gemm\")\ndef test_gemm(N, L, M, dtype, layout):\n    if layout == \"NN\":\n        shape_a = (N, L)\n        shape_b = (L, M)\n    elif layout == \"NT\":\n        shape_a = (L, N)\n        shape_b = (L, M)\n    elif layout == \"TN\":\n        shape_a = (N, L)\n        shape_b = (M, L)\n    elif layout == \"TT\":\n        shape_a = (L, N)\n        shape_b = (M, L)\n    else:\n        print(\"Unsupported layout:\", layout)\n        [...]
       ]
     },
     {
@@ -69,7 +69,7 @@
       },
       "outputs": [],
       "source": [
-        "# check whether the gpu has tensorcore\nif not tvm.gpu(0).exist or not tvm.runtime.enabled(\"cuda\"):\n  print(\"skip because cuda is not enabled..\")\n  sys.exit(0)\n\nctx = tvm.gpu()\nif not nvcc.have_tensorcore(ctx.compute_version):\n  print('the gpu has no tensorcore, skipping...')\n  sys.exit(0)\n\nM, N, L = 512, 32, 512\ndtype = 'float16'\nlayout = 'NN'\nif len(sys.argv) >= 4:\n  M, N, L = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3])\nif len(sys.argv) >= 5:\n  dtyp [...]
+        "# check whether the gpu has tensorcore\nif not tvm.gpu(0).exist or not tvm.runtime.enabled(\"cuda\"):\n    raise Exception(\"skip building this tutorial because cuda is not enabled..\")\n\nctx = tvm.gpu()\nif not nvcc.have_tensorcore(ctx.compute_version):\n    raise Exception(\"the gpu has no tensorcore, skipping...\")\n\nM, N, L = 512, 32, 512\ndtype = \"float16\"\nlayout = \"NN\"\nif len(sys.argv) >= 4:\n    M, N, L = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3])\nif le [...]
       ]
     },
     {
diff --git a/docs/_downloads/87b9e8307245d848689e4cdc3e6fa9bf/deploy_sparse.ipynb b/docs/_downloads/87b9e8307245d848689e4cdc3e6fa9bf/deploy_sparse.ipynb
index a6d012c..a66da3c 100644
--- a/docs/_downloads/87b9e8307245d848689e4cdc3e6fa9bf/deploy_sparse.ipynb
+++ b/docs/_downloads/87b9e8307245d848689e4cdc3e6fa9bf/deploy_sparse.ipynb
@@ -69,7 +69,7 @@
       },
       "outputs": [],
       "source": [
-        "def load_keras_model(module, name, seq_len, batch_size, report_runtime=True):\n    model = module.from_pretrained(name)\n    dummy_input = tf.keras.Input(shape=[seq_len], batch_size=batch_size, dtype=\"int32\")\n    dummy_out = model(dummy_input)  # Propagate shapes through the keras model.\n    if report_runtime:\n        np_input = np.random.uniform(\n            size=[batch_size, seq_len], low=0, high=seq_len\n        ).astype(\"int32\")\n        start = time.time()\n         [...]
+        "def load_keras_model(module, name, seq_len, batch_size, report_runtime=True):\n    model = module.from_pretrained(name)\n    dummy_input = tf.keras.Input(shape=[seq_len], batch_size=batch_size, dtype=\"int32\")\n    dummy_out = model(dummy_input)  # Propagate shapes through the keras model.\n    if report_runtime:\n        np_input = np.random.uniform(size=[batch_size, seq_len], low=0, high=seq_len).astype(\n            \"int32\"\n        )\n        start = time.time()\n         [...]
       ]
     },
     {
@@ -87,7 +87,7 @@
       },
       "outputs": [],
       "source": [
-        "def import_graphdef(\n    name,\n    batch_size,\n    seq_len,\n    save_relay=True,\n    relay_file=\"model.json\",\n    relay_params=\"model.params\",\n):\n    abs_path = os.path.dirname(os.path.abspath(__file__))\n    shape_dict = {\"input_1\": (batch_size, seq_len)}\n    relay_file = (\"%s_%d_%d_%s\" % (name, batch_size, seq_len, relay_file)).replace(\n        \"/\", \"_\"\n    )\n    relay_params = (\"%s_%d_%d_%s\" % (name, batch_size, seq_len, relay_params)).replace(\n     [...]
+        "def import_graphdef(\n    name,\n    batch_size,\n    seq_len,\n    save_relay=True,\n    relay_file=\"model.json\",\n    relay_params=\"model.params\",\n):\n    abs_path = os.path.dirname(os.path.abspath(__file__))\n    shape_dict = {\"input_1\": (batch_size, seq_len)}\n    relay_file = (\"%s_%d_%d_%s\" % (name, batch_size, seq_len, relay_file)).replace(\"/\", \"_\")\n    relay_params = (\"%s_%d_%d_%s\" % (name, batch_size, seq_len, relay_params)).replace(\"/\", \"_\")\n    if  [...]
       ]
     },
     {
@@ -105,7 +105,7 @@
       },
       "outputs": [],
       "source": [
-        "def run_relay_graph(mod, params, shape_dict, target, ctx):\n    with relay.build_config(opt_level=3):\n        lib = relay.build(mod, target=target, params=params)\n    input_shape = shape_dict[\"input_1\"]\n    dummy_data = np.random.uniform(size=input_shape, low=0, high=input_shape[1]).astype(\n        \"int32\"\n    )\n\n    m = graph_runtime.GraphModule(lib['default'](ctx))\n    m.set_input(0, dummy_data)\n    m.run()\n    tvm_output = m.get_output(0)\n\n    ftimer = m.modul [...]
+        "def run_relay_graph(mod, params, shape_dict, target, ctx):\n    with relay.build_config(opt_level=3):\n        lib = relay.build(mod, target=target, params=params)\n    input_shape = shape_dict[\"input_1\"]\n    dummy_data = np.random.uniform(size=input_shape, low=0, high=input_shape[1]).astype(\"int32\")\n\n    m = graph_runtime.GraphModule(lib[\"default\"](ctx))\n    m.set_input(0, dummy_data)\n    m.run()\n    tvm_output = m.get_output(0)\n\n    ftimer = m.module.time_evaluat [...]
       ]
     },
     {
@@ -123,7 +123,7 @@
       },
       "outputs": [],
       "source": [
-        "def random_bsr_matrix(M, N, BS_R, BS_C, density, dtype=\"float32\"):\n    Y = np.zeros((M, N), dtype=dtype)\n    assert M % BS_R == 0\n    assert N % BS_C == 0\n    nnz = int(density * M * N)\n    num_blocks = int(nnz / (BS_R * BS_C)) + 1\n    candidate_blocks = np.asarray(\n        list(itertools.product(range(0, M, BS_R), range(0, N, BS_C)))\n    )\n    assert candidate_blocks.shape[0] == M // BS_R * N // BS_C\n    chosen_blocks = candidate_blocks[\n        np.random.choice(ca [...]
+        "def random_bsr_matrix(M, N, BS_R, BS_C, density, dtype=\"float32\"):\n    Y = np.zeros((M, N), dtype=dtype)\n    assert M % BS_R == 0\n    assert N % BS_C == 0\n    nnz = int(density * M * N)\n    num_blocks = int(nnz / (BS_R * BS_C)) + 1\n    candidate_blocks = np.asarray(list(itertools.product(range(0, M, BS_R), range(0, N, BS_C))))\n    assert candidate_blocks.shape[0] == M // BS_R * N // BS_C\n    chosen_blocks = candidate_blocks[\n        np.random.choice(candidate_blocks.s [...]
       ]
     },
     {
diff --git a/docs/_downloads/8a7f17665207908e373e8146da09443a/deploy_prequantized.py b/docs/_downloads/8a7f17665207908e373e8146da09443a/deploy_prequantized.py
index ca741b3..81959db 100644
--- a/docs/_downloads/8a7f17665207908e373e8146da09443a/deploy_prequantized.py
+++ b/docs/_downloads/8a7f17665207908e373e8146da09443a/deploy_prequantized.py
@@ -46,19 +46,21 @@ from tvm.contrib.download import download_testdata
 # Helper functions to run the demo
 def get_transform():
     import torchvision.transforms as transforms
-    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
-                                     std=[0.229, 0.224, 0.225])
-    return transforms.Compose([
+
+    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+    return transforms.Compose(
+        [
             transforms.Resize(256),
             transforms.CenterCrop(224),
             transforms.ToTensor(),
             normalize,
-        ])
+        ]
+    )
 
 
 def get_real_image(im_height, im_width):
-    img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
-    img_path = download_testdata(img_url, 'cat.png', module='data')
+    img_url = "https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true"
+    img_path = download_testdata(img_url, "cat.png", module="data")
     return Image.open(img_path).resize((im_height, im_width))
 
 
@@ -70,12 +72,16 @@ def get_imagenet_input():
 
 
 def get_synset():
-    synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
-                          '4d0b62f3d01426887599d4f7ede23ee5/raw/',
-                          '596b27d23537e5a1b5751d2b0481ef172f58b539/',
-                          'imagenet1000_clsid_to_human.txt'])
-    synset_name = 'imagenet1000_clsid_to_human.txt'
-    synset_path = download_testdata(synset_url, synset_name, module='data')
+    synset_url = "".join(
+        [
+            "https://gist.githubusercontent.com/zhreshold/",
+            "4d0b62f3d01426887599d4f7ede23ee5/raw/",
+            "596b27d23537e5a1b5751d2b0481ef172f58b539/",
+            "imagenet1000_clsid_to_human.txt",
+        ]
+    )
+    synset_name = "imagenet1000_clsid_to_human.txt"
+    synset_path = download_testdata(synset_url, synset_name, module="data")
     with open(synset_path) as f:
         return eval(f.read())
 
@@ -84,7 +90,7 @@ def run_tvm_model(mod, params, input_name, inp, target="llvm"):
     with tvm.transform.PassContext(opt_level=3):
         lib = relay.build(mod, target=target, params=params)
 
-    runtime = tvm.contrib.graph_runtime.GraphModule(lib['default'](tvm.context(target, 0)))
+    runtime = tvm.contrib.graph_runtime.GraphModule(lib["default"](tvm.context(target, 0)))
 
     runtime.set_input(input_name, inp)
     runtime.run()
@@ -114,9 +120,10 @@ inp = get_imagenet_input()
 # In short, this function takes a floating point model and converts it to uint8.
 # The model is per-channel quantized.
 
+
 def quantize_model(model, inp):
     model.fuse_model()
-    model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
+    model.qconfig = torch.quantization.get_default_qconfig("fbgemm")
     torch.quantization.prepare(model, inplace=True)
     # Dummy calibration
     model(inp)
@@ -192,8 +199,7 @@ print("%d in 1000 raw floating outputs identical." % np.sum(tvm_result[0] == pt_
 # Here we give an example of how to measure performance of TVM compiled models.
 n_repeat = 100  # should be bigger to make the measurement more accurate
 ctx = tvm.cpu(0)
-ftimer = rt_mod.module.time_evaluator("run", ctx, number=1,
-                                      repeat=n_repeat)
+ftimer = rt_mod.module.time_evaluator("run", ctx, number=1, repeat=n_repeat)
 prof_res = np.array(ftimer().results) * 1e3
 print("Elapsed average ms:", np.mean(prof_res))
 
diff --git a/docs/_downloads/91b0339c8f3cc2594cee580dc450149a/tune_matmul_x86.py b/docs/_downloads/91b0339c8f3cc2594cee580dc450149a/tune_matmul_x86.py
new file mode 100644
index 0000000..1a9af42
--- /dev/null
+++ b/docs/_downloads/91b0339c8f3cc2594cee580dc450149a/tune_matmul_x86.py
@@ -0,0 +1,173 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Auto-scheduling matrix multiplication for CPU
+=============================================
+**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_, \
+            `Chengfan Jia <https://github.com/jcf94/>`_
+
+Different from the existing :ref:`autotvm <tutorials-autotvm-sec>`, which relies on
+manual templates to define the search space, the auto-scheduler does not require any templates.
+Users only need to write the computation declaration, without any schedule commands
+or templates, and the auto-scheduler can automatically generate a large search space
+and find a good schedule in that space.
+
+We use matrix multiplication as an example in this tutorial.
+"""
+
+import numpy as np
+import tvm
+from tvm import te, testing, auto_scheduler
+
+######################################################################
+# Define the computation
+# ^^^^^^^^^^^^^^^^^^^^^^
+# To begin with, we define the computation of a matmul with bias add.
+# The function should return the list of input/output tensors.
+# From these tensors, the auto-scheduler can get the whole computational graph.
+
+
+@auto_scheduler.register_workload
+def matmul_add(N, L, M, dtype):
+    A = te.placeholder((N, L), name="A", dtype=dtype)
+    B = te.placeholder((L, M), name="B", dtype=dtype)
+    C = te.placeholder((N, M), name="C", dtype=dtype)
+
+    k = te.reduce_axis((0, L), name="k")
+    matmul = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="matmul")
+    out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out")
+
+    return [A, B, C, out]
+
+
+######################################################################
+# Create the search task
+# ^^^^^^^^^^^^^^^^^^^^^^
+# We then create a search task with N=L=M=128 and dtype="float32".
+
+target = tvm.target.Target("llvm")
+task = auto_scheduler.create_task(matmul_add, (128, 128, 128, "float32"), target)
+
+# Inspect the computational graph
+print(task.compute_dag)
+
+######################################################################
+# Next, we set parameters for the auto-scheduler.
+#
+# * `num_measure_trials` is the number of measurement trials we can use during the search.
+#   We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a
+#   good value for the search to converge. You can do more trials according to your time budget.
+# * In addition, we use `RecordToFile` to dump measurement records into a file `matmul.json`.
+#   The measurement records can be used to query the history best, resume the search,
+#   and do more analyses later.
+# * See :any:`auto_scheduler.TuningOptions` for more parameters.
+
+tune_option = auto_scheduler.TuningOptions(
+    num_measure_trials=10, measure_callbacks=[auto_scheduler.RecordToFile("matmul.json")]
+)
+
+######################################################################
+# Run the search
+# ^^^^^^^^^^^^^^
+# Now we have all the inputs ready. Pretty simple, isn't it?
+# We can kick off the search and let the auto-scheduler do its magic.
+# After some measurement trials, it will return the best schedule it found.
+
+sch, args = auto_scheduler.auto_schedule(task, tuning_options=tune_option)
+
+######################################################################
+# We can lower the schedule to see the IR after auto-scheduling.
+# The auto-scheduler correctly performs optimizations including multi-level tiling,
+# parallelization, vectorization, unrolling and fusion.
+
+print(tvm.lower(sch, args, simple_mode=True))
+
+######################################################################
+# Check correctness
+# ^^^^^^^^^^^^^^^^^
+# We build the binary and check its correctness.
+
+func = tvm.build(sch, args)
+a_np = np.random.uniform(size=(128, 128)).astype(np.float32)
+b_np = np.random.uniform(size=(128, 128)).astype(np.float32)
+c_np = np.random.uniform(size=(128, 128)).astype(np.float32)
+d_np = a_np.dot(b_np) + c_np
+
+d_tvm = tvm.nd.empty(d_np.shape)
+func(tvm.nd.array(a_np), tvm.nd.array(b_np), tvm.nd.array(c_np), d_tvm)
+
+tvm.testing.assert_allclose(d_np, d_tvm.asnumpy(), rtol=1e-3)
+
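+######################################################################
+# As a side note, the compiled function can also be timed. The following is
+# a small sketch (not part of the original text) that reuses the
+# ``time_evaluator`` pattern from the other tutorials in this build:
+
+ctx = tvm.cpu(0)
+evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
+a_tvm, b_tvm, c_tvm = tvm.nd.array(a_np), tvm.nd.array(b_np), tvm.nd.array(c_np)
+print("auto-scheduled matmul: %f s" % evaluator(a_tvm, b_tvm, c_tvm, d_tvm).mean)
+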
+######################################################################
+# Using the record file
+# ^^^^^^^^^^^^^^^^^^^^^
+# During the search, all measurement records are dumped into the record
+# file "matmul.json". The measurement records can be used to re-apply search results,
+# resume the search, and perform other analyses.
+
+######################################################################
+# Here is an example where we load the best schedule from a file,
+# print the equivalent python schedule API, and build the binary again.
+
+# Load the measurement record for the best schedule
+inp, res = auto_scheduler.load_best("matmul.json", task.workload_key)
+
+# Print equivalent python schedule API. This can be used for debugging and
+# learning the behavior of the auto-scheduler.
+print(task.compute_dag.print_python_code_from_state(inp.state))
+
+# Rebuild the binary. This shows how you can apply the best schedule from a
+# log file without rerunning the search.
+sch, args = task.compute_dag.apply_steps_from_state(inp.state)
+func = tvm.build(sch, args)
+
+######################################################################
+# A more complicated example is to resume the search.
+# In this case, we need to create the search policy and cost model by ourselves
+# and resume the status of the search policy and cost model from the log file.
+# In the example below we resume the status and do 5 more trials.
+
+
+def resume_search(task, log_file):
+    cost_model = auto_scheduler.XGBModel()
+    cost_model.update_from_file(log_file)
+    search_policy = auto_scheduler.SketchPolicy(
+        task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)]
+    )
+    tune_option = auto_scheduler.TuningOptions(
+        num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(log_file)]
+    )
+    sch, args = auto_scheduler.auto_schedule(task, search_policy, tuning_options=tune_option)
+
+
+# resume_search(task, "matmul.json")
+
+######################################################################
+# .. note::
+#   We cannot run the line above because of a conflict between
+#   Python's multiprocessing and TVM's thread pool.
+#   After running a TVM-generated binary (L112), Python's multiprocessing
+#   library will hang forever.
+#   You have to make sure that you don't run any TVM-generated binaries
+#   before calling Ansor's search. To run the call above (L156), you should
+#   comment out the lines that run the compiled binary (L112-114).
+#
+#   You should be careful about this problem in your applications.
+#   There are other workarounds. For example, you can start a new
+#   thread/process (with the built-in Python libraries ``threading`` or
+#   ``multiprocessing``) and run the TVM binaries in the new thread/process.
+#   This isolates the execution and avoids the conflict in the main
+#   thread/process. A minimal sketch of that workaround follows.
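+#
+#   A minimal sketch of that workaround (illustrative only; it assumes the
+#   ``func`` and numpy arrays defined earlier and is kept commented out,
+#   like the ``resume_search`` call above)::
+#
+#     import multiprocessing
+#
+#     def run_compiled_binary():
+#         # Execute the TVM-generated binary in a child process so the
+#         # main process never touches TVM's thread pool before the search.
+#         d = tvm.nd.empty(d_np.shape)
+#         func(tvm.nd.array(a_np), tvm.nd.array(b_np), tvm.nd.array(c_np), d)
+#
+#     p = multiprocessing.Process(target=run_compiled_binary)
+#     p.start()
+#     p.join()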
diff --git a/docs/_downloads/92b34b8e701291844895f4566f6dc366/schedule_primitives.py b/docs/_downloads/92b34b8e701291844895f4566f6dc366/schedule_primitives.py
index 61bfcad..eb48dc2 100644
--- a/docs/_downloads/92b34b8e701291844895f4566f6dc366/schedule_primitives.py
+++ b/docs/_downloads/92b34b8e701291844895f4566f6dc366/schedule_primitives.py
@@ -42,17 +42,17 @@ import numpy as np
 #
 
 # declare some variables for use later
-n = te.var('n')
-m = te.var('m')
+n = te.var("n")
+m = te.var("m")
 
 ######################################################################
 # A schedule can be created from a list of ops. By default, the
 # schedule computes the tensors serially, in row-major order.
 
 # declare a matrix element-wise multiply
-A = te.placeholder((m, n), name='A')
-B = te.placeholder((m, n), name='B')
-C = te.compute((m, n), lambda i, j: A[i, j] * B[i, j], name='C')
+A = te.placeholder((m, n), name="A")
+B = te.placeholder((m, n), name="B")
+C = te.compute((m, n), lambda i, j: A[i, j] * B[i, j], name="C")
 
 s = te.create_schedule([C.op])
 # lower will transform the computation from definition to the real
@@ -71,8 +71,8 @@ print(tvm.lower(s, [A, B, C], simple_mode=True))
 # -----
 # :code:`split` can split a specified axis into two axes by
 # :code:`factor`.
-A = te.placeholder((m,), name='A')
-B = te.compute((m,), lambda i: A[i]*2, name='B')
+A = te.placeholder((m,), name="A")
+B = te.compute((m,), lambda i: A[i] * 2, name="B")
 
 s = te.create_schedule(B.op)
 xo, xi = s[B].split(B.op.axis[0], factor=32)
@@ -81,8 +81,8 @@ print(tvm.lower(s, [A, B], simple_mode=True))
 ######################################################################
 # You can also split an axis by :code:`nparts`, which splits the axis
 # in the opposite direction to :code:`factor`.
-A = te.placeholder((m,), name='A')
-B = te.compute((m,), lambda i: A[i], name='B')
+A = te.placeholder((m,), name="A")
+B = te.compute((m,), lambda i: A[i], name="B")
 
 s = te.create_schedule(B.op)
 bx, tx = s[B].split(B.op.axis[0], nparts=32)
@@ -93,8 +93,8 @@ print(tvm.lower(s, [A, B], simple_mode=True))
 # ----
 # :code:`tile` helps you execute the computation tile by tile over two
 # axes.
-A = te.placeholder((m, n), name='A')
-B = te.compute((m, n), lambda i, j: A[i, j], name='B')
+A = te.placeholder((m, n), name="A")
+B = te.compute((m, n), lambda i, j: A[i, j], name="B")
 
 s = te.create_schedule(B.op)
 xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)
@@ -104,8 +104,8 @@ print(tvm.lower(s, [A, B], simple_mode=True))
 # fuse
 # ----
 # :code:`fuse` can fuse two consecutive axes of one computation.
-A = te.placeholder((m, n), name='A')
-B = te.compute((m, n), lambda i, j: A[i, j], name='B')
+A = te.placeholder((m, n), name="A")
+B = te.compute((m, n), lambda i, j: A[i, j], name="B")
 
 s = te.create_schedule(B.op)
 # tile to four axises first: (i.outer, j.outer, i.inner, j.inner)
@@ -118,8 +118,8 @@ print(tvm.lower(s, [A, B], simple_mode=True))
 # reorder
 # -------
 # :code:`reorder` can reorder the axes in the specified order.
-A = te.placeholder((m, n), name='A')
-B = te.compute((m, n), lambda i, j: A[i, j], name='B')
+A = te.placeholder((m, n), name="A")
+B = te.compute((m, n), lambda i, j: A[i, j], name="B")
 
 s = te.create_schedule(B.op)
 # tile to four axises first: (i.outer, j.outer, i.inner, j.inner)
@@ -133,8 +133,8 @@ print(tvm.lower(s, [A, B], simple_mode=True))
 # ----
 # :code:`bind` can bind a specified axis to a thread axis, which is often
 # used in GPU programming.
-A = te.placeholder((n,), name='A')
-B = te.compute(A.shape, lambda i: A[i] * 2, name='B')
+A = te.placeholder((n,), name="A")
+B = te.compute(A.shape, lambda i: A[i] * 2, name="B")
 
 s = te.create_schedule(B.op)
 bx, tx = s[B].split(B.op.axis[0], factor=64)
@@ -147,9 +147,9 @@ print(tvm.lower(s, [A, B], simple_mode=True))
 # ----------
 # For a schedule that consists of multiple operators, TVM will compute
 # tensors at the root separately by default.
-A = te.placeholder((m,), name='A')
-B = te.compute((m,), lambda i: A[i]+1, name='B')
-C = te.compute((m,), lambda i: B[i]*2, name='C')
+A = te.placeholder((m,), name="A")
+B = te.compute((m,), lambda i: A[i] + 1, name="B")
+C = te.compute((m,), lambda i: B[i] * 2, name="C")
 
 s = te.create_schedule(C.op)
 print(tvm.lower(s, [A, B, C], simple_mode=True))
@@ -157,9 +157,9 @@ print(tvm.lower(s, [A, B, C], simple_mode=True))
 ######################################################################
 # :code:`compute_at` can move computation of `B` into the first axis
 # of computation of `C`.
-A = te.placeholder((m,), name='A')
-B = te.compute((m,), lambda i: A[i]+1, name='B')
-C = te.compute((m,), lambda i: B[i]*2, name='C')
+A = te.placeholder((m,), name="A")
+B = te.compute((m,), lambda i: A[i] + 1, name="B")
+C = te.compute((m,), lambda i: B[i] * 2, name="C")
 
 s = te.create_schedule(C.op)
 s[B].compute_at(s[C], C.op.axis[0])
@@ -171,9 +171,9 @@ print(tvm.lower(s, [A, B, C], simple_mode=True))
 # :code:`compute_inline` can mark one stage as inline; the body of the
 # computation will then be expanded and inserted at the location where
 # the tensor is required.
-A = te.placeholder((m,), name='A')
-B = te.compute((m,), lambda i: A[i]+1, name='B')
-C = te.compute((m,), lambda i: B[i]*2, name='C')
+A = te.placeholder((m,), name="A")
+B = te.compute((m,), lambda i: A[i] + 1, name="B")
+C = te.compute((m,), lambda i: B[i] * 2, name="C")
 
 s = te.create_schedule(C.op)
 s[B].compute_inline()
@@ -183,9 +183,9 @@ print(tvm.lower(s, [A, B, C], simple_mode=True))
 # compute_root
 # ------------
 # :code:`compute_root` can move computation of one stage to the root.
-A = te.placeholder((m,), name='A')
-B = te.compute((m,), lambda i: A[i]+1, name='B')
-C = te.compute((m,), lambda i: B[i]*2, name='C')
+A = te.placeholder((m,), name="A")
+B = te.compute((m,), lambda i: A[i] + 1, name="B")
+C = te.compute((m,), lambda i: B[i] * 2, name="C")
 
 s = te.create_schedule(C.op)
 s[B].compute_at(s[C], C.op.axis[0])
diff --git a/docs/_downloads/9a7956336431664ba6d628347b08f5cb/opt_conv_tensorcore.ipynb b/docs/_downloads/9a7956336431664ba6d628347b08f5cb/opt_conv_tensorcore.ipynb
index a13a62c..77a0d41 100644
--- a/docs/_downloads/9a7956336431664ba6d628347b08f5cb/opt_conv_tensorcore.ipynb
+++ b/docs/_downloads/9a7956336431664ba6d628347b08f5cb/opt_conv_tensorcore.ipynb
@@ -40,7 +40,7 @@
       },
       "outputs": [],
       "source": [
-        "import tvm\nfrom tvm import te\nimport numpy as np\nfrom tvm.contrib import nvcc\n\n# The sizes of inputs and filters\nbatch_size = 256\nheight = 14\nwidth = 14\nin_channels = 256\nout_channels = 512\nkernel_h = 3\nkernel_w = 3\npad_h = 1\npad_w = 1\nstride_h = 1\nstride_w = 1\n\n# TensorCore shape\nblock_size = 16\n\nassert (batch_size % block_size == 0)\nassert (in_channels % block_size == 0)\nassert (out_channels % block_size == 0)\n\n# Input feature map: (N, H, W, IC, n, ic) [...]
+        "import tvm\nfrom tvm import te\nimport numpy as np\nfrom tvm.contrib import nvcc\n\n# The sizes of inputs and filters\nbatch_size = 256\nheight = 14\nwidth = 14\nin_channels = 256\nout_channels = 512\nkernel_h = 3\nkernel_w = 3\npad_h = 1\npad_w = 1\nstride_h = 1\nstride_w = 1\n\n# TensorCore shape\nblock_size = 16\n\nassert batch_size % block_size == 0\nassert in_channels % block_size == 0\nassert out_channels % block_size == 0\n\n# Input feature map: (N, H, W, IC, n, ic)\ndata [...]
       ]
     },
     {
@@ -58,7 +58,7 @@
       },
       "outputs": [],
       "source": [
-        "# Designate the memory hierarchy\nAS = s.cache_read(Apad, 'shared', [Conv])\nWS = s.cache_read(W, 'shared', [Conv])\nAF = s.cache_read(AS, 'wmma.matrix_a', [Conv])\nWF = s.cache_read(WS, 'wmma.matrix_b', [Conv])\nConvF = s.cache_write(Conv, 'wmma.accumulator')"
+        "# Designate the memory hierarchy\nAS = s.cache_read(Apad, \"shared\", [Conv])\nWS = s.cache_read(W, \"shared\", [Conv])\nAF = s.cache_read(AS, \"wmma.matrix_a\", [Conv])\nWF = s.cache_read(WS, \"wmma.matrix_b\", [Conv])\nConvF = s.cache_write(Conv, \"wmma.accumulator\")"
       ]
     },
     {
@@ -76,7 +76,7 @@
       },
       "outputs": [],
       "source": [
-        "def intrin_wmma_load_matrix(scope):\n    n = 16\n    A = te.placeholder((n, n), name='A', dtype='float16')\n    BA = tvm.tir.decl_buffer(A.shape, A.dtype, scope='shared', data_alignment=32, offset_factor=256)\n    C = te.compute((n, n), lambda i, j: A[i, j], name='C')\n    BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope=scope, data_alignment=32, offset_factor=256)\n\n    def intrin_func(ins, outs):\n        ib = tvm.tir.ir_builder.create()\n\n        BA = ins[0]\n        BC = o [...]
+        "def intrin_wmma_load_matrix(scope):\n    n = 16\n    A = te.placeholder((n, n), name=\"A\", dtype=\"float16\")\n    BA = tvm.tir.decl_buffer(A.shape, A.dtype, scope=\"shared\", data_alignment=32, offset_factor=256)\n    C = te.compute((n, n), lambda i, j: A[i, j], name=\"C\")\n    BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope=scope, data_alignment=32, offset_factor=256)\n\n    def intrin_func(ins, outs):\n        ib = tvm.tir.ir_builder.create()\n\n        BA = ins[0]\n       [...]
       ]
     },
     {
@@ -94,7 +94,7 @@
       },
       "outputs": [],
       "source": [
-        "# Define tiling sizes\nblock_row_warps = 4\nblock_col_warps = 2\nwarp_row_tiles = 2\nwarp_col_tiles = 4\nwarp_size = 32\nchunk = 2\n\nblock_x = te.thread_axis('blockIdx.x')\nblock_y = te.thread_axis('blockIdx.y')\nblock_z = te.thread_axis('blockIdx.z')\nthread_x = te.thread_axis('threadIdx.x')\nthread_y = te.thread_axis('threadIdx.y')\nthread_z = te.thread_axis('threadIdx.z')\n\nnc, hc, wc, oc, nnc, ooc = Conv.op.axis\nblock_k = s[Conv].fuse(hc, wc)\ns[Conv].bind(block_k, block_ [...]
+        "# Define tiling sizes\nblock_row_warps = 4\nblock_col_warps = 2\nwarp_row_tiles = 2\nwarp_col_tiles = 4\nwarp_size = 32\nchunk = 2\n\nblock_x = te.thread_axis(\"blockIdx.x\")\nblock_y = te.thread_axis(\"blockIdx.y\")\nblock_z = te.thread_axis(\"blockIdx.z\")\nthread_x = te.thread_axis(\"threadIdx.x\")\nthread_y = te.thread_axis(\"threadIdx.y\")\nthread_z = te.thread_axis(\"threadIdx.z\")\n\nnc, hc, wc, oc, nnc, ooc = Conv.op.axis\nblock_k = s[Conv].fuse(hc, wc)\ns[Conv].bind(blo [...]
       ]
     },
     {
@@ -112,7 +112,7 @@
       },
       "outputs": [],
       "source": [
-        "s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_a'))\ns[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_b'))\ns[Conv].tensorize(nnc, intrin_wmma_store_matrix())\ns[ConvF].tensorize(nnf, intrin_wmma_gemm())\nprint(tvm.lower(s, [A, W, Conv], simple_mode=True))"
+        "s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix(\"wmma.matrix_a\"))\ns[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix(\"wmma.matrix_b\"))\ns[Conv].tensorize(nnc, intrin_wmma_store_matrix())\ns[ConvF].tensorize(nnf, intrin_wmma_gemm())\nprint(tvm.lower(s, [A, W, Conv], simple_mode=True))"
       ]
     },
     {
@@ -130,7 +130,7 @@
       },
       "outputs": [],
       "source": [
-        "ctx = tvm.gpu(0)\nif nvcc.have_tensorcore(ctx.compute_version):\n    with tvm.transform.PassContext(config={\"tir.UnrollLoop\": {\n        \"auto_max_step\": 16\n    }}):\n        func = tvm.build(s, [A, W, Conv], 'cuda')\n    a_np = np.random.uniform(size=data_shape).astype(A.dtype)\n    w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)\n    a = tvm.nd.array(a_np, ctx)\n    w = tvm.nd.array(w_np, ctx)\n    c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx [...]
+        "ctx = tvm.gpu(0)\nif nvcc.have_tensorcore(ctx.compute_version):\n    with tvm.transform.PassContext(config={\"tir.UnrollLoop\": {\"auto_max_step\": 16}}):\n        func = tvm.build(s, [A, W, Conv], \"cuda\")\n    a_np = np.random.uniform(size=data_shape).astype(A.dtype)\n    w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)\n    a = tvm.nd.array(a_np, ctx)\n    w = tvm.nd.array(w_np, ctx)\n    c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx)\n    evaluat [...]
       ]
     },
     {
diff --git a/docs/_downloads/9a950897eeef498440fbe2f0afe2601f/tedd.py b/docs/_downloads/9a950897eeef498440fbe2f0afe2601f/tedd.py
index 7edcde9..e0b8038 100644
--- a/docs/_downloads/9a950897eeef498440fbe2f0afe2601f/tedd.py
+++ b/docs/_downloads/9a950897eeef498440fbe2f0afe2601f/tedd.py
@@ -56,13 +56,13 @@ num_filter = 256
 kernel = 3
 stride = 1
 padding = "SAME"
-dilation=1
+dilation = 1
 
-A = te.placeholder((in_size, in_size, in_channel, batch), name='A')
-W = te.placeholder((kernel, kernel, in_channel, num_filter), name='W')
-B = te.placeholder((1, num_filter, 1), name='bias')
+A = te.placeholder((in_size, in_size, in_channel, batch), name="A")
+W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W")
+B = te.placeholder((1, num_filter, 1), name="bias")
 
-with tvm.target.create("llvm"):
+with tvm.target.Target("llvm"):
     t_conv = topi.nn.conv2d_hwcn(A, W, stride, padding, dilation)
     t_bias = topi.add(t_conv, B)
     t_relu = topi.nn.relu(t_bias)
@@ -77,8 +77,8 @@ with tvm.target.create("llvm"):
 # to render SVG figures showing in notebook directly.
 #
 
-tedd.viz_dataflow_graph(s, dot_file_path = '/tmp/dfg.dot')
-#tedd.viz_dataflow_graph(s, show_svg = True)
+tedd.viz_dataflow_graph(s, dot_file_path="/tmp/dfg.dot")
+# tedd.viz_dataflow_graph(s, show_svg = True)
 
 ######################################################################
 # .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/tedd_dfg.png
@@ -89,8 +89,8 @@ tedd.viz_dataflow_graph(s, dot_file_path = '/tmp/dfg.dot')
 # Edges show nodes' dependency.
 #
 
-tedd.viz_schedule_tree(s, dot_file_path = '/tmp/scheduletree.dot')
-#tedd.viz_schedule_tree(s, show_svg = True)
+tedd.viz_schedule_tree(s, dot_file_path="/tmp/scheduletree.dot")
+# tedd.viz_schedule_tree(s, show_svg = True)
 
 ######################################################################
 # We just rendered the schedule tree graph.  You may notice a warning about ranges not
@@ -101,8 +101,8 @@ tedd.viz_schedule_tree(s, dot_file_path = '/tmp/scheduletree.dot')
 #
 
 s = s.normalize()
-tedd.viz_schedule_tree(s, dot_file_path = '/tmp/scheduletree2.dot')
-#tedd.viz_schedule_tree(s, show_svg = True)
+tedd.viz_schedule_tree(s, dot_file_path="/tmp/scheduletree2.dot")
+# tedd.viz_schedule_tree(s, show_svg = True)
 
 ######################################################################
 # .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/tedd_st.png
@@ -134,8 +134,8 @@ tedd.viz_schedule_tree(s, dot_file_path = '/tmp/scheduletree2.dot')
 #   omitted, making every stage a block, for better readability.
 #
 
-tedd.viz_itervar_relationship_graph(s, dot_file_path = '/tmp/itervar.dot')
-#tedd.viz_itervar_relationship_graph(s, show_svg = True)
+tedd.viz_itervar_relationship_graph(s, dot_file_path="/tmp/itervar.dot")
+# tedd.viz_itervar_relationship_graph(s, show_svg = True)
 
 ######################################################################
 # .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/tedd_itervar_rel.png
diff --git a/docs/_downloads/9b0365fd5723f7c4d4e996637ab9a487/intro_topi.py b/docs/_downloads/9b0365fd5723f7c4d4e996637ab9a487/intro_topi.py
index 5938b69..c9812ff 100644
--- a/docs/_downloads/9b0365fd5723f7c4d4e996637ab9a487/intro_topi.py
+++ b/docs/_downloads/9b0365fd5723f7c4d4e996637ab9a487/intro_topi.py
@@ -39,7 +39,7 @@ import numpy as np
 #
 n = te.var("n")
 m = te.var("m")
-A = te.placeholder((n, m), name='A')
+A = te.placeholder((n, m), name="A")
 k = te.reduce_axis((0, m), "k")
 B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
 s = te.create_schedule(B.op)
@@ -97,7 +97,7 @@ print(sg.stages)
 ######################################################################
 # We can test the correctness by comparing with :code:`numpy` result as follows
 #
-func = tvm.build(sg, [a, b, g], 'cuda')
+func = tvm.build(sg, [a, b, g], "cuda")
 ctx = tvm.gpu(0)
 a_np = np.random.uniform(size=(x, y, y)).astype(a.dtype)
 b_np = np.random.uniform(size=(y, y)).astype(b.dtype)
@@ -113,7 +113,7 @@ tvm.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-5)
 #
 tarray = te.placeholder((512, 512), name="tarray")
 softmax_topi = topi.nn.softmax(tarray)
-with tvm.target.create("cuda"):
+with tvm.target.Target("cuda"):
     sst = topi.cuda.schedule_softmax(softmax_topi)
     print(tvm.lower(sst, [tarray], simple_mode=True))
 
@@ -133,7 +133,7 @@ with tvm.target.create("cuda"):
 data = te.placeholder((1, 3, 224, 224))
 kernel = te.placeholder((10, 3, 5, 5))
 
-with tvm.target.create("cuda"):
+with tvm.target.Target("cuda"):
     conv = topi.cuda.conv2d_nchw(data, kernel, 1, 2, 1)
     out = topi.nn.relu(conv)
     sconv = topi.cuda.schedule_conv2d_nchw([out])
diff --git a/docs/_downloads/9cf0213876be0a9cc4aaa52a1ebd9586/low_level_custom_pass.py b/docs/_downloads/9cf0213876be0a9cc4aaa52a1ebd9586/low_level_custom_pass.py
index 17f864f..44fe59f 100644
--- a/docs/_downloads/9cf0213876be0a9cc4aaa52a1ebd9586/low_level_custom_pass.py
+++ b/docs/_downloads/9cf0213876be0a9cc4aaa52a1ebd9586/low_level_custom_pass.py
@@ -50,12 +50,12 @@ import numpy as np
 #
 
 n = tvm.tir.const(128, "int32")
-a = te.placeholder((n, ), name="a")
-b = te.placeholder((n, ), name="b")
-c = te.compute((n, ), lambda i: a[i] + b[i], name='c')
+a = te.placeholder((n,), name="a")
+b = te.placeholder((n,), name="b")
+c = te.compute((n,), lambda i: a[i] + b[i], name="c")
 
 sch = te.create_schedule(c.op)
-ir  = tvm.lower(sch, [a, b, c])
+ir = tvm.lower(sch, [a, b, c])
 print(ir)
 
 ######################################################################
@@ -83,6 +83,8 @@ print(ir)
 #
 
 loops = []
+
+
 def find_width8(op):
     """ Find all the 'tir.For' nodes whose extent can be divided by 8. """
     if isinstance(op, tvm.tir.For):
@@ -90,6 +92,7 @@ def find_width8(op):
             if op.extent.value % 8 == 0:
                 loops.append(op)
 
+
 #####################################################################
 # IR Transformation
 # ~~~~~~~~~~~~~~~~~
@@ -105,18 +108,20 @@ def find_width8(op):
 #     function will be skipped.
 #
 
+
 def vectorize8(op):
     """ Split can vectorize the loops found in `find_width8`. """
     if op in loops:
         extent = op.extent.value
         name = op.loop_var.name
-        lo, li = te.var(name + '.outer'), te.var(name + '.inner')
+        lo, li = te.var(name + ".outer"), te.var(name + ".inner")
         body = tvm.tir.stmt_functor.substitute(op.body, {op.loop_var: lo * 8 + li})
         body = tvm.tir.For(li, 0, 8, tvm.tir.For.Vectorized, 0, body)
         body = tvm.tir.For(lo, 0, extent // 8, tvm.tir.For.Serial, 0, body)
         return body
     return None
 
+
 @tvm.tir.transform.prim_func_pass(opt_level=0)
 def vectorize(f, mod, ctx):
     global loops
@@ -128,8 +133,7 @@ def vectorize(f, mod, ctx):
 
     # The last list argument indicates what kinds of nodes will be transformed.
     # Thus, in this case only `For` nodes will call `vectorize8`
-    return f.with_body(
-        tvm.tir.stmt_functor.ir_transform(f.body, None, vectorize8, ['tir.For']))
+    return f.with_body(tvm.tir.stmt_functor.ir_transform(f.body, None, vectorize8, ["tir.For"]))
 
 
 #####################################################################
diff --git a/docs/_downloads/9fc6ca792ca6e47e4bdc761ee1b98501/use_pass_infra.py b/docs/_downloads/9fc6ca792ca6e47e4bdc761ee1b98501/use_pass_infra.py
index 4b842b9..b16eb93 100644
--- a/docs/_downloads/9fc6ca792ca6e47e4bdc761ee1b98501/use_pass_infra.py
+++ b/docs/_downloads/9fc6ca792ca6e47e4bdc761ee1b98501/use_pass_infra.py
@@ -52,11 +52,12 @@ import tvm.relay as relay
 # will be used by various optimizations of the examples in this tutorial.
 # Similarly, users can write a tir primitive function and apply the tir passes.
 
+
 def example():
     shape = (1, 64, 54, 54)
     c_data = np.empty(shape).astype("float32")
     c = relay.const(c_data)
-    weight = relay.var('weight', shape=(64, 64, 3, 3))
+    weight = relay.var("weight", shape=(64, 64, 3, 3))
     x = relay.var("x", relay.TensorType((1, 64, 56, 56), "float32"))
     conv = relay.nn.conv2d(x, weight)
     y = relay.add(c, c)
@@ -67,18 +68,21 @@ def example():
     z2 = relay.add(z, z1)
     return relay.Function([x, weight], z2)
 
+
 ###############################################################################
 # Let us register layout alteration for a conv2d op so that we can apply the
 # layout alteration pass on the example. How the alter layout pass works is
 # beyond the scope of this tutorial.
 
+
 @relay.op.register_alter_op_layout("nn.conv2d", level=101)
 def alter_conv2d(attrs, inputs, tinfos, out_type):
     data, weight = inputs
     new_attrs = dict(attrs)
-    new_attrs['data_layout'] = 'NCHW16c'
+    new_attrs["data_layout"] = "NCHW16c"
     return relay.nn.conv2d(data, weight, **new_attrs)
 
+
 ###############################################################################
 # Optimize the Program
 # --------------------
@@ -148,9 +152,13 @@ print(mod)
 f = example()
 mod = tvm.IRModule.from_expr(f)
 # Glob the interested passes.
-seq = tvm.transform.Sequential([relay.transform.FoldConstant(),
-                                  relay.transform.EliminateCommonSubexpr(),
-                                  relay.transform.FuseOps(fuse_opt_level=2)])
+seq = tvm.transform.Sequential(
+    [
+        relay.transform.FoldConstant(),
+        relay.transform.EliminateCommonSubexpr(),
+        relay.transform.FuseOps(fuse_opt_level=2),
+    ]
+)
 mod1 = seq(mod)
 print(mod1)
 
@@ -191,7 +199,7 @@ print(mod4)
 
 seq1 = tvm.transform.Sequential([relay.transform.AlterOpLayout()])
 with tvm.transform.PassContext(opt_level=3):
-    with tvm.target.create("llvm"):
+    with tvm.target.Target("llvm"):
         mod5 = seq1(mod)
 print(mod5)
 
@@ -207,6 +215,7 @@ print(mod5)
 # visited and each constant in the function will be replaced when we invoke the
 # customized pass.
 
+
 @relay.transform.function_pass(opt_level=1)
 class CustomPipeline:
     """Simple test function to replace one argument to another."""
@@ -221,8 +230,10 @@ class CustomPipeline:
         class ReplaceConstant(tvm.relay.ExprMutator):
             def visit_constant(self, c):
                 return relay.multiply(obj.multiplier, c)
+
         return ReplaceConstant().visit(func)
 
+
 f = example()
 mod = tvm.IRModule.from_expr(f)
 custom_pass = CustomPipeline(multiplier=relay.const(3, "float32"))
@@ -240,16 +251,20 @@ print(mod3)
 
 f = example()
 mod = tvm.IRModule.from_expr(f)
-seq = tvm.transform.Sequential([relay.transform.FoldConstant(),
-                                tvm.transform.PrintIR(),
-                                relay.transform.EliminateCommonSubexpr(),
-                                relay.transform.FuseOps(),
-                                relay.transform.AlterOpLayout()])
+seq = tvm.transform.Sequential(
+    [
+        relay.transform.FoldConstant(),
+        tvm.transform.PrintIR(),
+        relay.transform.EliminateCommonSubexpr(),
+        relay.transform.FuseOps(),
+        relay.transform.AlterOpLayout(),
+    ]
+)
 
 # By inserting the ``PrintIR`` pass after ``FoldConstant``, the pass infra will
 # dump out the module IR when ``FoldConstant`` is done. Users can plug in this
 # pass after any pass they want to debug for viewing the optimization effect.
-# 
+#
 # There is a more flexible debugging mechanism also exposed by the build configuration
 # object. One can pass a tracing function which can be used to execute arbitrary code
+# before and/or after each pass. A tracing function will receive a :py:class:`tvm.IRModule`,
@@ -257,14 +272,16 @@ seq = tvm.transform.Sequential([relay.transform.FoldConstant(),
 # and a boolean indicating whether you are executing before, or after a pass.
 # An example is below.
 
+
 def print_ir(mod, info, is_before):
     """Print the name of the pass, the IR, only before passes execute."""
     if is_before:
         print("Running pass: {}", info)
         print(mod)
 
+
 with tvm.transform.PassContext(opt_level=3, trace=print_ir):
-    with tvm.target.create("llvm"):
+    with tvm.target.Target("llvm"):
         # Perform the optimizations.
         mod = seq(mod)
 print(mod)
diff --git a/docs/_downloads/a2f661bf234a167b5458fa28d8fafedc/tedd.ipynb b/docs/_downloads/a2f661bf234a167b5458fa28d8fafedc/tedd.ipynb
index cb4323a..9dbb495 100644
--- a/docs/_downloads/a2f661bf234a167b5458fa28d8fafedc/tedd.ipynb
+++ b/docs/_downloads/a2f661bf234a167b5458fa28d8fafedc/tedd.ipynb
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "batch = 1\nin_channel = 256\nin_size = 32\nnum_filter = 256\nkernel = 3\nstride = 1\npadding = \"SAME\"\ndilation=1\n\nA = te.placeholder((in_size, in_size, in_channel, batch), name='A')\nW = te.placeholder((kernel, kernel, in_channel, num_filter), name='W')\nB = te.placeholder((1, num_filter, 1), name='bias')\n\nwith tvm.target.create(\"llvm\"):\n    t_conv = topi.nn.conv2d_hwcn(A, W, stride, padding, dilation)\n    t_bias = topi.add(t_conv, B)\n    t_relu = topi.nn.relu(t_bias [...]
+        "batch = 1\nin_channel = 256\nin_size = 32\nnum_filter = 256\nkernel = 3\nstride = 1\npadding = \"SAME\"\ndilation = 1\n\nA = te.placeholder((in_size, in_size, in_channel, batch), name=\"A\")\nW = te.placeholder((kernel, kernel, in_channel, num_filter), name=\"W\")\nB = te.placeholder((1, num_filter, 1), name=\"bias\")\n\nwith tvm.target.Target(\"llvm\"):\n    t_conv = topi.nn.conv2d_hwcn(A, W, stride, padding, dilation)\n    t_bias = topi.add(t_conv, B)\n    t_relu = topi.nn.rel [...]
       ]
     },
     {
@@ -62,7 +62,7 @@
       },
       "outputs": [],
       "source": [
-        "tedd.viz_dataflow_graph(s, dot_file_path = '/tmp/dfg.dot')\n#tedd.viz_dataflow_graph(s, show_svg = True)"
+        "tedd.viz_dataflow_graph(s, dot_file_path=\"/tmp/dfg.dot\")\n# tedd.viz_dataflow_graph(s, show_svg = True)"
       ]
     },
     {
@@ -80,7 +80,7 @@
       },
       "outputs": [],
       "source": [
-        "tedd.viz_schedule_tree(s, dot_file_path = '/tmp/scheduletree.dot')\n#tedd.viz_schedule_tree(s, show_svg = True)"
+        "tedd.viz_schedule_tree(s, dot_file_path=\"/tmp/scheduletree.dot\")\n# tedd.viz_schedule_tree(s, show_svg = True)"
       ]
     },
     {
@@ -98,7 +98,7 @@
       },
       "outputs": [],
       "source": [
-        "s = s.normalize()\ntedd.viz_schedule_tree(s, dot_file_path = '/tmp/scheduletree2.dot')\n#tedd.viz_schedule_tree(s, show_svg = True)"
+        "s = s.normalize()\ntedd.viz_schedule_tree(s, dot_file_path=\"/tmp/scheduletree2.dot\")\n# tedd.viz_schedule_tree(s, show_svg = True)"
       ]
     },
     {
@@ -123,7 +123,7 @@
       },
       "outputs": [],
       "source": [
-        "tedd.viz_itervar_relationship_graph(s, dot_file_path = '/tmp/itervar.dot')\n#tedd.viz_itervar_relationship_graph(s, show_svg = True)"
+        "tedd.viz_itervar_relationship_graph(s, dot_file_path=\"/tmp/itervar.dot\")\n# tedd.viz_itervar_relationship_graph(s, show_svg = True)"
       ]
     },
     {
diff --git a/docs/_downloads/a7ce44923ffcc359fd2e532ac1f62c9e/from_darknet.py b/docs/_downloads/a7ce44923ffcc359fd2e532ac1f62c9e/from_darknet.py
index c49fc8b..bbfb410 100644
--- a/docs/_downloads/a7ce44923ffcc359fd2e532ac1f62c9e/from_darknet.py
+++ b/docs/_downloads/a7ce44923ffcc359fd2e532ac1f62c9e/from_darknet.py
@@ -52,28 +52,28 @@ import tvm.relay.testing.darknet
 # Models are: 'yolov2', 'yolov3' or 'yolov3-tiny'
 
 # Model name
-MODEL_NAME = 'yolov3'
+MODEL_NAME = "yolov3"
 
 ######################################################################
 # Download required files
 # -----------------------
 # Download cfg and weights file if first time.
-CFG_NAME = MODEL_NAME + '.cfg'
-WEIGHTS_NAME = MODEL_NAME + '.weights'
-REPO_URL = 'https://github.com/dmlc/web-data/blob/master/darknet/'
-CFG_URL = REPO_URL + 'cfg/' + CFG_NAME + '?raw=true'
-WEIGHTS_URL = 'https://pjreddie.com/media/files/' + WEIGHTS_NAME
+CFG_NAME = MODEL_NAME + ".cfg"
+WEIGHTS_NAME = MODEL_NAME + ".weights"
+REPO_URL = "https://github.com/dmlc/web-data/blob/master/darknet/"
+CFG_URL = REPO_URL + "cfg/" + CFG_NAME + "?raw=true"
+WEIGHTS_URL = "https://pjreddie.com/media/files/" + WEIGHTS_NAME
 
 cfg_path = download_testdata(CFG_URL, CFG_NAME, module="darknet")
 weights_path = download_testdata(WEIGHTS_URL, WEIGHTS_NAME, module="darknet")
 
 # Download and Load darknet library
-if sys.platform in ['linux', 'linux2']:
-    DARKNET_LIB = 'libdarknet2.0.so'
-    DARKNET_URL = REPO_URL + 'lib/' + DARKNET_LIB + '?raw=true'
-elif sys.platform == 'darwin':
-    DARKNET_LIB = 'libdarknet_mac2.0.so'
-    DARKNET_URL = REPO_URL + 'lib_osx/' + DARKNET_LIB + '?raw=true'
+if sys.platform in ["linux", "linux2"]:
+    DARKNET_LIB = "libdarknet2.0.so"
+    DARKNET_URL = REPO_URL + "lib/" + DARKNET_LIB + "?raw=true"
+elif sys.platform == "darwin":
+    DARKNET_LIB = "libdarknet_mac2.0.so"
+    DARKNET_URL = REPO_URL + "lib_osx/" + DARKNET_LIB + "?raw=true"
 else:
     err = "Darknet lib is not supported on {} platform".format(sys.platform)
     raise NotImplementedError(err)
@@ -81,12 +81,12 @@ else:
 lib_path = download_testdata(DARKNET_URL, DARKNET_LIB, module="darknet")
 
 DARKNET_LIB = __darknetffi__.dlopen(lib_path)
-net = DARKNET_LIB.load_network(cfg_path.encode('utf-8'), weights_path.encode('utf-8'), 0)
-dtype = 'float32'
+net = DARKNET_LIB.load_network(cfg_path.encode("utf-8"), weights_path.encode("utf-8"), 0)
+dtype = "float32"
 batch_size = 1
 
 data = np.empty([batch_size, net.c, net.h, net.w], dtype)
-shape_dict = {'data': data.shape}
+shape_dict = {"data": data.shape}
 print("Converting darknet to relay functions...")
 mod, params = relay.frontend.from_darknet(net, dtype=dtype, shape=data.shape)
 
@@ -94,22 +94,22 @@ mod, params = relay.frontend.from_darknet(net, dtype=dtype, shape=data.shape)
 # Import the graph to Relay
 # -------------------------
 # compile the model
-target = 'llvm'
-target_host = 'llvm'
+target = "llvm"
+target_host = "llvm"
 ctx = tvm.cpu(0)
 data = np.empty([batch_size, net.c, net.h, net.w], dtype)
-shape = {'data': data.shape}
+shape = {"data": data.shape}
 print("Compiling the model...")
 with tvm.transform.PassContext(opt_level=3):
     lib = relay.build(mod, target=target, target_host=target_host, params=params)
 
-[neth, netw] = shape['data'][2:] # Current image shape is 608x608
+[neth, netw] = shape["data"][2:]  # Current image shape is 608x608
 ######################################################################
 # Load a test image
 # -----------------
-test_image = 'dog.jpg'
+test_image = "dog.jpg"
 print("Loading the test image...")
-img_url = REPO_URL + 'data/' + test_image + '?raw=true'
+img_url = REPO_URL + "data/" + test_image + "?raw=true"
 img_path = download_testdata(img_url, test_image, "data")
 
 data = tvm.relay.testing.darknet.load_image(img_path, netw, neth)
@@ -119,10 +119,10 @@ data = tvm.relay.testing.darknet.load_image(img_path, netw, neth)
 # The process is no different from other examples.
 from tvm.contrib import graph_runtime
 
-m = graph_runtime.GraphModule(lib['default'](ctx))
+m = graph_runtime.GraphModule(lib["default"](ctx))
 
 # set inputs
-m.set_input('data', tvm.nd.array(data.astype(dtype)))
+m.set_input("data", tvm.nd.array(data.astype(dtype)))
 # execute
 print("Running the test image...")
 
@@ -134,69 +134,69 @@ nms_thresh = 0.45
 m.run()
 # get outputs
 tvm_out = []
-if MODEL_NAME == 'yolov2':
+if MODEL_NAME == "yolov2":
     layer_out = {}
-    layer_out['type'] = 'Region'
+    layer_out["type"] = "Region"
     # Get the region layer attributes (n, out_c, out_h, out_w, classes, coords, background)
     layer_attr = m.get_output(2).asnumpy()
-    layer_out['biases'] = m.get_output(1).asnumpy()
-    out_shape = (layer_attr[0], layer_attr[1]//layer_attr[0],
-                 layer_attr[2], layer_attr[3])
-    layer_out['output'] = m.get_output(0).asnumpy().reshape(out_shape)
-    layer_out['classes'] = layer_attr[4]
-    layer_out['coords'] = layer_attr[5]
-    layer_out['background'] = layer_attr[6]
+    layer_out["biases"] = m.get_output(1).asnumpy()
+    out_shape = (layer_attr[0], layer_attr[1] // layer_attr[0], layer_attr[2], layer_attr[3])
+    layer_out["output"] = m.get_output(0).asnumpy().reshape(out_shape)
+    layer_out["classes"] = layer_attr[4]
+    layer_out["coords"] = layer_attr[5]
+    layer_out["background"] = layer_attr[6]
     tvm_out.append(layer_out)
 
-elif MODEL_NAME == 'yolov3':
+elif MODEL_NAME == "yolov3":
     for i in range(3):
         layer_out = {}
-        layer_out['type'] = 'Yolo'
+        layer_out["type"] = "Yolo"
         # Get the yolo layer attributes (n, out_c, out_h, out_w, classes, total)
-        layer_attr = m.get_output(i*4+3).asnumpy()
-        layer_out['biases'] = m.get_output(i*4+2).asnumpy()
-        layer_out['mask'] = m.get_output(i*4+1).asnumpy()
-        out_shape = (layer_attr[0], layer_attr[1]//layer_attr[0],
-                     layer_attr[2], layer_attr[3])
-        layer_out['output'] = m.get_output(i*4).asnumpy().reshape(out_shape)
-        layer_out['classes'] = layer_attr[4]
+        layer_attr = m.get_output(i * 4 + 3).asnumpy()
+        layer_out["biases"] = m.get_output(i * 4 + 2).asnumpy()
+        layer_out["mask"] = m.get_output(i * 4 + 1).asnumpy()
+        out_shape = (layer_attr[0], layer_attr[1] // layer_attr[0], layer_attr[2], layer_attr[3])
+        layer_out["output"] = m.get_output(i * 4).asnumpy().reshape(out_shape)
+        layer_out["classes"] = layer_attr[4]
         tvm_out.append(layer_out)
 
-elif MODEL_NAME == 'yolov3-tiny':
+elif MODEL_NAME == "yolov3-tiny":
     for i in range(2):
         layer_out = {}
-        layer_out['type'] = 'Yolo'
+        layer_out["type"] = "Yolo"
         # Get the yolo layer attributes (n, out_c, out_h, out_w, classes, total)
-        layer_attr = m.get_output(i*4+3).asnumpy()
-        layer_out['biases'] = m.get_output(i*4+2).asnumpy()
-        layer_out['mask'] = m.get_output(i*4+1).asnumpy()
-        out_shape = (layer_attr[0], layer_attr[1]//layer_attr[0],
-                     layer_attr[2], layer_attr[3])
-        layer_out['output'] = m.get_output(i*4).asnumpy().reshape(out_shape)
-        layer_out['classes'] = layer_attr[4]
+        layer_attr = m.get_output(i * 4 + 3).asnumpy()
+        layer_out["biases"] = m.get_output(i * 4 + 2).asnumpy()
+        layer_out["mask"] = m.get_output(i * 4 + 1).asnumpy()
+        out_shape = (layer_attr[0], layer_attr[1] // layer_attr[0], layer_attr[2], layer_attr[3])
+        layer_out["output"] = m.get_output(i * 4).asnumpy().reshape(out_shape)
+        layer_out["classes"] = layer_attr[4]
         tvm_out.append(layer_out)
         thresh = 0.560
 
 # do the detection and bring up the bounding boxes
 img = tvm.relay.testing.darknet.load_image_color(img_path)
 _, im_h, im_w = img.shape
-dets = tvm.relay.testing.yolo_detection.fill_network_boxes((netw, neth), (im_w, im_h), thresh,
-                                                      1, tvm_out)
+dets = tvm.relay.testing.yolo_detection.fill_network_boxes(
+    (netw, neth), (im_w, im_h), thresh, 1, tvm_out
+)
 last_layer = net.layers[net.n - 1]
 tvm.relay.testing.yolo_detection.do_nms_sort(dets, last_layer.classes, nms_thresh)
 
-coco_name = 'coco.names'
-coco_url = REPO_URL + 'data/' + coco_name + '?raw=true'
-font_name = 'arial.ttf'
-font_url = REPO_URL + 'data/' + font_name + '?raw=true'
-coco_path = download_testdata(coco_url, coco_name, module='data')
-font_path = download_testdata(font_url, font_name, module='data')
+coco_name = "coco.names"
+coco_url = REPO_URL + "data/" + coco_name + "?raw=true"
+font_name = "arial.ttf"
+font_url = REPO_URL + "data/" + font_name + "?raw=true"
+coco_path = download_testdata(coco_url, coco_name, module="data")
+font_path = download_testdata(font_url, font_name, module="data")
 
 with open(coco_path) as f:
     content = f.readlines()
 
 names = [x.strip() for x in content]
 
-tvm.relay.testing.yolo_detection.draw_detections(font_path, img, dets, thresh, names, last_layer.classes)
+tvm.relay.testing.yolo_detection.draw_detections(
+    font_path, img, dets, thresh, names, last_layer.classes
+)
 plt.imshow(img.transpose(1, 2, 0))
 plt.show()
diff --git a/docs/_downloads/b4d760859f6d9338f70bdb79ddfa3aa8/opt_conv_cuda.py b/docs/_downloads/b4d760859f6d9338f70bdb79ddfa3aa8/opt_conv_cuda.py
index 025e53e..f50d302 100644
--- a/docs/_downloads/b4d760859f6d9338f70bdb79ddfa3aa8/opt_conv_cuda.py
+++ b/docs/_downloads/b4d760859f6d9338f70bdb79ddfa3aa8/opt_conv_cuda.py
@@ -54,28 +54,31 @@ pad = 1
 stride = 1
 
 # Algorithm
-A = te.placeholder((in_size, in_size, in_channel, batch), name='A')
-W = te.placeholder((kernel, kernel, in_channel, out_channel), name='W')
-out_size = (in_size - kernel + 2*pad) // stride + 1
+A = te.placeholder((in_size, in_size, in_channel, batch), name="A")
+W = te.placeholder((kernel, kernel, in_channel, out_channel), name="W")
+out_size = (in_size - kernel + 2 * pad) // stride + 1
 # Pad input
 Apad = te.compute(
-    (in_size + 2*pad, in_size + 2*pad, in_channel, batch),
+    (in_size + 2 * pad, in_size + 2 * pad, in_channel, batch),
     lambda yy, xx, cc, nn: tvm.tir.if_then_else(
-        tvm.tir.all(yy >= pad, yy - pad < in_size,
-                xx >= pad, xx - pad < in_size),
-        A[yy - pad, xx - pad, cc, nn], tvm.tir.const(0., "float32")),
-    name='Apad')
+        tvm.tir.all(yy >= pad, yy - pad < in_size, xx >= pad, xx - pad < in_size),
+        A[yy - pad, xx - pad, cc, nn],
+        tvm.tir.const(0.0, "float32"),
+    ),
+    name="Apad",
+)
 # Create reduction variables
-rc = te.reduce_axis((0, in_channel), name='rc')
-ry = te.reduce_axis((0, kernel), name='ry')
-rx = te.reduce_axis((0, kernel), name='rx')
+rc = te.reduce_axis((0, in_channel), name="rc")
+ry = te.reduce_axis((0, kernel), name="ry")
+rx = te.reduce_axis((0, kernel), name="rx")
 # Compute the convolution
 B = te.compute(
     (out_size, out_size, out_channel, batch),
     lambda yy, xx, ff, nn: te.sum(
-        Apad[yy * stride + ry, xx * stride + rx, rc, nn] * W[ry, rx, rc, ff],
-        axis=[ry, rx, rc]),
-    name='B')
+        Apad[yy * stride + ry, xx * stride + rx, rc, nn] * W[ry, rx, rc, ff], axis=[ry, rx, rc]
+    ),
+    name="B",
+)
 
 
 ###############################################################################
@@ -103,8 +106,8 @@ B = te.compute(
 
 # Designate the memory hierarchy
 s = te.create_schedule(B.op)
-s[Apad].compute_inline() # compute Apad inline
-AA = s.cache_read(Apad, 'shared', [B])
+s[Apad].compute_inline()  # compute Apad inline
+AA = s.cache_read(Apad, "shared", [B])
 WW = s.cache_read(W, "shared", [B])
 AL = s.cache_read(AA, "local", [B])
 WL = s.cache_read(WW, "local", [B])
@@ -234,7 +237,7 @@ s[WW].vectorize(fi)  # vectorize memory load
 # latency of convolution.
 #
 
-func = tvm.build(s, [A, W, B], 'cuda')
+func = tvm.build(s, [A, W, B], "cuda")
 ctx = tvm.gpu(0)
 a_np = np.random.uniform(size=(in_size, in_size, in_channel, batch)).astype(A.dtype)
 w_np = np.random.uniform(size=(kernel, kernel, in_channel, out_channel)).astype(W.dtype)
@@ -243,4 +246,4 @@ w = tvm.nd.array(w_np, ctx)
 b = tvm.nd.array(np.zeros((out_size, out_size, out_channel, batch), dtype=B.dtype), ctx)
 func(a, w, b)
 evaluator = func.time_evaluator(func.entry_name, ctx, number=1)
-print('Convolution: %f ms' % (evaluator(a, w, b).mean * 1e3))
+print("Convolution: %f ms" % (evaluator(a, w, b).mean * 1e3))
diff --git a/docs/_downloads/b9891d1a23f84eec3271025d99d005f7/tune_relay_x86.ipynb b/docs/_downloads/b9891d1a23f84eec3271025d99d005f7/tune_relay_x86.ipynb
index 99ab3fd..ec69e18 100644
--- a/docs/_downloads/b9891d1a23f84eec3271025d99d005f7/tune_relay_x86.ipynb
+++ b/docs/_downloads/b9891d1a23f84eec3271025d99d005f7/tune_relay_x86.ipynb
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "def get_network(name, batch_size):\n    \"\"\"Get the symbol definition and random weight of a network\"\"\"\n    input_shape = (batch_size, 3, 224, 224)\n    output_shape = (batch_size, 1000)\n\n    if \"resnet\" in name:\n        n_layer = int(name.split('-')[1])\n        mod, params = relay.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)\n    elif \"vgg\" in name:\n        n_layer = int(name.split('-')[1])\n        mod, params = relay.testi [...]
+        "def get_network(name, batch_size):\n    \"\"\"Get the symbol definition and random weight of a network\"\"\"\n    input_shape = (batch_size, 3, 224, 224)\n    output_shape = (batch_size, 1000)\n\n    if \"resnet\" in name:\n        n_layer = int(name.split(\"-\")[1])\n        mod, params = relay.testing.resnet.get_workload(\n            num_layers=n_layer, batch_size=batch_size, dtype=dtype\n        )\n    elif \"vgg\" in name:\n        n_layer = int(name.split(\"-\")[1])\n      [...]
       ]
     },
     {
@@ -62,7 +62,7 @@
       },
       "outputs": [],
       "source": [
-        "tuning_option = {\n    'log_filename': log_file,\n    'tuner': 'random',\n    'early_stopping': None,\n\n    'measure_option': autotvm.measure_option(\n        builder=autotvm.LocalBuilder(),\n        runner=autotvm.LocalRunner(number=1, repeat=10,\n                                   min_repeat_ms=0,\n                                   enable_cpu_cache_flush=True),\n    ),\n}\n\n\n# You can skip the implementation of this function for this tutorial.\ndef tune_kernels(tasks,\n    [...]
+        "tuning_option = {\n    \"log_filename\": log_file,\n    \"tuner\": \"random\",\n    \"early_stopping\": None,\n    \"measure_option\": autotvm.measure_option(\n        builder=autotvm.LocalBuilder(),\n        runner=autotvm.LocalRunner(\n            number=1, repeat=10, min_repeat_ms=0, enable_cpu_cache_flush=True\n        ),\n    ),\n}\n\n\n# You can skip the implementation of this function for this tutorial.\ndef tune_kernels(\n    tasks, measure_option, tuner=\"gridsearch\",  [...]
       ]
     },
     {
@@ -80,7 +80,7 @@
       },
       "outputs": [],
       "source": [
-        "def tune_and_evaluate(tuning_opt):\n    # extract workloads from relay program\n    print(\"Extract tasks...\")\n    mod, params, data_shape, out_shape = get_network(model_name, batch_size)\n    tasks = autotvm.task.extract_from_program(mod[\"main\"], target=target,\n                                              params=params,\n                                              ops=(relay.op.get(\"nn.conv2d\"),))\n\n    # run tuning tasks\n    tune_kernels(tasks, **tuning_opt)\n    t [...]
+        "def tune_and_evaluate(tuning_opt):\n    # extract workloads from relay program\n    print(\"Extract tasks...\")\n    mod, params, data_shape, out_shape = get_network(model_name, batch_size)\n    tasks = autotvm.task.extract_from_program(\n        mod[\"main\"], target=target, params=params, ops=(relay.op.get(\"nn.conv2d\"),)\n    )\n\n    # run tuning tasks\n    tune_kernels(tasks, **tuning_opt)\n    tune_graph(mod[\"main\"], data_shape, log_file, graph_opt_sch_file)\n\n    # co [...]
       ]
     },
     {
diff --git a/docs/_downloads/baa4de13ce6d932de43e0eb5c4cb8f16/tensorize.py b/docs/_downloads/baa4de13ce6d932de43e0eb5c4cb8f16/tensorize.py
index ac5b50f..601adb8 100644
--- a/docs/_downloads/baa4de13ce6d932de43e0eb5c4cb8f16/tensorize.py
+++ b/docs/_downloads/baa4de13ce6d932de43e0eb5c4cb8f16/tensorize.py
@@ -47,11 +47,10 @@ import numpy as np
 # The following lines describe the computation :code:`A * B^T` in TVM.
 #
 N, M, L = 1024, 512, 64
-A = te.placeholder((N, L), name='A')
-B = te.placeholder((M, L), name='B')
-k = te.reduce_axis((0, L), name='k')
-C = te.compute((N, M), lambda i, j:
-                te.sum(A[i, k] * B[j, k], axis=k), name='C')
+A = te.placeholder((N, L), name="A")
+B = te.placeholder((M, L), name="B")
+k = te.reduce_axis((0, L), name="k")
+C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[j, k], axis=k), name="C")
 s = te.create_schedule(C.op)
 print(tvm.lower(s, [A, B, C], simple_mode=True))
 
@@ -66,7 +65,7 @@ print(tvm.lower(s, [A, B, C], simple_mode=True))
 #
 factor = 16
 x, y = C.op.axis
-z, = C.op.reduce_axis
+(z,) = C.op.reduce_axis
 yo, yi = s[C].split(y, factor=factor)
 s[C].reorder(x, yo, yi, z)
 print(tvm.lower(s, [A, B, C], simple_mode=True))
@@ -89,34 +88,35 @@ print(tvm.lower(s, [A, B, C], simple_mode=True))
 # which is done in :code:`intrin_func` below.
 #
 def intrin_gemv(m, l):
-    a = te.placeholder((l,), name='a')
-    b = te.placeholder((m, l), name='b')
-    k = te.reduce_axis((0, l), name='k')
-    c = te.compute((m,), lambda i: te.sum(a[k] * b[i, k], axis=k), name='c')
-    Ab = tvm.tir.decl_buffer(a.shape, a.dtype,
-                         name="A",
-                         offset_factor=1,
-                         strides=[1])
-    Bb = tvm.tir.decl_buffer(b.shape, b.dtype,
-                         name="B",
-                         offset_factor=1,
-                         strides=[te.var("s1"), 1])
-    Cb = tvm.tir.decl_buffer(c.shape, c.dtype,
-                         name="C",
-                         offset_factor=1,
-                         strides=[1])
+    a = te.placeholder((l,), name="a")
+    b = te.placeholder((m, l), name="b")
+    k = te.reduce_axis((0, l), name="k")
+    c = te.compute((m,), lambda i: te.sum(a[k] * b[i, k], axis=k), name="c")
+    Ab = tvm.tir.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1, strides=[1])
+    Bb = tvm.tir.decl_buffer(b.shape, b.dtype, name="B", offset_factor=1, strides=[te.var("s1"), 1])
+    Cb = tvm.tir.decl_buffer(c.shape, c.dtype, name="C", offset_factor=1, strides=[1])
+
     def intrin_func(ins, outs):
         ib = tvm.tir.ir_builder.create()
         aa, bb = ins
         cc = outs[0]
-        ib.emit(tvm.tir.call_extern("int32", "gemv_update",
-                                cc.access_ptr("w"),
-                                aa.access_ptr("r"),
-                                bb.access_ptr("r"),
-                                m, l, bb.strides[0]))
+        ib.emit(
+            tvm.tir.call_extern(
+                "int32",
+                "gemv_update",
+                cc.access_ptr("w"),
+                aa.access_ptr("r"),
+                bb.access_ptr("r"),
+                m,
+                l,
+                bb.strides[0],
+            )
+        )
         return ib.get()
+
     return te.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})
 
+
 ######################################################################
 # Here :code:`te.decl_tensor_intrin` declares how to execute the computation :code:`c.op`.
 # Our implementation simply takes the inputs and outputs,
@@ -161,12 +161,14 @@ def gemv_impl():
       }
     """
     from tvm.contrib import util, clang
+
     temp = util.tempdir()
     ll_path = temp.relpath("temp.ll")
     # Create LLVM ir from c source code
     ll_code = clang.create_llvm(cc_code, output=ll_path)
     return ll_code
 
+
 ######################################################################
 # Now we leverage the pragma attribute :code:`import_llvm` to import llvm asm inline.
 # The importing needs to happen before the tensorized GEMV being executed.
@@ -181,6 +183,7 @@ print(tvm.lower(s, [A, B, C], simple_mode=True))
 func = tvm.build(s, [A, B, C], target="llvm", name="gemv")
 
 from tvm.topi.util import get_const_tuple
+
 dtype = A.dtype
 ctx = tvm.context("cpu", 0)
 a = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype)
@@ -226,50 +229,56 @@ def gemv_impl():
       }
     """
     from tvm.contrib import util, clang
+
     temp = util.tempdir()
     ll_path = temp.relpath("temp.ll")
     # Create LLVM ir from c source code
     ll_code = clang.create_llvm(cc_code, output=ll_path)
     return ll_code
 
+
 def intrin_gemv(m, l):
-    a = te.placeholder((l,), name='a')
-    b = te.placeholder((m, l), name='b')
-    k = te.reduce_axis((0, l), name='k')
-    c = te.compute((m,), lambda i:
-    te.sum(a[k] * b[i, k], axis=k), name='c')
-    Ab = tvm.tir.decl_buffer(a.shape, a.dtype,
-                         name="A",
-                         offset_factor=1,
-                         strides=[1])
-    Bb = tvm.tir.decl_buffer(b.shape, b.dtype,
-                         name="B",
-                         offset_factor=1,
-                         strides=[te.var("s1"), 1])
-    Cb = tvm.tir.decl_buffer(c.shape, c.dtype,
-                         name="C",
-                         offset_factor=1,
-                         strides=[1])
+    a = te.placeholder((l,), name="a")
+    b = te.placeholder((m, l), name="b")
+    k = te.reduce_axis((0, l), name="k")
+    c = te.compute((m,), lambda i: te.sum(a[k] * b[i, k], axis=k), name="c")
+    Ab = tvm.tir.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1, strides=[1])
+    Bb = tvm.tir.decl_buffer(b.shape, b.dtype, name="B", offset_factor=1, strides=[te.var("s1"), 1])
+    Cb = tvm.tir.decl_buffer(c.shape, c.dtype, name="C", offset_factor=1, strides=[1])
+
     def intrin_func(ins, outs):
         aa, bb = ins
         cc = outs[0]
+
         def _body():
             ib = tvm.tir.ir_builder.create()
-            ib.emit(tvm.tir.call_extern("int32", "gemv_update",
-                                    cc.access_ptr("w"),
-                                    aa.access_ptr("r"),
-                                    bb.access_ptr("r"),
-                                    m, l, bb.strides[0]))
+            ib.emit(
+                tvm.tir.call_extern(
+                    "int32",
+                    "gemv_update",
+                    cc.access_ptr("w"),
+                    aa.access_ptr("r"),
+                    bb.access_ptr("r"),
+                    m,
+                    l,
+                    bb.strides[0],
+                )
+            )
             return ib.get()
+
         def _reduce_reset():
             ib = tvm.tir.ir_builder.create()
             ib.emit(tvm.tir.call_extern("int32", "gemv_reset", cc.access_ptr("w"), m))
             return ib.get()
+
         def _reduce_update():
             return _body()
+
         return _body(), _reduce_reset(), _reduce_update()
+
     return te.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})
 
+
 ######################################################################
 # Note that :code:`intrin_func` now returns a triplet:
 # :code:`(body, reduce_reset, reduce_update)`.
diff --git a/docs/_downloads/baf1373314e0e040008107ff2571b4cd/tune_relay_arm.py b/docs/_downloads/baf1373314e0e040008107ff2571b4cd/tune_relay_arm.py
index d7529b2..a336870 100644
--- a/docs/_downloads/baf1373314e0e040008107ff2571b4cd/tune_relay_arm.py
+++ b/docs/_downloads/baf1373314e0e040008107ff2571b4cd/tune_relay_arm.py
@@ -77,31 +77,41 @@ import tvm.contrib.graph_runtime as runtime
 # We can load some pre-defined network from :code:`relay.testing`.
 # We can also load models from MXNet, ONNX and TensorFlow.
 
+
 def get_network(name, batch_size):
     """Get the symbol definition and random weight of a network"""
     input_shape = (batch_size, 3, 224, 224)
     output_shape = (batch_size, 1000)
 
     if "resnet" in name:
-        n_layer = int(name.split('-')[1])
-        mod, params = relay.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
+        n_layer = int(name.split("-")[1])
+        mod, params = relay.testing.resnet.get_workload(
+            num_layers=n_layer, batch_size=batch_size, dtype=dtype
+        )
     elif "vgg" in name:
-        n_layer = int(name.split('-')[1])
-        mod, params = relay.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
-    elif name == 'mobilenet':
+        n_layer = int(name.split("-")[1])
+        mod, params = relay.testing.vgg.get_workload(
+            num_layers=n_layer, batch_size=batch_size, dtype=dtype
+        )
+    elif name == "mobilenet":
         mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size)
-    elif name == 'squeezenet_v1.1':
-        mod, params = relay.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1', dtype=dtype)
-    elif name == 'inception_v3':
+    elif name == "squeezenet_v1.1":
+        mod, params = relay.testing.squeezenet.get_workload(
+            batch_size=batch_size, version="1.1", dtype=dtype
+        )
+    elif name == "inception_v3":
         input_shape = (1, 3, 299, 299)
         mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
-    elif name == 'mxnet':
+    elif name == "mxnet":
         # an example for mxnet model
         from mxnet.gluon.model_zoo.vision import get_model
-        block = get_model('resnet18_v1', pretrained=True)
-        mod, params = relay.frontend.from_mxnet(block, shape={'data': input_shape}, dtype=dtype)
+
+        block = get_model("resnet18_v1", pretrained=True)
+        mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype)
         net = mod["main"]
-        net = relay.Function(net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs)
+        net = relay.Function(
+            net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs
+        )
         mod = tvm.IRModule.from_expr(net)
     else:
         raise ValueError("Unsupported network: " + name)
@@ -190,31 +200,30 @@ def get_network(name, batch_size):
 
 # Replace "aarch64-linux-gnu" with the correct target of your board.
 # This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.
-target = tvm.target.create('llvm -device=arm_cpu -mtriple=aarch64-linux-gnu')
+target = tvm.target.Target("llvm -device=arm_cpu -mtriple=aarch64-linux-gnu")
 
 # Also replace this with the device key in your tracker
-device_key = 'rk3399'
+device_key = "rk3399"
 
 # Set this to True if you use android phone
 use_android = False
 
 #### TUNING OPTION ####
-network = 'resnet-18'
+network = "resnet-18"
 log_file = "%s.%s.log" % (device_key, network)
-dtype = 'float32'
+dtype = "float32"
 
 tuning_option = {
-    'log_filename': log_file,
-
-    'tuner': 'xgb',
-    'n_trial': 1500,
-    'early_stopping': 800,
-
-    'measure_option': autotvm.measure_option(
-        builder=autotvm.LocalBuilder(
-            build_func='ndk' if use_android else 'default'),
+    "log_filename": log_file,
+    "tuner": "xgb",
+    "n_trial": 1500,
+    "early_stopping": 800,
+    "measure_option": autotvm.measure_option(
+        builder=autotvm.LocalBuilder(build_func="ndk" if use_android else "default"),
         runner=autotvm.RPCRunner(
-            device_key, host='0.0.0.0', port=9190,
+            device_key,
+            host="0.0.0.0",
+            port=9190,
             number=5,
             timeout=10,
         ),
@@ -245,31 +254,33 @@ tuning_option = {
 # We will introduce a more sophisticated tuning scheduler in the future.
 
 # You can skip the implementation of this function for this tutorial.
-def tune_tasks(tasks,
-               measure_option,
-               tuner='xgb',
-               n_trial=1000,
-               early_stopping=None,
-               log_filename='tuning.log',
-               use_transfer_learning=True):
+def tune_tasks(
+    tasks,
+    measure_option,
+    tuner="xgb",
+    n_trial=1000,
+    early_stopping=None,
+    log_filename="tuning.log",
+    use_transfer_learning=True,
+):
     # create tmp log file
     tmp_log_file = log_filename + ".tmp"
     if os.path.exists(tmp_log_file):
         os.remove(tmp_log_file)
 
     for i, tsk in enumerate(reversed(tasks)):
-        prefix = "[Task %2d/%2d] " % (i+1, len(tasks))
+        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
 
         # create tuner
-        if tuner == 'xgb' or tuner == 'xgb-rank':
-            tuner_obj = XGBTuner(tsk, loss_type='rank')
-        elif tuner == 'xgb_knob':
-            tuner_obj = XGBTuner(tsk, loss_type='rank', feature_type='knob')
-        elif tuner == 'ga':
+        if tuner == "xgb" or tuner == "xgb-rank":
+            tuner_obj = XGBTuner(tsk, loss_type="rank")
+        elif tuner == "xgb_knob":
+            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="knob")
+        elif tuner == "ga":
             tuner_obj = GATuner(tsk, pop_size=50)
-        elif tuner == 'random':
+        elif tuner == "random":
             tuner_obj = RandomTuner(tsk)
-        elif tuner == 'gridsearch':
+        elif tuner == "gridsearch":
             tuner_obj = GridSearchTuner(tsk)
         else:
             raise ValueError("Invalid tuner: " + tuner)
@@ -280,13 +291,15 @@ def tune_tasks(tasks,
 
         # do tuning
         tsk_trial = min(n_trial, len(tsk.config_space))
-        tuner_obj.tune(n_trial=tsk_trial,
-                       early_stopping=early_stopping,
-                       measure_option=measure_option,
-                       callbacks=[
-                           autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
-                           autotvm.callback.log_to_file(tmp_log_file)
-                       ])
+        tuner_obj.tune(
+            n_trial=tsk_trial,
+            early_stopping=early_stopping,
+            measure_option=measure_option,
+            callbacks=[
+                autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
+                autotvm.callback.log_to_file(tmp_log_file),
+            ],
+        )
 
     # pick best records to a cache file
     autotvm.record.pick_best(tmp_log_file, log_filename)
@@ -296,13 +309,14 @@ def tune_tasks(tasks,
 ########################################################################
 # Finally, we launch tuning jobs and evaluate the end-to-end performance.
 
+
 def tune_and_evaluate(tuning_opt):
     # extract workloads from relay program
     print("Extract tasks...")
     mod, params, input_shape, _ = get_network(network, batch_size=1)
-    tasks = autotvm.task.extract_from_program(mod["main"], target=target,
-                                              params=params,
-                                              ops=(relay.op.get("nn.conv2d"),))
+    tasks = autotvm.task.extract_from_program(
+        mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"),)
+    )
 
     # run tuning tasks
     print("Tuning...")
@@ -312,13 +326,13 @@ def tune_and_evaluate(tuning_opt):
     with autotvm.apply_history_best(log_file):
         print("Compile...")
         with tvm.transform.PassContext(opt_level=3):
-            graph, lib, params = relay.build_module.build(
-                mod, target=target, params=params)
+            lib = relay.build_module.build(mod, target=target, params=params)
 
         # export library
         tmp = tempdir()
         if use_android:
             from tvm.contrib import ndk
+
             filename = "net.so"
             lib.export_library(tmp.relpath(filename), ndk.create_shared)
         else:
@@ -327,24 +341,25 @@ def tune_and_evaluate(tuning_opt):
 
         # upload module to device
         print("Upload...")
-        remote = autotvm.measure.request_remote(device_key, '0.0.0.0', 9190,
-                                                timeout=10000)
+        remote = autotvm.measure.request_remote(device_key, "0.0.0.0", 9190, timeout=10000)
         remote.upload(tmp.relpath(filename))
         rlib = remote.load_module(filename)
 
         # upload parameters to device
         ctx = remote.context(str(target), 0)
-        module = runtime.create(graph, rlib, ctx)
+        module = runtime.GraphModule(rlib["default"](ctx))
         data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
-        module.set_input('data', data_tvm)
-        module.set_input(**params)
+        module.set_input("data", data_tvm)
 
         # evaluate
         print("Evaluate inference time cost...")
         ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10)
         prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
-        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
-              (np.mean(prof_res), np.std(prof_res)))
+        print(
+            "Mean inference time (std dev): %.2f ms (%.2f ms)"
+            % (np.mean(prof_res), np.std(prof_res))
+        )
+
 
 # We do not run the tuning on our web server since it takes too long.
 # Uncomment the following line to run it by yourself.
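
For reference, a minimal sketch (not shown in this diff) of how the entry point above is typically invoked, assuming an RPC tracker is already listening on 0.0.0.0:9190 and the board is registered under the device key configured above:

.. code-block:: python

    # Assumes the tracker was started on the host, e.g. with
    # "python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190".
    tune_and_evaluate(tuning_option)
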
diff --git a/docs/_downloads/bba10250678e70879823196c946734fc/vta_get_started.py b/docs/_downloads/bba10250678e70879823196c946734fc/vta_get_started.py
index ab41687..46b050f 100644
--- a/docs/_downloads/bba10250678e70879823196c946734fc/vta_get_started.py
+++ b/docs/_downloads/bba10250678e70879823196c946734fc/vta_get_started.py
@@ -178,7 +178,8 @@ B_buf = te.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: B(*i), "B_buf")
 C_buf = te.compute(
     (o, m, env.BATCH, env.BLOCK_OUT),
     lambda *i: A_buf(*i).astype(env.acc_dtype) + B_buf(*i).astype(env.acc_dtype),
-    name="C_buf")
+    name="C_buf",
+)
 
 ######################################################################
 # Casting the Results
@@ -201,9 +202,8 @@ C_buf = te.compute(
 
 # Cast to output type, and send to main memory
 C = te.compute(
-    (o, m, env.BATCH, env.BLOCK_OUT),
-    lambda *i: C_buf(*i).astype(env.inp_dtype),
-    name="C")
+    (o, m, env.BATCH, env.BLOCK_OUT), lambda *i: C_buf(*i).astype(env.inp_dtype), name="C"
+)
 
 ######################################################################
 # This concludes the computation declaration part of this tutorial.
@@ -353,16 +353,12 @@ f = remote.load_module("vadd.o")
 ctx = remote.ext_dev(0)
 
 # Initialize the A and B arrays randomly in the int range of [-128, 128)
-A_orig = np.random.randint(
-    -128, 128, size=(o * env.BATCH, m * env.BLOCK_OUT)).astype(A.dtype)
-B_orig = np.random.randint(
-    -128, 128, size=(o * env.BATCH, m * env.BLOCK_OUT)).astype(B.dtype)
+A_orig = np.random.randint(-128, 128, size=(o * env.BATCH, m * env.BLOCK_OUT)).astype(A.dtype)
+B_orig = np.random.randint(-128, 128, size=(o * env.BATCH, m * env.BLOCK_OUT)).astype(B.dtype)
 
 # Apply packing to the A and B arrays from a 2D to a 4D packed layout
-A_packed = A_orig.reshape(
-    o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))
-B_packed = B_orig.reshape(
-    o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))
+A_packed = A_orig.reshape(o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))
+B_packed = B_orig.reshape(o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))
 
 # Format the input/output arrays with tvm.nd.array to the DLPack standard
 A_nd = tvm.nd.array(A_packed, ctx)
@@ -380,8 +376,7 @@ f(A_nd, B_nd, C_nd)
 
 # Compute reference result with numpy
 C_ref = (A_orig.astype(env.acc_dtype) + B_orig.astype(env.acc_dtype)).astype(C.dtype)
-C_ref = C_ref.reshape(
-    o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))
+C_ref = C_ref.reshape(o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))
 np.testing.assert_equal(C_ref, C_nd.asnumpy())
 print("Successful vector add test!")
 
diff --git a/docs/_downloads/beb2188d497d67b66bcfbc2c254dccb7/deploy_model_on_rasp.ipynb b/docs/_downloads/beb2188d497d67b66bcfbc2c254dccb7/deploy_model_on_rasp.ipynb
index d152c98..24e2202 100644
--- a/docs/_downloads/beb2188d497d67b66bcfbc2c254dccb7/deploy_model_on_rasp.ipynb
+++ b/docs/_downloads/beb2188d497d67b66bcfbc2c254dccb7/deploy_model_on_rasp.ipynb
@@ -58,7 +58,7 @@
       },
       "outputs": [],
       "source": [
-        "from mxnet.gluon.model_zoo.vision import get_model\nfrom PIL import Image\nimport numpy as np\n\n# one line to get the model\nblock = get_model('resnet18_v1', pretrained=True)"
+        "from mxnet.gluon.model_zoo.vision import get_model\nfrom PIL import Image\nimport numpy as np\n\n# one line to get the model\nblock = get_model(\"resnet18_v1\", pretrained=True)"
       ]
     },
     {
@@ -76,7 +76,7 @@
       },
       "outputs": [],
       "source": [
-        "img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'\nimg_name = 'cat.png'\nimg_path = download_testdata(img_url, img_name, module='data')\nimage = Image.open(img_path).resize((224, 224))\n\ndef transform_image(image):\n    image = np.array(image) - np.array([123., 117., 104.])\n    image /= np.array([58.395, 57.12, 57.375])\n    image = image.transpose((2, 0, 1))\n    image = image[np.newaxis, :]\n    return image\n\nx = transform_image(image)"
+        "img_url = \"https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true\"\nimg_name = \"cat.png\"\nimg_path = download_testdata(img_url, img_name, module=\"data\")\nimage = Image.open(img_path).resize((224, 224))\n\n\ndef transform_image(image):\n    image = np.array(image) - np.array([123.0, 117.0, 104.0])\n    image /= np.array([58.395, 57.12, 57.375])\n    image = image.transpose((2, 0, 1))\n    image = image[np.newaxis, :]\n    return image\n\n\nx = transform_image(image)"
       ]
     },
     {
@@ -94,7 +94,7 @@
       },
       "outputs": [],
       "source": [
-        "synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',\n                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',\n                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',\n                      'imagenet1000_clsid_to_human.txt'])\nsynset_name = 'imagenet1000_clsid_to_human.txt'\nsynset_path = download_testdata(synset_url, synset_name, module='data')\nwith open(synset_path) as f:\n    synset = eval(f.read())"
+        "synset_url = \"\".join(\n    [\n        \"https://gist.githubusercontent.com/zhreshold/\",\n        \"4d0b62f3d01426887599d4f7ede23ee5/raw/\",\n        \"596b27d23537e5a1b5751d2b0481ef172f58b539/\",\n        \"imagenet1000_clsid_to_human.txt\",\n    ]\n)\nsynset_name = \"imagenet1000_clsid_to_human.txt\"\nsynset_path = download_testdata(synset_url, synset_name, module=\"data\")\nwith open(synset_path) as f:\n    synset = eval(f.read())"
       ]
     },
     {
@@ -112,7 +112,7 @@
       },
       "outputs": [],
       "source": [
-        "# We support MXNet static graph(symbol) and HybridBlock in mxnet.gluon\nshape_dict = {'data': x.shape}\nmod, params = relay.frontend.from_mxnet(block, shape_dict)\n# we want a probability so add a softmax operator\nfunc = mod[\"main\"]\nfunc = relay.Function(func.params, relay.nn.softmax(func.body), None, func.type_params, func.attrs)"
+        "# We support MXNet static graph(symbol) and HybridBlock in mxnet.gluon\nshape_dict = {\"data\": x.shape}\nmod, params = relay.frontend.from_mxnet(block, shape_dict)\n# we want a probability so add a softmax operator\nfunc = mod[\"main\"]\nfunc = relay.Function(func.params, relay.nn.softmax(func.body), None, func.type_params, func.attrs)"
       ]
     },
     {
@@ -155,7 +155,7 @@
       },
       "outputs": [],
       "source": [
-        "local_demo = True\n\nif local_demo:\n    target = tvm.target.create('llvm')\nelse:\n    target = tvm.target.arm_cpu('rasp3b')\n    # The above line is a simple form of\n    # target = tvm.target.create('llvm -device=arm_cpu -model=bcm2837 -mtriple=armv7l-linux-gnueabihf -mattr=+neon')\n\nwith tvm.transform.PassContext(opt_level=3):\n    lib = relay.build(func, target, params=params)\n\n# After `relay.build`, you will get three return values: graph,\n# library and the new paramet [...]
+        "local_demo = True\n\nif local_demo:\n    target = tvm.target.Target(\"llvm\")\nelse:\n    target = tvm.target.arm_cpu(\"rasp3b\")\n    # The above line is a simple form of\n    # target = tvm.target.Target('llvm -device=arm_cpu -model=bcm2837 -mtriple=armv7l-linux-gnueabihf -mattr=+neon')\n\nwith tvm.transform.PassContext(opt_level=3):\n    lib = relay.build(func, target, params=params)\n\n# After `relay.build`, you will get three return values: graph,\n# library and the new par [...]
       ]
     },
     {
@@ -173,7 +173,7 @@
       },
       "outputs": [],
       "source": [
-        "# obtain an RPC session from remote device.\nif local_demo:\n    remote = rpc.LocalSession()\nelse:\n    # The following is my environment, change this to the IP address of your target device\n    host = '10.77.1.162'\n    port = 9090\n    remote = rpc.connect(host, port)\n\n# upload the library to remote device and load it\nremote.upload(lib_fname)\nrlib = remote.load_module('net.tar')\n\n# create the remote runtime module\nctx = remote.cpu(0)\nmodule = runtime.GraphModule(rlib [...]
+        "# obtain an RPC session from remote device.\nif local_demo:\n    remote = rpc.LocalSession()\nelse:\n    # The following is my environment, change this to the IP address of your target device\n    host = \"10.77.1.162\"\n    port = 9090\n    remote = rpc.connect(host, port)\n\n# upload the library to remote device and load it\nremote.upload(lib_fname)\nrlib = remote.load_module(\"net.tar\")\n\n# create the remote runtime module\nctx = remote.cpu(0)\nmodule = runtime.GraphModule( [...]
       ]
     }
   ],
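
As a hedged aside on the preprocessing cell above (assuming `x` is the array produced by `transform_image`), the normalization and layout change can be undone for inspection:

.. code-block:: python

    import numpy as np

    # Reverse transform_image: drop the batch axis, return to HWC layout,
    # then undo the per-channel scaling and mean subtraction.
    img_back = x[0].transpose((1, 2, 0))
    img_back = img_back * np.array([58.395, 57.12, 57.375]) + np.array([123.0, 117.0, 104.0])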
diff --git a/docs/_downloads/c316f4b828b813e437473ee752bacdf9/build_gcn.ipynb b/docs/_downloads/c316f4b828b813e437473ee752bacdf9/build_gcn.ipynb
index efd45fa..7918d46 100644
--- a/docs/_downloads/c316f4b828b813e437473ee752bacdf9/build_gcn.ipynb
+++ b/docs/_downloads/c316f4b828b813e437473ee752bacdf9/build_gcn.ipynb
@@ -33,7 +33,7 @@
       },
       "outputs": [],
       "source": [
-        "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport dgl\nimport networkx as nx\nfrom dgl.nn.pytorch import GraphConv\n\nclass GCN(nn.Module):\n    def __init__(self,\n                 g,\n                 n_infeat,\n                 n_hidden,\n                 n_classes,\n                 n_layers,\n                 activation):\n        super(GCN, self).__init__()\n        self.g = g\n        self.layers = nn.ModuleList()\n        self.layers.append(Gra [...]
+        "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport dgl\nimport networkx as nx\nfrom dgl.nn.pytorch import GraphConv\n\n\nclass GCN(nn.Module):\n    def __init__(self, g, n_infeat, n_hidden, n_classes, n_layers, activation):\n        super(GCN, self).__init__()\n        self.g = g\n        self.layers = nn.ModuleList()\n        self.layers.append(GraphConv(n_infeat, n_hidden, activation=activation))\n        for i in range(n_layers - 1):\n            sel [...]
       ]
     },
     {
@@ -51,7 +51,7 @@
       },
       "outputs": [],
       "source": [
-        "from dgl.data import load_data\nfrom collections import namedtuple\n\ndef load_dataset(dataset=\"cora\"):\n    args = namedtuple(\"args\", [\"dataset\"])\n    data = load_data(args(dataset))\n\n    # Remove self-loops to avoid duplicate passing of a node's feature to itself\n    g = data.graph\n    g.remove_edges_from(nx.selfloop_edges(g))\n    g.add_edges_from(zip(g.nodes, g.nodes))\n\n    return g, data\n\n\ndef evaluate(data, logits):\n    test_mask = data.test_mask # the tes [...]
+        "from dgl.data import load_data\nfrom collections import namedtuple\n\n\ndef load_dataset(dataset=\"cora\"):\n    args = namedtuple(\"args\", [\"dataset\"])\n    data = load_data(args(dataset))\n\n    # Remove self-loops to avoid duplicate passing of a node's feature to itself\n    g = data.graph\n    g.remove_edges_from(nx.selfloop_edges(g))\n    g.add_edges_from(zip(g.nodes, g.nodes))\n\n    return g, data\n\n\ndef evaluate(data, logits):\n    test_mask = data.test_mask  # the  [...]
       ]
     },
     {
@@ -87,7 +87,7 @@
       },
       "outputs": [],
       "source": [
-        "from tvm.contrib.download import download_testdata\nfrom dgl import DGLGraph\n\nfeatures = torch.FloatTensor(data.features)\ndgl_g = DGLGraph(g)\n\ntorch_model = GCN(dgl_g,\n                  infeat_dim,\n                  num_hidden,\n                  num_classes,\n                  num_layers,\n                  F.relu)\n\n# Download the pretrained weights\nmodel_url = \"https://homes.cs.washington.edu/~cyulin/media/gnn_model/gcn_%s.torch\"%(dataset)\nmodel_path = download_te [...]
+        "from tvm.contrib.download import download_testdata\nfrom dgl import DGLGraph\n\nfeatures = torch.FloatTensor(data.features)\ndgl_g = DGLGraph(g)\n\ntorch_model = GCN(dgl_g, infeat_dim, num_hidden, num_classes, num_layers, F.relu)\n\n# Download the pretrained weights\nmodel_url = \"https://homes.cs.washington.edu/~cyulin/media/gnn_model/gcn_%s.torch\" % (dataset)\nmodel_path = download_testdata(model_url, \"gcn_%s.pickle\" % (dataset), module=\"gcn_model\")\n\n# Load the weights  [...]
       ]
     },
     {
@@ -112,7 +112,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Define Graph Convolution Layer in Relay\n---------------------------------------\nTo run GCN on TVM, we first need to implement Graph Convolution Layer.\nYou may refer to https://github.com/dmlc/dgl/blob/master/python/dgl/nn/mxnet/conv.py for a GraphConv Layer implemented in DGL with MXNet Backend\n\nThe layer is defined with below operations, note that we apply two transposes to keep adjacency matrix on right hand side of sparse_dense operator,\nthis method is temporary and wil [...]
+        "Define Graph Convolution Layer in Relay\n---------------------------------------\nTo run GCN on TVM, we first need to implement Graph Convolution Layer.\nYou may refer to https://github.com/dmlc/dgl/blob/master/python/dgl/nn/mxnet/conv/graphconv.py for a GraphConv Layer implemented in DGL with MXNet Backend\n\nThe layer is defined with below operations, note that we apply two transposes to keep adjacency matrix on right hand side of sparse_dense operator,\nthis method is tempora [...]
       ]
     },
     {
@@ -123,7 +123,7 @@
       },
       "outputs": [],
       "source": [
-        "from tvm import relay\nfrom tvm.contrib import graph_runtime\nimport tvm\nfrom tvm import te\n\ndef GraphConv(layer_name,\n              input_dim,\n              output_dim,\n              adj,\n              input,\n              norm=None,\n              bias=True,\n              activation=None):\n    \"\"\"\n    Parameters\n    ----------\n    layer_name: str\n    Name of layer\n\n    input_dim: int\n    Input dimension per node feature\n\n    output_dim: int,\n    Output d [...]
+        "from tvm import relay\nfrom tvm.contrib import graph_runtime\nimport tvm\nfrom tvm import te\n\n\ndef GraphConv(layer_name, input_dim, output_dim, adj, input, norm=None, bias=True, activation=None):\n    \"\"\"\n    Parameters\n    ----------\n    layer_name: str\n    Name of layer\n\n    input_dim: int\n    Input dimension per node feature\n\n    output_dim: int,\n    Output dimension per node feature\n\n    adj: namedtuple,\n    Graph representation (Adjacency Matrix) in Spars [...]
       ]
     },
     {
@@ -141,7 +141,7 @@
       },
       "outputs": [],
       "source": [
-        "import numpy as np\nimport networkx as nx\n\ndef prepare_params(g, data):\n    params = {}\n    params['infeats'] = data.features.astype('float32') # Only support float32 as feature for now\n\n    # Generate adjacency matrix\n    adjacency = nx.to_scipy_sparse_matrix(g)\n    params['g_data'] = adjacency.data.astype('float32')\n    params['indices'] = adjacency.indices.astype('int32')\n    params['indptr'] = adjacency.indptr.astype('int32')\n\n    # Normalization w.r.t. node degr [...]
+        "import numpy as np\nimport networkx as nx\n\n\ndef prepare_params(g, data):\n    params = {}\n    params[\"infeats\"] = data.features.astype(\"float32\")  # Only support float32 as feature for now\n\n    # Generate adjacency matrix\n    adjacency = nx.to_scipy_sparse_matrix(g)\n    params[\"g_data\"] = adjacency.data.astype(\"float32\")\n    params[\"indices\"] = adjacency.indices.astype(\"int32\")\n    params[\"indptr\"] = adjacency.indptr.astype(\"int32\")\n\n    # Normalizati [...]
       ]
     },
     {
@@ -159,7 +159,7 @@
       },
       "outputs": [],
       "source": [
-        "# Define input features, norms, adjacency matrix in Relay\ninfeats = relay.var(\"infeats\", shape=data.features.shape)\nnorm = relay.Constant(tvm.nd.array(params['norm']))\ng_data = relay.Constant(tvm.nd.array(params['g_data']))\nindices = relay.Constant(tvm.nd.array(params['indices']))\nindptr = relay.Constant(tvm.nd.array(params['indptr']))\n\nAdjacency = namedtuple('Adjacency', ['data', 'indices', 'indptr'])\nadj = Adjacency(g_data, indices, indptr)\n\n# Construct the 2-layer [...]
+        "# Define input features, norms, adjacency matrix in Relay\ninfeats = relay.var(\"infeats\", shape=data.features.shape)\nnorm = relay.Constant(tvm.nd.array(params[\"norm\"]))\ng_data = relay.Constant(tvm.nd.array(params[\"g_data\"]))\nindices = relay.Constant(tvm.nd.array(params[\"indices\"]))\nindptr = relay.Constant(tvm.nd.array(params[\"indptr\"]))\n\nAdjacency = namedtuple(\"Adjacency\", [\"data\", \"indices\", \"indptr\"])\nadj = Adjacency(g_data, indices, indptr)\n\n# Const [...]
       ]
     },
     {
@@ -177,7 +177,7 @@
       },
       "outputs": [],
       "source": [
-        "model_params = {}\nfor param_tensor in torch_model.state_dict():\n    model_params[param_tensor] = torch_model.state_dict()[param_tensor].numpy()\n\nfor i in range(num_layers+1):\n    params[\"layers.%d.weight\"%(i)] = model_params[\"layers.%d.weight\"%(i)]\n    params[\"layers.%d.bias\"%(i)] = model_params[\"layers.%d.bias\"%(i)]\n\n# Set the TVM build target\ntarget = 'llvm' # Currently only support `llvm` as target\n\nfunc = relay.Function(relay.analysis.free_vars(output), ou [...]
+        "model_params = {}\nfor param_tensor in torch_model.state_dict():\n    model_params[param_tensor] = torch_model.state_dict()[param_tensor].numpy()\n\nfor i in range(num_layers + 1):\n    params[\"layers.%d.weight\" % (i)] = model_params[\"layers.%d.weight\" % (i)]\n    params[\"layers.%d.bias\" % (i)] = model_params[\"layers.%d.bias\" % (i)]\n\n# Set the TVM build target\ntarget = \"llvm\"  # Currently only support `llvm` as target\n\nfunc = relay.Function(relay.analysis.free_var [...]
       ]
     },
     {
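
As a small illustration of what such a graph convolution layer computes (all names below are stand-ins, not from the tutorial), the propagation rule is H' = activation(A_hat @ H @ W + b), with A_hat the normalized adjacency matrix:

.. code-block:: python

    import numpy as np

    adj_norm = np.eye(4, dtype="float32")         # stand-in normalized adjacency (4 nodes)
    H = np.random.rand(4, 8).astype("float32")    # node features
    W = np.random.rand(8, 16).astype("float32")   # layer weight
    b = np.zeros(16, dtype="float32")             # layer bias
    H_next = np.maximum(adj_norm @ H @ W + b, 0)  # ReLU(A_hat @ H @ W + b)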
diff --git a/docs/_downloads/c49dbffd05b18e5db4049ffe6480aca2/deploy_object_detection_pytorch.ipynb b/docs/_downloads/c49dbffd05b18e5db4049ffe6480aca2/deploy_object_detection_pytorch.ipynb
new file mode 100644
index 0000000..adeb2b3
--- /dev/null
+++ b/docs/_downloads/c49dbffd05b18e5db4049ffe6480aca2/deploy_object_detection_pytorch.ipynb
@@ -0,0 +1,162 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\nCompile PyTorch Object Detection Models\n=======================================\nThis article is an introductory tutorial to deploy PyTorch object\ndetection models with Relay VM.\n\nFor us to begin with, PyTorch should be installed.\nTorchVision is also required since we will be using it as our model zoo.\n\nA quick solution is to install via pip\n\n.. code-block:: bash\n\n    pip install torch==1.4.0\n    pip install torchvision==0.5.0\n\nor please refer to official site\nh [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import tvm\nfrom tvm import relay\nfrom tvm import relay\nfrom tvm.runtime.vm import VirtualMachine\nfrom tvm.contrib.download import download\n\nimport numpy as np\nimport cv2\n\n# PyTorch imports\nimport torch\nimport torchvision"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Load pre-trained maskrcnn from torchvision and do tracing\n---------------------------------------------------------\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "in_size = 300\n\ninput_shape = (1, 3, in_size, in_size)\n\n\ndef do_trace(model, inp):\n    model_trace = torch.jit.trace(model, inp)\n    model_trace.eval()\n    return model_trace\n\n\ndef dict_to_tuple(out_dict):\n    if \"masks\" in out_dict.keys():\n        return out_dict[\"boxes\"], out_dict[\"scores\"], out_dict[\"labels\"], out_dict[\"masks\"]\n    return out_dict[\"boxes\"], out_dict[\"scores\"], out_dict[\"labels\"]\n\n\nclass TraceWrapper(torch.nn.Module):\n    def _ [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Download a test image and pre-process\n-------------------------------------\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "img_path = \"test_street_small.jpg\"\nimg_url = (\n    \"https://raw.githubusercontent.com/dmlc/web-data/\" \"master/gluoncv/detection/street_small.jpg\"\n)\ndownload(img_url, img_path)\n\nimg = cv2.imread(img_path).astype(\"float32\")\nimg = cv2.resize(img, (in_size, in_size))\nimg = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\nimg = np.transpose(img / 255.0, [2, 0, 1])\nimg = np.expand_dims(img, axis=0)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Import the graph to Relay\n-------------------------\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "input_name = \"input0\"\nshape_list = [(input_name, input_shape)]\nmod, params = relay.frontend.from_pytorch(script_module, shape_list)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Compile with Relay VM\n---------------------\nNote: Currently only CPU target is supported. For x86 target, it is\nhighly recommended to build TVM with Intel MKL and Intel OpenMP to get\nbest performance, due to the existence of large dense operator in\ntorchvision rcnn models.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Add \"-libs=mkl\" to get best performance on x86 target.\n# For x86 machine supports AVX512, the complete target is\n# \"llvm -mcpu=skylake-avx512 -libs=mkl\"\ntarget = \"llvm\"\n\nwith tvm.transform.PassContext(opt_level=3, disabled_pass=[\"FoldScaleAxis\"]):\n    vm_exec = relay.vm.compile(mod, target=target, params=params)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Inference with Relay VM\n-----------------------\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "ctx = tvm.cpu()\nvm = VirtualMachine(vm_exec, ctx)\nvm.set_input(\"main\", **{input_name: img})\ntvm_res = vm.run()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Get boxes with score larger than 0.9\n------------------------------------\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "score_threshold = 0.9\nboxes = tvm_res[0].asnumpy().tolist()\nvalid_boxes = []\nfor i, score in enumerate(tvm_res[1].asnumpy().tolist()):\n    if score > score_threshold:\n        valid_boxes.append(boxes[i])\n    else:\n        break\n\nprint(\"Get {} valid boxes\".format(len(valid_boxes)))"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
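
A possible follow-on step for this notebook (not part of it; assumes `img_path`, `in_size`, and `valid_boxes` from the cells above): draw the surviving boxes with OpenCV. The coordinates are in the resized in_size x in_size frame the model consumed:

.. code-block:: python

    import cv2

    canvas = cv2.resize(cv2.imread(img_path), (in_size, in_size))
    for x1, y1, x2, y2 in valid_boxes:
        # each box is [xmin, ymin, xmax, ymax] in input-image coordinates
        cv2.rectangle(canvas, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)
    cv2.imwrite("detections.jpg", canvas)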
diff --git a/docs/_downloads/c4d683ae80a4b8a011286f239422638a/deploy_classification.ipynb b/docs/_downloads/c4d683ae80a4b8a011286f239422638a/deploy_classification.ipynb
index 93455af..410e2bb 100644
--- a/docs/_downloads/c4d683ae80a4b8a011286f239422638a/deploy_classification.ipynb
+++ b/docs/_downloads/c4d683ae80a4b8a011286f239422638a/deploy_classification.ipynb
@@ -69,7 +69,7 @@
       },
       "outputs": [],
       "source": [
-        "if env.TARGET not in [\"sim\", \"tsim\"]:\n\n    # Get remote from tracker node if environment variable is set.\n    # To set up the tracker, you'll need to follow the \"Auto-tuning\n    # a convolutional network for VTA\" tutorial.\n    tracker_host = os.environ.get(\"TVM_TRACKER_HOST\", None)\n    tracker_port = os.environ.get(\"TVM_TRACKER_PORT\", None)\n    # Otherwise if you have a device you want to program directly from\n    # the host, make sure you've set the variables  [...]
+        "if env.TARGET not in [\"sim\", \"tsim\"]:\n\n    # Get remote from tracker node if environment variable is set.\n    # To set up the tracker, you'll need to follow the \"Auto-tuning\n    # a convolutional network for VTA\" tutorial.\n    tracker_host = os.environ.get(\"TVM_TRACKER_HOST\", None)\n    tracker_port = os.environ.get(\"TVM_TRACKER_PORT\", None)\n    # Otherwise if you have a device you want to program directly from\n    # the host, make sure you've set the variables  [...]
       ]
     },
     {
@@ -87,7 +87,7 @@
       },
       "outputs": [],
       "source": [
-        "# Load pre-configured AutoTVM schedules\nwith autotvm.tophub.context(target):\n\n    # Populate the shape and data type dictionary for ImageNet classifier input\n    dtype_dict = {\"data\": 'float32'}\n    shape_dict = {\"data\": (env.BATCH, 3, 224, 224)}\n\n    # Get off the shelf gluon model, and convert to relay\n    gluon_model = vision.get_model(model, pretrained=True)\n\n    # Measure build start time\n    build_start = time.time()\n\n    # Start front end compilation\n    [...]
+        "# Load pre-configured AutoTVM schedules\nwith autotvm.tophub.context(target):\n\n    # Populate the shape and data type dictionary for ImageNet classifier input\n    dtype_dict = {\"data\": \"float32\"}\n    shape_dict = {\"data\": (env.BATCH, 3, 224, 224)}\n\n    # Get off the shelf gluon model, and convert to relay\n    gluon_model = vision.get_model(model, pretrained=True)\n\n    # Measure build start time\n    build_start = time.time()\n\n    # Start front end compilation\n  [...]
       ]
     },
     {
@@ -105,7 +105,7 @@
       },
       "outputs": [],
       "source": [
-        "# Download ImageNet categories\ncateg_url = \"https://github.com/uwsaml/web-data/raw/master/vta/models/\"\ncateg_fn = \"synset.txt\"\ndownload.download(join(categ_url, categ_fn), categ_fn)\nsynset = eval(open(categ_fn).read())\n\n# Download test image\nimage_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg'\nimage_fn = 'cat.png'\ndownload.download(image_url, image_fn)\n\n# Prepare test image for inference\nimage = Image.open(image_fn).resize((224, 224))\nplt.imsh [...]
+        "# Download ImageNet categories\ncateg_url = \"https://github.com/uwsaml/web-data/raw/master/vta/models/\"\ncateg_fn = \"synset.txt\"\ndownload.download(join(categ_url, categ_fn), categ_fn)\nsynset = eval(open(categ_fn).read())\n\n# Download test image\nimage_url = \"https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg\"\nimage_fn = \"cat.png\"\ndownload.download(image_url, image_fn)\n\n# Prepare test image for inference\nimage = Image.open(image_fn).resize((224, 224))\nplt. [...]
       ]
     }
   ],
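
For orientation, a hypothetical post-processing sketch (assuming `tvm_output` holds the (env.BATCH, 1000) classifier scores and `synset` is the category mapping loaded above):

.. code-block:: python

    import numpy as np

    # Report the five highest-scoring ImageNet categories.
    top5 = np.argsort(tvm_output[0])[-5:][::-1]
    for idx in top5:
        print("%s (class %d)" % (synset[idx], idx))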
diff --git a/docs/_downloads/c634482de86bbc712f15077b3ec3a92f/convolution_opt.ipynb b/docs/_downloads/c634482de86bbc712f15077b3ec3a92f/convolution_opt.ipynb
index 0032e8d..7ec4b29 100644
--- a/docs/_downloads/c634482de86bbc712f15077b3ec3a92f/convolution_opt.ipynb
+++ b/docs/_downloads/c634482de86bbc712f15077b3ec3a92f/convolution_opt.ipynb
@@ -33,7 +33,7 @@
       },
       "outputs": [],
       "source": [
-        "from __future__ import absolute_import, print_function\n\nimport os\nimport tvm\nfrom tvm import te\nimport vta\nimport numpy as np\n\nfrom tvm import rpc\nfrom tvm.contrib import util\nfrom vta.testing import simulator\n\n# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file\nenv = vta.get_env()\n\n# We read the Pynq RPC host IP address and port number from the OS environment\nhost = os.environ.get(\"VTA_RPC_HOST\", \"192.168.2.99\")\nport = int(os.environ. [...]
+        "from __future__ import absolute_import, print_function\n\nimport os\nimport tvm\nimport tvm.testing\nfrom tvm import te\nimport vta\nimport numpy as np\n\nfrom tvm import rpc\nfrom tvm.contrib import util\nfrom vta.testing import simulator\n\n# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file\nenv = vta.get_env()\n\n# We read the Pynq RPC host IP address and port number from the OS environment\nhost = os.environ.get(\"VTA_RPC_HOST\", \"192.168.2.99\")\npo [...]
       ]
     },
     {
@@ -51,7 +51,7 @@
       },
       "outputs": [],
       "source": [
-        "from tvm import topi\n\n# 2D convolution layer dimensions taken from ResNet-18 architecture\n# (9th convolutional layer)\nbatch_size = 1\nheight = 14\nwidth = 14\nin_channels = 256\nout_channels = 256\nkernel_h = 3\nkernel_w = 3\npad_h = 1\npad_w = 1\nstride_h = 1\nstride_w = 1\nassert batch_size % env.BATCH == 0\nassert in_channels % env.BLOCK_IN == 0\nassert out_channels % env.BLOCK_OUT == 0\n\n# Input feature map: (N, IC, H, W, n, ic)\ndata_shape = (batch_size // env.BATCH,\n [...]
+        "from tvm import topi\n\n# 2D convolution layer dimensions taken from ResNet-18 architecture\n# (9th convolutional layer)\nbatch_size = 1\nheight = 14\nwidth = 14\nin_channels = 256\nout_channels = 256\nkernel_h = 3\nkernel_w = 3\npad_h = 1\npad_w = 1\nstride_h = 1\nstride_w = 1\nassert batch_size % env.BATCH == 0\nassert in_channels % env.BLOCK_IN == 0\nassert out_channels % env.BLOCK_OUT == 0\n\n# Input feature map: (N, IC, H, W, n, ic)\ndata_shape = (\n    batch_size // env.BA [...]
       ]
     },
     {
@@ -159,7 +159,7 @@
       },
       "outputs": [],
       "source": [
-        "# This library facilitates 2D convolution testing\nfrom tvm.topi.testing import conv2d_nchw_python\n\n# Compile the TVM module\nmy_conv = vta.build(s, [data, kernel, res], \"ext_dev\", env.target_host, name=\"my_conv\")\ntemp = util.tempdir()\nmy_conv.save(temp.relpath(\"conv2d.o\"))\nremote.upload(temp.relpath(\"conv2d.o\"))\nf = remote.load_module(\"conv2d.o\")\n\n# Get the remote device context\nctx = remote.ext_dev(0)\n\n# Initialize the data and kernel arrays randomly in th [...]
+        "# This library facilitates 2D convolution testing\nfrom tvm.topi.testing import conv2d_nchw_python\n\n# Compile the TVM module\nmy_conv = vta.build(s, [data, kernel, res], \"ext_dev\", env.target_host, name=\"my_conv\")\ntemp = util.tempdir()\nmy_conv.save(temp.relpath(\"conv2d.o\"))\nremote.upload(temp.relpath(\"conv2d.o\"))\nf = remote.load_module(\"conv2d.o\")\n\n# Get the remote device context\nctx = remote.ext_dev(0)\n\n# Initialize the data and kernel arrays randomly in th [...]
       ]
     },
     {
diff --git a/docs/_downloads/ca08de6c440df207921d807474d26f06/deploy_ssd_gluoncv.py b/docs/_downloads/ca08de6c440df207921d807474d26f06/deploy_ssd_gluoncv.py
index 3643c8d..d874487 100644
--- a/docs/_downloads/ca08de6c440df207921d807474d26f06/deploy_ssd_gluoncv.py
+++ b/docs/_downloads/ca08de6c440df207921d807474d26f06/deploy_ssd_gluoncv.py
@@ -58,13 +58,13 @@ from gluoncv import model_zoo, data, utils
 #   to your device.
 
 supported_model = [
-    'ssd_512_resnet50_v1_voc',
-    'ssd_512_resnet50_v1_coco',
-    'ssd_512_resnet101_v2_voc',
-    'ssd_512_mobilenet1.0_voc',
-    'ssd_512_mobilenet1.0_coco',
-    'ssd_300_vgg16_atrous_voc'
-    'ssd_512_vgg16_atrous_coco',
+    "ssd_512_resnet50_v1_voc",
+    "ssd_512_resnet50_v1_coco",
+    "ssd_512_resnet101_v2_voc",
+    "ssd_512_mobilenet1.0_voc",
+    "ssd_512_mobilenet1.0_coco",
+    "ssd_300_vgg16_atrous_voc" "ssd_512_vgg16_atrous_coco",
 ]
 
 model_name = supported_model[0]
@@ -73,9 +72,11 @@ dshape = (1, 3, 512, 512)
 ######################################################################
 # Download and pre-process demo image
 
-im_fname = download_testdata('https://github.com/dmlc/web-data/blob/master/' +
-                             'gluoncv/detection/street_small.jpg?raw=true',
-                             'street_small.jpg', module='data')
+im_fname = download_testdata(
+    "https://github.com/dmlc/web-data/blob/master/" + "gluoncv/detection/street_small.jpg?raw=true",
+    "street_small.jpg",
+    module="data",
+)
 x, img = data.transforms.presets.ssd.load_test(im_fname, short=512)
 
 ######################################################################
@@ -83,26 +84,30 @@ x, img = data.transforms.presets.ssd.load_test(im_fname, short=512)
 
 block = model_zoo.get_model(model_name, pretrained=True)
 
+
 def build(target):
     mod, params = relay.frontend.from_mxnet(block, {"data": dshape})
     with tvm.transform.PassContext(opt_level=3):
         lib = relay.build(mod, target, params=params)
     return lib
 
+
 ######################################################################
 # Create TVM runtime and do inference
 
+
 def run(lib, ctx):
     # Build TVM runtime
-    m = graph_runtime.GraphModule(lib['default'](ctx))
+    m = graph_runtime.GraphModule(lib["default"](ctx))
     tvm_input = tvm.nd.array(x.asnumpy(), ctx=ctx)
-    m.set_input('data', tvm_input)
+    m.set_input("data", tvm_input)
     # execute
     m.run()
     # get outputs
     class_IDs, scores, bounding_boxs = m.get_output(0), m.get_output(1), m.get_output(2)
     return class_IDs, scores, bounding_boxs
 
+
 for target in ["llvm", "cuda"]:
     ctx = tvm.context(target, 0)
     if ctx.exist:
@@ -112,6 +117,11 @@ for target in ["llvm", "cuda"]:
 ######################################################################
 # Display result
 
-ax = utils.viz.plot_bbox(img, bounding_boxs.asnumpy()[0], scores.asnumpy()[0],
-                         class_IDs.asnumpy()[0], class_names=block.classes)
+ax = utils.viz.plot_bbox(
+    img,
+    bounding_boxs.asnumpy()[0],
+    scores.asnumpy()[0],
+    class_IDs.asnumpy()[0],
+    class_names=block.classes,
+)
 plt.show()
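
A hedged inspection sketch for the outputs above (GluonCV's SSD pads its outputs and marks unused slots with class id -1, so valid detections can be counted as follows, assuming the tensors returned by `run`):

.. code-block:: python

    import numpy as np

    ids = class_IDs.asnumpy()[0].flatten()
    conf = scores.asnumpy()[0].flatten()
    print("detections with score >= 0.5:", int(np.sum((ids >= 0) & (conf >= 0.5))))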
diff --git a/docs/_downloads/cbbaae24e5c894dda1ebeded6fded2c3/convolution_opt.py b/docs/_downloads/cbbaae24e5c894dda1ebeded6fded2c3/convolution_opt.py
index d364fef..3f079e8 100644
--- a/docs/_downloads/cbbaae24e5c894dda1ebeded6fded2c3/convolution_opt.py
+++ b/docs/_downloads/cbbaae24e5c894dda1ebeded6fded2c3/convolution_opt.py
@@ -39,6 +39,7 @@ from __future__ import absolute_import, print_function
 
 import os
 import tvm
+import tvm.testing
 from tvm import te
 import vta
 import numpy as np
@@ -143,78 +144,72 @@ assert in_channels % env.BLOCK_IN == 0
 assert out_channels % env.BLOCK_OUT == 0
 
 # Input feature map: (N, IC, H, W, n, ic)
-data_shape = (batch_size // env.BATCH,
-              in_channels // env.BLOCK_IN,
-              height,
-              width,
-              env.BATCH,
-              env.BLOCK_IN)
+data_shape = (
+    batch_size // env.BATCH,
+    in_channels // env.BLOCK_IN,
+    height,
+    width,
+    env.BATCH,
+    env.BLOCK_IN,
+)
 # Kernel: (OC, IC, H, W, oc, ic)
-kernel_shape = (out_channels // env.BLOCK_OUT,
-                in_channels // env.BLOCK_IN,
-                kernel_h,
-                kernel_w,
-                env.BLOCK_OUT,
-                env.BLOCK_IN)
+kernel_shape = (
+    out_channels // env.BLOCK_OUT,
+    in_channels // env.BLOCK_IN,
+    kernel_h,
+    kernel_w,
+    env.BLOCK_OUT,
+    env.BLOCK_IN,
+)
 # Derive output feature map dimensions
 fout_height = (height + 2 * pad_h - kernel_h) // stride_h + 1
 fout_width = (width + 2 * pad_w - kernel_w) // stride_w + 1
 # Output feature map: (N, OC, H, W, n, oc)
-output_shape = (batch_size // env.BATCH,
-                out_channels // env.BLOCK_OUT,
-                fout_height,
-                fout_width,
-                env.BATCH,
-                env.BLOCK_OUT)
+output_shape = (
+    batch_size // env.BATCH,
+    out_channels // env.BLOCK_OUT,
+    fout_height,
+    fout_width,
+    env.BATCH,
+    env.BLOCK_OUT,
+)
 
 # Convolution reduction axes
-dy = te.reduce_axis((0, kernel_h), name='dy')
-dx = te.reduce_axis((0, kernel_w), name='dx')
-ic = te.reduce_axis((0, in_channels // env.BLOCK_IN), name='ic')
-ic_tns = te.reduce_axis((0, env.BLOCK_IN), name='ic_tns')
+dy = te.reduce_axis((0, kernel_h), name="dy")
+dx = te.reduce_axis((0, kernel_w), name="dx")
+ic = te.reduce_axis((0, in_channels // env.BLOCK_IN), name="ic")
+ic_tns = te.reduce_axis((0, env.BLOCK_IN), name="ic_tns")
 
 # Input placeholder tensors
-data = te.placeholder(data_shape,
-                       name="data",
-                       dtype=env.inp_dtype)
-kernel = te.placeholder(kernel_shape,
-                         name="kernel",
-                         dtype=env.wgt_dtype)
+data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
+kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
 
 # Copy buffers:
 #   Apply spatial padding to input feature map
-data_buf = topi.nn.pad(data,
-                       [0, 0, pad_h, pad_w, 0, 0],
-                       name="data_buf")
+data_buf = topi.nn.pad(data, [0, 0, pad_h, pad_w, 0, 0], name="data_buf")
 kernel_buf = te.compute(kernel_shape, lambda *i: kernel(*i), "kernel_buf")
 
 # Declare 2D convolution
 res_conv = te.compute(
     output_shape,
     lambda bo, co, i, j, bi, ci: te.sum(
-      data_buf[bo, ic, i*stride_h+dy, j*stride_w+dx, bi, ic_tns].astype(env.acc_dtype) *
-      kernel_buf[co, ic, dy, dx, ci, ic_tns].astype(env.acc_dtype),
-    axis=[ic, dy, dx, ic_tns]),
-    name="res_conv")
+        data_buf[bo, ic, i * stride_h + dy, j * stride_w + dx, bi, ic_tns].astype(env.acc_dtype)
+        * kernel_buf[co, ic, dy, dx, ci, ic_tns].astype(env.acc_dtype),
+        axis=[ic, dy, dx, ic_tns],
+    ),
+    name="res_conv",
+)
 
 # Add shift stage for fixed-point normalization
-res_shr = te.compute(output_shape,
-                      lambda *i: res_conv(*i) >> 8,
-                      name="res_shr")
+res_shr = te.compute(output_shape, lambda *i: res_conv(*i) >> 8, name="res_shr")
 
 # Apply clipping between (0, input max value)
 inp_max = (1 << (env.INP_WIDTH - 1)) - 1
-res_max = te.compute(output_shape,
-                      lambda *i: tvm.te.max(res_shr(*i), 0),
-                      "res_max")
-res_min = te.compute(output_shape,
-                      lambda *i: tvm.te.min(res_max(*i), inp_max),
-                      "res_min")
+res_max = te.compute(output_shape, lambda *i: tvm.te.max(res_shr(*i), 0), "res_max")
+res_min = te.compute(output_shape, lambda *i: tvm.te.min(res_max(*i), inp_max), "res_min")
 
 # Result Tensor
-res = te.compute(output_shape,
-                  lambda *i: res_min(*i).astype(env.inp_dtype),
-                  name="res")
+res = te.compute(output_shape, lambda *i: res_min(*i).astype(env.inp_dtype), name="res")
 
 
 ######################################################################
@@ -385,28 +380,27 @@ ctx = remote.ext_dev(0)
 
 # Initialize the data and kernel arrays randomly in the int range
 # of [-128, 128) in NCHW layout
-data_np = np.random.randint(
-    -128, 128,
-    size=(batch_size, in_channels, height, width)).astype(data.dtype)
+data_np = np.random.randint(-128, 128, size=(batch_size, in_channels, height, width)).astype(
+    data.dtype
+)
 kernel_np = np.random.randint(
-    -128, 128,
-    size=(out_channels, in_channels, kernel_h, kernel_w)).astype(kernel.dtype)
+    -128, 128, size=(out_channels, in_channels, kernel_h, kernel_w)
+).astype(kernel.dtype)
 
 # Apply packing to the data and kernel arrays from a 4D NCHW
 # to a 6D NCHWnc packed layout
-data_packed = data_np.reshape(batch_size // env.BATCH,
-                              env.BATCH,
-                              in_channels // env.BLOCK_IN,
-                              env.BLOCK_IN,
-                              height,
-                              width).transpose((0, 2, 4, 5, 1, 3))
-
-kernel_packed = kernel_np.reshape(out_channels // env.BLOCK_OUT,
-                                  env.BLOCK_OUT,
-                                  in_channels // env.BLOCK_IN,
-                                  env.BLOCK_IN,
-                                  kernel_h,
-                                  kernel_w).transpose((0, 2, 4, 5, 1, 3))
+data_packed = data_np.reshape(
+    batch_size // env.BATCH, env.BATCH, in_channels // env.BLOCK_IN, env.BLOCK_IN, height, width
+).transpose((0, 2, 4, 5, 1, 3))
+
+kernel_packed = kernel_np.reshape(
+    out_channels // env.BLOCK_OUT,
+    env.BLOCK_OUT,
+    in_channels // env.BLOCK_IN,
+    env.BLOCK_IN,
+    kernel_h,
+    kernel_w,
+).transpose((0, 2, 4, 5, 1, 3))
 
 # Format the input/output arrays with tvm.nd.array to the DLPack standard
 data_nd = tvm.nd.array(data_packed, ctx)
@@ -421,19 +415,25 @@ if env.TARGET in ["sim", "tsim"]:
 f(data_nd, kernel_nd, res_nd)
 
 # Verify against numpy implementation
-res_ref = conv2d_nchw_python(data_np.astype(env.acc_dtype),
-                            kernel_np.astype(env.acc_dtype),
-                            (stride_h, stride_w),
-                            (pad_h, pad_w)).astype(env.acc_dtype)
+res_ref = conv2d_nchw_python(
+    data_np.astype(env.acc_dtype),
+    kernel_np.astype(env.acc_dtype),
+    (stride_h, stride_w),
+    (pad_h, pad_w),
+).astype(env.acc_dtype)
 res_ref = res_ref >> env.INP_WIDTH
 res_ref = np.clip(res_ref, 0, inp_max)
 res_ref = res_ref.astype(res.dtype)
-res_ref = res_ref.reshape((batch_size // env.BATCH,
-                           env.BATCH,
-                           out_channels // env.BLOCK_OUT,
-                           env.BLOCK_OUT,
-                           fout_height,
-                           fout_width)).transpose((0, 2, 4, 5, 1, 3))
+res_ref = res_ref.reshape(
+    (
+        batch_size // env.BATCH,
+        env.BATCH,
+        out_channels // env.BLOCK_OUT,
+        env.BLOCK_OUT,
+        fout_height,
+        fout_width,
+    )
+).transpose((0, 2, 4, 5, 1, 3))
 tvm.testing.assert_allclose(res_ref, res_nd.asnumpy())
 
 # Print stats
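
For reference, a small sketch (using only the layer dimensions defined above) of the operation count such stats are typically normalized against, counting a multiply and an add per accumulation:

.. code-block:: python

    num_ops = (
        2 * batch_size * out_channels * fout_height * fout_width
        * in_channels * kernel_h * kernel_w
    )
    print("Conv2D workload: %.3f GOPs" % (num_ops / 1e9))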
diff --git a/docs/_downloads/cd8ac9c09164cc04dd9ecd131c536680/micro_tflite.ipynb b/docs/_downloads/cd8ac9c09164cc04dd9ecd131c536680/micro_tflite.ipynb
index 22e3fa2..8a21597 100644
--- a/docs/_downloads/cd8ac9c09164cc04dd9ecd131c536680/micro_tflite.ipynb
+++ b/docs/_downloads/cd8ac9c09164cc04dd9ecd131c536680/micro_tflite.ipynb
@@ -15,7 +15,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\nMicro TVM with TFLite Models\n============================\n**Author**: `Tom Gall <https://github.com/tom-gall>`_\n\nThis tutorial is an introduction to working with MicroTVM and a TFLite \nmodel with Relay.\n\n"
+        "\nMicro TVM with TFLite Models\n============================\n**Author**: `Tom Gall <https://github.com/tom-gall>`_\n\nThis tutorial is an introduction to working with MicroTVM and a TFLite\nmodel with Relay.\n\n"
       ]
     },
     {
@@ -26,7 +26,7 @@
       },
       "outputs": [],
       "source": [
-        "# %%\n# Setup\n# -----\n#\n# To get started, TFLite package needs to be installed as prerequisite.\n#\n# install tflite\n#\n# .. code-block:: bash\n#\n#   pip install tflite=2.1.0 --user\n#\n# or you could generate TFLite package yourself. The steps are the following:\n#\n#   Get the flatc compiler.\n#   Please refer to https://github.com/google/flatbuffers for details\n#   and make sure it is properly installed.\n#\n# .. code-block:: bash\n#\n#   flatc --version\n#\n# Get the T [...]
+        "# %%\n# Setup\n# -----\n#\n# To get started, TFLite package needs to be installed as prerequisite.\n#\n# install tflite\n#\n# .. code-block:: bash\n#\n#   pip install tflite=2.1.0 --user\n#\n# or you could generate TFLite package yourself. The steps are the following:\n#\n#   Get the flatc compiler.\n#   Please refer to https://github.com/google/flatbuffers for details\n#   and make sure it is properly installed.\n#\n# .. code-block:: bash\n#\n#   flatc --version\n#\n# Get the T [...]
       ]
     },
     {
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "try:\n    import tflite\n    tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\nexcept AttributeError:\n    import tflite.Model\n    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)"
+        "try:\n    import tflite\n\n    tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\nexcept AttributeError:\n    import tflite.Model\n\n    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)"
       ]
     },
     {
@@ -62,7 +62,7 @@
       },
       "outputs": [],
       "source": [
-        "version = tflite_model.Version()\nprint (\"Model Version: \" + str(version))"
+        "version = tflite_model.Version()\nprint(\"Model Version: \" + str(version))"
       ]
     },
     {
@@ -80,56 +80,43 @@
       },
       "outputs": [],
       "source": [
-        "input_tensor = \"dense_4_input\"\ninput_shape = (1,)\ninput_dtype = \"float32\"\n\nmod, params = relay.frontend.from_tflite(tflite_model,\n                                         shape_dict={input_tensor: input_shape},\n                                         dtype_dict={input_tensor: input_dtype})\n\n# %%\n# Running on device\n# ----------------------------------------------\n#\n# Setup the device config which is what will be used to communicate\n# with the microcontroller (a [...]
+        "input_tensor = \"dense_4_input\"\ninput_shape = (1,)\ninput_dtype = \"float32\"\n\nmod, params = relay.frontend.from_tflite(\n    tflite_model, shape_dict={input_tensor: input_shape}, dtype_dict={input_tensor: input_dtype}\n)"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Next with the dev_config, we establish a micro session and create\na context\n\n.. code-block:: python\n\n  with micro.Session(dev_config) as sess:\n      ctx = tvm.micro_dev(0)\n\n"
+        "Now we create a build config for relay. turning off two options\nand then calling relay.build which will result in a C source\nfile.\n\n.. code-block:: python\n\n\n"
       ]
     },
     {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "Now we create a build config for relay. turning off two options\nand then calling relay.build which will result in a C source\nfile.\n\n.. code-block:: python\n\n  with tvm.transform.PassContext(opt_level=3, config={'tir.disable_vectorize': True},disabled_pass=['FuseOps']):\n      graph, c_mod, params = relay.build(mod, target=TARGET, params=params)\n\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "With the c_mod that is the handle to our C source code, we create a\nmicro module, followed by a compiled object which behind the scenes\nis linked to the microTVM runtime for running on the target board\n\n.. code-block:: python\n\n  micro_mod = micro.create_micro_mod(c_mod, dev_config)\n  mod = graph_runtime.create(graph, micro_mod, ctx)\n\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "Pass the weights to get ready to perform inference\n\n.. code-block:: python\n\n  mod.set_input(**params)\n\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
       "source": [
-        "The model consumes a single float32 value and returns a predicted\nsine value.\nTo pass the input value we construct a tvm.nd.array object\nwith a single contrived number as input. For this model values of\n0 to 2Pi are acceptable.\n\n.. code-block:: python\n\n  mod.set_input(input_tensor, tvm.nd.array(np.array([0.5], dtype=\"float32\")))\n\n"
+        "TARGET = tvm.target.target.micro(\"host\")\n\nwith tvm.transform.PassContext(\n    opt_level=3, config={\"tir.disable_vectorize\": True}, disabled_pass=[\"FuseOps\"]\n):\n    graph, c_mod, c_params = relay.build(mod, target=TARGET, params=params)\n\n\n# %%\n# Running on simulated device\n# ----------------------------------------------\n#\n# First, compile a static microTVM runtime for the targeted device. In this case, the host simulated\n# device is used.\nworkspace = tvm.micr [...]
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Run the model on device\n\n.. code-block:: python\n\n  mod.run()\n\n"
+        "Next, establish a session with the simulated device and run the\ncomputation. The `with session` line would typically flash an attached\nmicrocontroller, but in this tutorial, it simply launches a subprocess\nto stand in for an attached microcontroller.\n\n.. code-block:: python\n\n\n"
       ]
     },
     {
-      "cell_type": "markdown",
-      "metadata": {},
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
       "source": [
-        "Get output from the run and print\n\n.. code-block:: python\n\n  tvm_output = mod.get_output(0).asnumpy()\n  print(\"result is: \"+str(tvm_output))\n\n"
+        "flasher = compiler.flasher()\nwith tvm.micro.Session(binary=micro_binary, flasher=flasher) as session:\n    graph_mod = tvm.micro.create_local_graph_runtime(\n        graph, session.get_system_lib(), session.context\n    )\n\n    # Set the model parameters using the lowered parameters produced by `relay.build`.\n    graph_mod.set_input(**c_params)\n\n    # The model consumes a single float32 value and returns a predicted sine value.  To pass the\n    # input value we construct a [...]
       ]
     }
   ],
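
A hypothetical usage sketch building on the session above (assuming `graph_mod`, `input_tensor`, and the `tvm`/`numpy` imports from the notebook): the sine model can be swept over a few inputs in [0, 2*pi):

.. code-block:: python

    import numpy as np

    for v in np.linspace(0, 2 * np.pi, num=4, endpoint=False, dtype="float32"):
        graph_mod.set_input(input_tensor, tvm.nd.array(np.array([v], dtype="float32")))
        graph_mod.run()
        print("sin(%.2f) ~= %s" % (v, graph_mod.get_output(0).asnumpy()))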
diff --git a/docs/_downloads/cfc40d6a8f25c9fce268b6a4f277d915/cross_compilation_and_rpc.py b/docs/_downloads/cfc40d6a8f25c9fce268b6a4f277d915/cross_compilation_and_rpc.py
index eaf6f03..572ebb8 100644
--- a/docs/_downloads/cfc40d6a8f25c9fce268b6a4f277d915/cross_compilation_and_rpc.py
+++ b/docs/_downloads/cfc40d6a8f25c9fce268b6a4f277d915/cross_compilation_and_rpc.py
@@ -101,8 +101,8 @@ from tvm import rpc
 from tvm.contrib import util
 
 n = tvm.runtime.convert(1024)
-A = te.placeholder((n,), name='A')
-B = te.compute((n,), lambda i: A[i] + 1.0, name='B')
+A = te.placeholder((n,), name="A")
+B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
 s = te.create_schedule(B.op)
 
 ######################################################################
@@ -114,14 +114,14 @@ s = te.create_schedule(B.op)
 local_demo = True
 
 if local_demo:
-    target = 'llvm'
+    target = "llvm"
 else:
-    target = 'llvm -mtriple=armv7l-linux-gnueabihf'
+    target = "llvm -mtriple=armv7l-linux-gnueabihf"
 
-func = tvm.build(s, [A, B], target=target, name='add_one')
+func = tvm.build(s, [A, B], target=target, name="add_one")
 # save the lib at a local temp folder
 temp = util.tempdir()
-path = temp.relpath('lib.tar')
+path = temp.relpath("lib.tar")
 func.export_library(path)
 
 ######################################################################
@@ -168,7 +168,7 @@ if local_demo:
     remote = rpc.LocalSession()
 else:
     # The following is my environment, change this to the IP address of your target device
-    host = '10.77.1.162'
+    host = "10.77.1.162"
     port = 9090
     remote = rpc.connect(host, port)
 
@@ -177,7 +177,7 @@ else:
 # compiler to relink them. Now `func` is a remote module object.
 
 remote.upload(path)
-func = remote.load_module('lib.tar')
+func = remote.load_module("lib.tar")
 
 # create arrays on the remote device
 ctx = remote.cpu()
@@ -196,7 +196,7 @@ np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
 
 time_f = func.time_evaluator(func.entry_name, ctx, number=10)
 cost = time_f(a, b).mean
-print('%g secs/op' % cost)
+print("%g secs/op" % cost)
 
 #########################################################################
 # Run OpenCL Kernel Remotely by RPC
@@ -221,11 +221,12 @@ print('%g secs/op' % cost)
 #
 # The following function shows how we run an OpenCL kernel remotely
 
+
 def run_opencl():
     # NOTE: These are the settings for my rk3399 board. You need to modify
     # them according to your environment.
     target_host = "llvm -mtriple=aarch64-linux-gnu"
-    opencl_device_host = '10.77.1.145'
+    opencl_device_host = "10.77.1.145"
     opencl_device_port = 9090
 
     # create schedule for the above "add one" compute declaration
@@ -238,10 +239,10 @@ def run_opencl():
     remote = rpc.connect(opencl_device_host, opencl_device_port)
 
     # export and upload
-    path = temp.relpath('lib_cl.tar')
+    path = temp.relpath("lib_cl.tar")
     func.export_library(path)
     remote.upload(path)
-    func = remote.load_module('lib_cl.tar')
+    func = remote.load_module("lib_cl.tar")
 
     # run
     ctx = remote.cl()
@@ -251,6 +252,7 @@ def run_opencl():
     np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
     print("OpenCL test passed!")
 
+
 ######################################################################
 # Summary
 # -------
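
One extension worth knowing for the benchmarking step in this file: `time_evaluator` also accepts a `repeat` argument, and the returned result exposes the per-repeat means rather than only the overall mean. A minimal sketch, assuming `func`, `ctx`, `a`, and `b` as defined in the tutorial:

    # Ten invocations per measurement, three measurements.
    time_f = func.time_evaluator(func.entry_name, ctx, number=10, repeat=3)
    prof = time_f(a, b)
    print("%g secs/op (per-repeat means: %s)" % (prof.mean, list(prof.results)))
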
diff --git a/docs/_downloads/d37aecc420f8b90ce29faee4df5d0bcd/matrix_multiply.ipynb b/docs/_downloads/d37aecc420f8b90ce29faee4df5d0bcd/matrix_multiply.ipynb
index b4daeaf..71895d9 100644
--- a/docs/_downloads/d37aecc420f8b90ce29faee4df5d0bcd/matrix_multiply.ipynb
+++ b/docs/_downloads/d37aecc420f8b90ce29faee4df5d0bcd/matrix_multiply.ipynb
@@ -90,7 +90,7 @@
       },
       "outputs": [],
       "source": [
-        "# Outer input feature reduction axis\nko = te.reduce_axis((0, n), name=\"ko\")\n# Inner input feature reduction axis\nki = te.reduce_axis((0, env.BLOCK_IN), name=\"ki\")\n# Describe the in-VTA matrix multiplication\nC_buf = te.compute(\n    (o, m, env.BATCH, env.BLOCK_OUT),\n    lambda bo, co, bi, ci:\n        te.sum(A_buf[bo, ko, bi, ki].astype(env.acc_dtype) *\n                B_buf[co, ko, ci, ki].astype(env.acc_dtype),\n                axis=[ko, ki]),\n    name=\"C_buf\")"
+        "# Outer input feature reduction axis\nko = te.reduce_axis((0, n), name=\"ko\")\n# Inner input feature reduction axis\nki = te.reduce_axis((0, env.BLOCK_IN), name=\"ki\")\n# Describe the in-VTA matrix multiplication\nC_buf = te.compute(\n    (o, m, env.BATCH, env.BLOCK_OUT),\n    lambda bo, co, bi, ci: te.sum(\n        A_buf[bo, ko, bi, ki].astype(env.acc_dtype) * B_buf[co, ko, ci, ki].astype(env.acc_dtype),\n        axis=[ko, ki],\n    ),\n    name=\"C_buf\",\n)"
       ]
     },
     {
@@ -115,7 +115,7 @@
       },
       "outputs": [],
       "source": [
-        "# Cast to output type, and send to main memory\nC = te.compute(\n    (o, m, env.BATCH, env.BLOCK_OUT),\n    lambda *i: C_buf(*i).astype(env.inp_dtype),\n    name=\"C\")"
+        "# Cast to output type, and send to main memory\nC = te.compute(\n    (o, m, env.BATCH, env.BLOCK_OUT), lambda *i: C_buf(*i).astype(env.inp_dtype), name=\"C\"\n)"
       ]
     },
     {
@@ -215,7 +215,7 @@
       },
       "outputs": [],
       "source": [
-        "s[C_buf].reorder(\n    ko,\n    s[C_buf].op.axis[0],\n    s[C_buf].op.axis[1],\n    s[C_buf].op.axis[2],\n    s[C_buf].op.axis[3],\n    ki)\ns[C_buf].tensorize(s[C_buf].op.axis[2], env.gemm)\n\n# Let's take a look at the finalized schedule\nprint(vta.lower(s, [A, B, C], simple_mode=True))"
+        "s[C_buf].reorder(\n    ko, s[C_buf].op.axis[0], s[C_buf].op.axis[1], s[C_buf].op.axis[2], s[C_buf].op.axis[3], ki\n)\ns[C_buf].tensorize(s[C_buf].op.axis[2], env.gemm)\n\n# Let's take a look at the finalized schedule\nprint(vta.lower(s, [A, B, C], simple_mode=True))"
       ]
     },
     {
@@ -258,7 +258,7 @@
       },
       "outputs": [],
       "source": [
-        "# Get the remote device context\nctx = remote.ext_dev(0)\n\n# Initialize the A and B arrays randomly in the int range of (-128, 128]\nA_orig = np.random.randint(\n    -128, 128, size=(o * env.BATCH, n * env.BLOCK_IN)).astype(A.dtype)\nB_orig = np.random.randint(\n    -128, 128, size=(m * env.BLOCK_OUT, n * env.BLOCK_IN)).astype(B.dtype)\n\n# Apply packing to the A and B arrays from a 2D to a 4D packed layout\nA_packed = A_orig.reshape(\n    o, env.BATCH, n, env.BLOCK_IN).transpo [...]
+        "# Get the remote device context\nctx = remote.ext_dev(0)\n\n# Initialize the A and B arrays randomly in the int range of (-128, 128]\nA_orig = np.random.randint(-128, 128, size=(o * env.BATCH, n * env.BLOCK_IN)).astype(A.dtype)\nB_orig = np.random.randint(-128, 128, size=(m * env.BLOCK_OUT, n * env.BLOCK_IN)).astype(B.dtype)\n\n# Apply packing to the A and B arrays from a 2D to a 4D packed layout\nA_packed = A_orig.reshape(o, env.BATCH, n, env.BLOCK_IN).transpose((0, 2, 1, 3))\n [...]
       ]
     },
     {
@@ -276,7 +276,7 @@
       },
       "outputs": [],
       "source": [
-        "# Compute reference result with numpy\nC_ref = np.dot(A_orig.astype(env.acc_dtype),\n               B_orig.T.astype(env.acc_dtype)).astype(C.dtype)\nC_ref = C_ref.reshape(\n    o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))\nnp.testing.assert_equal(C_ref, C_nd.asnumpy())\n\n# Print stats\nif env.TARGET in [\"sim\", \"tsim\"]:\n    sim_stats = simulator.stats()\n    print(\"Execution statistics:\")\n    for k, v in sim_stats.items():\n        print(\"\\t{:<16}: {:>16}\". [...]
+        "# Compute reference result with numpy\nC_ref = np.dot(A_orig.astype(env.acc_dtype), B_orig.T.astype(env.acc_dtype)).astype(C.dtype)\nC_ref = C_ref.reshape(o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))\nnp.testing.assert_equal(C_ref, C_nd.asnumpy())\n\n# Print stats\nif env.TARGET in [\"sim\", \"tsim\"]:\n    sim_stats = simulator.stats()\n    print(\"Execution statistics:\")\n    for k, v in sim_stats.items():\n        print(\"\\t{:<16}: {:>16}\".format(k, v))\n\nprint [...]
       ]
     },
     {
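
The 2D-to-4D packing in the cells above is easier to see with toy sizes. A self-contained numpy sketch (the sizes below are illustrative stand-ins for `env.BATCH` and `env.BLOCK_IN`, not VTA's only configuration):

    import numpy as np

    o, n, BATCH, BLOCK_IN = 2, 3, 1, 16
    A_orig = np.arange(o * BATCH * n * BLOCK_IN).reshape(o * BATCH, n * BLOCK_IN)
    # Pack: split each axis into (tile count, tile size), grouping tile sizes innermost.
    A_packed = A_orig.reshape(o, BATCH, n, BLOCK_IN).transpose((0, 2, 1, 3))
    # Unpack: the (0, 2, 1, 3) transpose is its own inverse, so the round trip is exact.
    A_round = A_packed.transpose((0, 2, 1, 3)).reshape(o * BATCH, n * BLOCK_IN)
    assert (A_round == A_orig).all()
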
diff --git a/docs/_downloads/e09aef52edc37570c0178591a87d328c/from_tensorflow.ipynb b/docs/_downloads/e09aef52edc37570c0178591a87d328c/from_tensorflow.ipynb
index 6e90654..9b48a8d 100644
--- a/docs/_downloads/e09aef52edc37570c0178591a87d328c/from_tensorflow.ipynb
+++ b/docs/_downloads/e09aef52edc37570c0178591a87d328c/from_tensorflow.ipynb
@@ -26,7 +26,7 @@
       },
       "outputs": [],
       "source": [
-        "# tvm, relay\nimport tvm\nfrom tvm import te\nfrom tvm import relay\n\n# os and numpy\nimport numpy as np\nimport os.path\n\n# Tensorflow imports\nimport tensorflow as tf\ntry:\n    tf_compat_v1 = tf.compat.v1\nexcept ImportError:\n    tf_compat_v1 = tf\n\n# Tensorflow utility functions\nimport tvm.relay.testing.tf as tf_testing\n\n# Base location for model related files.\nrepo_base = 'https://github.com/dmlc/web-data/raw/master/tensorflow/models/InceptionV1/'\n\n# Test image\ni [...]
+        "# tvm, relay\nimport tvm\nfrom tvm import te\nfrom tvm import relay\n\n# os and numpy\nimport numpy as np\nimport os.path\n\n# Tensorflow imports\nimport tensorflow as tf\n\ntry:\n    tf_compat_v1 = tf.compat.v1\nexcept ImportError:\n    tf_compat_v1 = tf\n\n# Tensorflow utility functions\nimport tvm.relay.testing.tf as tf_testing\n\n# Base location for model related files.\nrepo_base = \"https://github.com/dmlc/web-data/raw/master/tensorflow/models/InceptionV1/\"\n\n# Test imag [...]
       ]
     },
     {
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "model_name = 'classify_image_graph_def-with_shapes.pb'\nmodel_url = os.path.join(repo_base, model_name)\n\n# Image label map\nmap_proto = 'imagenet_2012_challenge_label_map_proto.pbtxt'\nmap_proto_url = os.path.join(repo_base, map_proto)\n\n# Human readable text for labels\nlabel_map = 'imagenet_synset_to_human_label_map.txt'\nlabel_map_url = os.path.join(repo_base, label_map)\n\n# Target settings\n# Use these commented settings to build for cuda.\n#target = 'cuda'\n#target_host [...]
+        "model_name = \"classify_image_graph_def-with_shapes.pb\"\nmodel_url = os.path.join(repo_base, model_name)\n\n# Image label map\nmap_proto = \"imagenet_2012_challenge_label_map_proto.pbtxt\"\nmap_proto_url = os.path.join(repo_base, map_proto)\n\n# Human readable text for labels\nlabel_map = \"imagenet_synset_to_human_label_map.txt\"\nlabel_map_url = os.path.join(repo_base, label_map)\n\n# Target settings\n# Use these commented settings to build for cuda.\n# target = 'cuda'\n# tar [...]
       ]
     },
     {
@@ -62,7 +62,7 @@
       },
       "outputs": [],
       "source": [
-        "from tvm.contrib.download import download_testdata\n\nimg_path = download_testdata(image_url, img_name, module='data')\nmodel_path = download_testdata(model_url, model_name, module=['tf', 'InceptionV1'])\nmap_proto_path = download_testdata(map_proto_url, map_proto, module='data')\nlabel_path = download_testdata(label_map_url, label_map, module='data')"
+        "from tvm.contrib.download import download_testdata\n\nimg_path = download_testdata(image_url, img_name, module=\"data\")\nmodel_path = download_testdata(model_url, model_name, module=[\"tf\", \"InceptionV1\"])\nmap_proto_path = download_testdata(map_proto_url, map_proto, module=\"data\")\nlabel_path = download_testdata(label_map_url, label_map, module=\"data\")"
       ]
     },
     {
@@ -80,7 +80,7 @@
       },
       "outputs": [],
       "source": [
-        "with tf_compat_v1.gfile.GFile(model_path, 'rb') as f:\n    graph_def = tf_compat_v1.GraphDef()\n    graph_def.ParseFromString(f.read())\n    graph = tf.import_graph_def(graph_def, name='')\n    # Call the utility to import the graph definition into default graph.\n    graph_def = tf_testing.ProcessGraphDefParam(graph_def)\n    # Add shapes to the graph.\n    with tf_compat_v1.Session() as sess:\n        graph_def = tf_testing.AddShapesToGraphDef(sess, 'softmax')"
+        "with tf_compat_v1.gfile.GFile(model_path, \"rb\") as f:\n    graph_def = tf_compat_v1.GraphDef()\n    graph_def.ParseFromString(f.read())\n    graph = tf.import_graph_def(graph_def, name=\"\")\n    # Call the utility to import the graph definition into default graph.\n    graph_def = tf_testing.ProcessGraphDefParam(graph_def)\n    # Add shapes to the graph.\n    with tf_compat_v1.Session() as sess:\n        graph_def = tf_testing.AddShapesToGraphDef(sess, \"softmax\")"
       ]
     },
     {
@@ -98,7 +98,7 @@
       },
       "outputs": [],
       "source": [
-        "from PIL import Image\nimage = Image.open(img_path).resize((299, 299))\n\nx = np.array(image)"
+        "from PIL import Image\n\nimage = Image.open(img_path).resize((299, 299))\n\nx = np.array(image)"
       ]
     },
     {
@@ -116,7 +116,7 @@
       },
       "outputs": [],
       "source": [
-        "shape_dict = {'DecodeJpeg/contents': x.shape}\ndtype_dict = {'DecodeJpeg/contents': 'uint8'}\nmod, params = relay.frontend.from_tensorflow(graph_def,\n                                             layout=layout,\n                                             shape=shape_dict)\n\nprint(\"Tensorflow protobuf imported to relay frontend.\")"
+        "shape_dict = {\"DecodeJpeg/contents\": x.shape}\ndtype_dict = {\"DecodeJpeg/contents\": \"uint8\"}\nmod, params = relay.frontend.from_tensorflow(graph_def, layout=layout, shape=shape_dict)\n\nprint(\"Tensorflow protobuf imported to relay frontend.\")"
       ]
     },
     {
@@ -152,7 +152,7 @@
       },
       "outputs": [],
       "source": [
-        "from tvm.contrib import graph_runtime\ndtype = 'uint8'\nm = graph_runtime.GraphModule(lib['default'](ctx))\n# set inputs\nm.set_input('DecodeJpeg/contents', tvm.nd.array(x.astype(dtype)))\n# execute\nm.run()\n# get outputs\ntvm_output = m.get_output(0, tvm.nd.empty(((1, 1008)), 'float32'))"
+        "from tvm.contrib import graph_runtime\n\ndtype = \"uint8\"\nm = graph_runtime.GraphModule(lib[\"default\"](ctx))\n# set inputs\nm.set_input(\"DecodeJpeg/contents\", tvm.nd.array(x.astype(dtype)))\n# execute\nm.run()\n# get outputs\ntvm_output = m.get_output(0, tvm.nd.empty(((1, 1008)), \"float32\"))"
       ]
     },
     {
@@ -170,7 +170,7 @@
       },
       "outputs": [],
       "source": [
-        "predictions = tvm_output.asnumpy()\npredictions = np.squeeze(predictions)\n\n# Creates node ID --> English string lookup.\nnode_lookup = tf_testing.NodeLookup(label_lookup_path=map_proto_path,\n                                    uid_lookup_path=label_path)\n\n# Print top 5 predictions from TVM output.\ntop_k = predictions.argsort()[-5:][::-1]\nfor node_id in top_k:\n    human_string = node_lookup.id_to_string(node_id)\n    score = predictions[node_id]\n    print('%s (score = %. [...]
+        "predictions = tvm_output.asnumpy()\npredictions = np.squeeze(predictions)\n\n# Creates node ID --> English string lookup.\nnode_lookup = tf_testing.NodeLookup(label_lookup_path=map_proto_path, uid_lookup_path=label_path)\n\n# Print top 5 predictions from TVM output.\ntop_k = predictions.argsort()[-5:][::-1]\nfor node_id in top_k:\n    human_string = node_lookup.id_to_string(node_id)\n    score = predictions[node_id]\n    print(\"%s (score = %.5f)\" % (human_string, score))"
       ]
     },
     {
@@ -188,7 +188,7 @@
       },
       "outputs": [],
       "source": [
-        "def create_graph():\n    \"\"\"Creates a graph from saved GraphDef file and returns a saver.\"\"\"\n    # Creates graph from saved graph_def.pb.\n    with tf_compat_v1.gfile.GFile(model_path, 'rb') as f:\n        graph_def = tf_compat_v1.GraphDef()\n        graph_def.ParseFromString(f.read())\n        graph = tf.import_graph_def(graph_def, name='')\n        # Call the utility to import the graph definition into default graph.\n        graph_def = tf_testing.ProcessGraphDefParam( [...]
+        "def create_graph():\n    \"\"\"Creates a graph from saved GraphDef file and returns a saver.\"\"\"\n    # Creates graph from saved graph_def.pb.\n    with tf_compat_v1.gfile.GFile(model_path, \"rb\") as f:\n        graph_def = tf_compat_v1.GraphDef()\n        graph_def.ParseFromString(f.read())\n        graph = tf.import_graph_def(graph_def, name=\"\")\n        # Call the utility to import the graph definition into default graph.\n        graph_def = tf_testing.ProcessGraphDefPa [...]
       ]
     }
   ],
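
The last cells of this notebook read back a (1, 1008) score vector and extract the top five classes with plain numpy. A self-contained sketch of that final step (random scores stand in for the model output):

    import numpy as np

    predictions = np.squeeze(np.random.rand(1, 1008).astype("float32"))
    top_k = predictions.argsort()[-5:][::-1]  # indices of the five largest scores, descending
    assert predictions[top_k[0]] == predictions.max()
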
diff --git a/docs/_downloads/e3748c6e5b8a427385ff1afdf1562c3c/opt_conv_tensorcore.py b/docs/_downloads/e3748c6e5b8a427385ff1afdf1562c3c/opt_conv_tensorcore.py
index 4b2823c..0cbcf7e 100644
--- a/docs/_downloads/e3748c6e5b8a427385ff1afdf1562c3c/opt_conv_tensorcore.py
+++ b/docs/_downloads/e3748c6e5b8a427385ff1afdf1562c3c/opt_conv_tensorcore.py
@@ -72,55 +72,72 @@ stride_w = 1
 # TensorCore shape
 block_size = 16
 
-assert (batch_size % block_size == 0)
-assert (in_channels % block_size == 0)
-assert (out_channels % block_size == 0)
+assert batch_size % block_size == 0
+assert in_channels % block_size == 0
+assert out_channels % block_size == 0
 
 # Input feature map: (N, H, W, IC, n, ic)
-data_shape = (batch_size // block_size,
-              height,
-              width,
-              in_channels // block_size,
-              block_size,
-              block_size)
+data_shape = (
+    batch_size // block_size,
+    height,
+    width,
+    in_channels // block_size,
+    block_size,
+    block_size,
+)
 # Kernel: (H, W, IC, OC, ic, oc)
-kernel_shape = (kernel_h,
-                kernel_w,
-                in_channels // block_size,
-                out_channels // block_size,
-                block_size,
-                block_size)
+kernel_shape = (
+    kernel_h,
+    kernel_w,
+    in_channels // block_size,
+    out_channels // block_size,
+    block_size,
+    block_size,
+)
 # Output feature map: (N, H, W, OC, n, oc)
-output_shape = (batch_size // block_size,
-                height,
-                width,
-                out_channels // block_size,
-                block_size,
-                block_size)
+output_shape = (
+    batch_size // block_size,
+    height,
+    width,
+    out_channels // block_size,
+    block_size,
+    block_size,
+)
 
 # Reduction axes
-kh = te.reduce_axis((0, kernel_h), name='kh')
-kw = te.reduce_axis((0, kernel_w), name='kw')
-ic = te.reduce_axis((0, in_channels // block_size), name='ic')
-ii = te.reduce_axis((0, block_size), name='ii')
+kh = te.reduce_axis((0, kernel_h), name="kh")
+kw = te.reduce_axis((0, kernel_w), name="kw")
+ic = te.reduce_axis((0, in_channels // block_size), name="ic")
+ii = te.reduce_axis((0, block_size), name="ii")
 
 # Algorithm
-A = te.placeholder(data_shape, name='A', dtype="float16")
-W = te.placeholder(kernel_shape, name='W', dtype="float16")
+A = te.placeholder(data_shape, name="A", dtype="float16")
+W = te.placeholder(kernel_shape, name="W", dtype="float16")
 Apad = te.compute(
-    (batch_size // block_size, height + 2 * pad_h, width + 2 * pad_w, in_channels // block_size, block_size,
-     block_size),
+    (
+        batch_size // block_size,
+        height + 2 * pad_h,
+        width + 2 * pad_w,
+        in_channels // block_size,
+        block_size,
+        block_size,
+    ),
     lambda n, h, w, i, nn, ii: tvm.tir.if_then_else(
-        tvm.tir.all(h >= pad_h, h - pad_h < height,
-                w >= pad_w, w - pad_w < width),
-        A[n, h - pad_h, w - pad_w, i, nn, ii], tvm.tir.const(0., "float16")),
-    name='Apad')
-Conv = te.compute(output_shape,
-                   lambda n, h, w, o, nn, oo: te.sum(
-                       Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype("float32") *
-                       W[kh, kw, ic, o, ii, oo].astype("float32"),
-                       axis=[ic, kh, kw, ii]),
-                   name="Conv")
+        tvm.tir.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w < width),
+        A[n, h - pad_h, w - pad_w, i, nn, ii],
+        tvm.tir.const(0.0, "float16"),
+    ),
+    name="Apad",
+)
+Conv = te.compute(
+    output_shape,
+    lambda n, h, w, o, nn, oo: te.sum(
+        Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype("float32")
+        * W[kh, kw, ic, o, ii, oo].astype("float32"),
+        axis=[ic, kh, kw, ii],
+    ),
+    name="Conv",
+)
 
 s = te.create_schedule(Conv.op)
 s[Apad].compute_inline()
@@ -134,11 +151,11 @@ s[Apad].compute_inline()
 # stores at the on-chip registers level, the same place with local memory.
 
 # Designate the memory hierarchy
-AS = s.cache_read(Apad, 'shared', [Conv])
-WS = s.cache_read(W, 'shared', [Conv])
-AF = s.cache_read(AS, 'wmma.matrix_a', [Conv])
-WF = s.cache_read(WS, 'wmma.matrix_b', [Conv])
-ConvF = s.cache_write(Conv, 'wmma.accumulator')
+AS = s.cache_read(Apad, "shared", [Conv])
+WS = s.cache_read(W, "shared", [Conv])
+AF = s.cache_read(AS, "wmma.matrix_a", [Conv])
+WF = s.cache_read(WS, "wmma.matrix_b", [Conv])
+ConvF = s.cache_write(Conv, "wmma.accumulator")
 
 ###############################################################################
 # Define Tensor Intrinsic
@@ -151,11 +168,12 @@ ConvF = s.cache_write(Conv, 'wmma.accumulator')
 # :code:`mma_sync` and :code:`store_matrix`. Since :code:`fill_fragment` and :code:`mma_sync`
 # are both used in matrix multiplication, we can just write the following three intrinsics.
 
+
 def intrin_wmma_load_matrix(scope):
     n = 16
-    A = te.placeholder((n, n), name='A', dtype='float16')
-    BA = tvm.tir.decl_buffer(A.shape, A.dtype, scope='shared', data_alignment=32, offset_factor=256)
-    C = te.compute((n, n), lambda i, j: A[i, j], name='C')
+    A = te.placeholder((n, n), name="A", dtype="float16")
+    BA = tvm.tir.decl_buffer(A.shape, A.dtype, scope="shared", data_alignment=32, offset_factor=256)
+    C = te.compute((n, n), lambda i, j: A[i, j], name="C")
     BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope=scope, data_alignment=32, offset_factor=256)
 
     def intrin_func(ins, outs):
@@ -163,9 +181,20 @@ def intrin_wmma_load_matrix(scope):
 
         BA = ins[0]
         BC = outs[0]
-        ib.emit(tvm.tir.call_intrin('handle', 'tir.tvm_load_matrix_sync',
-                                BC.data, n, n, n, BC.elem_offset // 256,
-                                BA.access_ptr('r'), n, 'row_major'))
+        ib.emit(
+            tvm.tir.call_intrin(
+                "handle",
+                "tir.tvm_load_matrix_sync",
+                BC.data,
+                n,
+                n,
+                n,
+                BC.elem_offset // 256,
+                BA.access_ptr("r"),
+                n,
+                "row_major",
+            )
+        )
         return ib.get()
 
     return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})
@@ -173,33 +202,53 @@ def intrin_wmma_load_matrix(scope):
 
 def intrin_wmma_gemm():
     n = 16
-    A = te.placeholder((n, n), name='A', dtype='float16')
-    B = te.placeholder((n, n), name='B', dtype='float16')
+    A = te.placeholder((n, n), name="A", dtype="float16")
+    B = te.placeholder((n, n), name="B", dtype="float16")
     k = te.reduce_axis((0, n), name="k")
-    C = te.compute((n, n),
-                    lambda ii, jj:
-                    te.sum(A[ii, k].astype('float') * B[k, jj].astype('float'), axis=k),
-                    name='C')
-    BA = tvm.tir.decl_buffer(A.shape, A.dtype, name='BA', scope='wmma.matrix_a', data_alignment=32, offset_factor=256)
-    BB = tvm.tir.decl_buffer(B.shape, B.dtype, name='BB', scope='wmma.matrix_b', data_alignment=32, offset_factor=256)
-    BC = tvm.tir.decl_buffer(C.shape, C.dtype, name='BC', scope='wmma.accumulator', data_alignment=32, offset_factor=256)
+    C = te.compute(
+        (n, n),
+        lambda ii, jj: te.sum(A[ii, k].astype("float") * B[k, jj].astype("float"), axis=k),
+        name="C",
+    )
+    BA = tvm.tir.decl_buffer(
+        A.shape, A.dtype, name="BA", scope="wmma.matrix_a", data_alignment=32, offset_factor=256
+    )
+    BB = tvm.tir.decl_buffer(
+        B.shape, B.dtype, name="BB", scope="wmma.matrix_b", data_alignment=32, offset_factor=256
+    )
+    BC = tvm.tir.decl_buffer(
+        C.shape, C.dtype, name="BC", scope="wmma.accumulator", data_alignment=32, offset_factor=256
+    )
 
     def intrin_func(ins, outs):
         BA, BB = ins
-        BC, = outs
+        (BC,) = outs
 
         def init():
             ib = tvm.tir.ir_builder.create()
-            ib.emit(tvm.tir.call_intrin('handle', 'tir.tvm_fill_fragment', BC.data, n, n, n, BC.elem_offset // 256, 0.0))
+            ib.emit(
+                tvm.tir.call_intrin(
+                    "handle", "tir.tvm_fill_fragment", BC.data, n, n, n, BC.elem_offset // 256, 0.0
+                )
+            )
             return ib.get()
 
         def update():
             ib = tvm.tir.ir_builder.create()
-            ib.emit(tvm.tir.call_intrin('handle', 'tir.tvm_mma_sync',
-                                    BC.data, BC.elem_offset // 256,
-                                    BA.data, BA.elem_offset // 256,
-                                    BB.data, BB.elem_offset // 256,
-                                    BC.data, BC.elem_offset // 256))
+            ib.emit(
+                tvm.tir.call_intrin(
+                    "handle",
+                    "tir.tvm_mma_sync",
+                    BC.data,
+                    BC.elem_offset // 256,
+                    BA.data,
+                    BA.elem_offset // 256,
+                    BB.data,
+                    BB.elem_offset // 256,
+                    BC.data,
+                    BC.elem_offset // 256,
+                )
+            )
             return ib.get()
 
         return update(), init(), update()
@@ -209,22 +258,36 @@ def intrin_wmma_gemm():
 
 def intrin_wmma_store_matrix():
     n = 16
-    A = te.placeholder((n, n), name='A', dtype='float32')
-    BA = tvm.tir.decl_buffer(A.shape, A.dtype, scope='wmma.accumulator', data_alignment=32, offset_factor=256)
-    C = te.compute((n, n), lambda i, j: A[i, j], name='C')
-    BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope='global', data_alignment=32, offset_factor=256)
+    A = te.placeholder((n, n), name="A", dtype="float32")
+    BA = tvm.tir.decl_buffer(
+        A.shape, A.dtype, scope="wmma.accumulator", data_alignment=32, offset_factor=256
+    )
+    C = te.compute((n, n), lambda i, j: A[i, j], name="C")
+    BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope="global", data_alignment=32, offset_factor=256)
 
     def intrin_func(ins, outs):
         ib = tvm.tir.ir_builder.create()
         BA = ins[0]
         BC = outs[0]
-        ib.emit(tvm.tir.call_intrin('handle', 'tir.tvm_store_matrix_sync',
-                                BA.data, n, n, n, BA.elem_offset // 256,
-                                BC.access_ptr('w'), n, 'row_major'))
+        ib.emit(
+            tvm.tir.call_intrin(
+                "handle",
+                "tir.tvm_store_matrix_sync",
+                BA.data,
+                n,
+                n,
+                n,
+                BA.elem_offset // 256,
+                BC.access_ptr("w"),
+                n,
+                "row_major",
+            )
+        )
         return ib.get()
 
     return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})
 
+
 ###############################################################################
 # Scheduling the Computation
 # --------------------------
@@ -256,12 +319,12 @@ warp_col_tiles = 4
 warp_size = 32
 chunk = 2
 
-block_x = te.thread_axis('blockIdx.x')
-block_y = te.thread_axis('blockIdx.y')
-block_z = te.thread_axis('blockIdx.z')
-thread_x = te.thread_axis('threadIdx.x')
-thread_y = te.thread_axis('threadIdx.y')
-thread_z = te.thread_axis('threadIdx.z')
+block_x = te.thread_axis("blockIdx.x")
+block_y = te.thread_axis("blockIdx.y")
+block_z = te.thread_axis("blockIdx.z")
+thread_x = te.thread_axis("threadIdx.x")
+thread_y = te.thread_axis("threadIdx.y")
+thread_z = te.thread_axis("threadIdx.z")
 
 nc, hc, wc, oc, nnc, ooc = Conv.op.axis
 block_k = s[Conv].fuse(hc, wc)
@@ -316,8 +379,8 @@ print(tvm.lower(s, [A, W, Conv], simple_mode=True))
 # The last phase is to lower the computation loops down to TensorCore hardware intrinsics
 # by mapping the 2D convolution to tensor intrinsics
 
-s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_a'))
-s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_b'))
+s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix("wmma.matrix_a"))
+s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix("wmma.matrix_b"))
 s[Conv].tensorize(nnc, intrin_wmma_store_matrix())
 s[ConvF].tensorize(nnf, intrin_wmma_gemm())
 print(tvm.lower(s, [A, W, Conv], simple_mode=True))
@@ -331,17 +394,15 @@ print(tvm.lower(s, [A, W, Conv], simple_mode=True))
 
 ctx = tvm.gpu(0)
 if nvcc.have_tensorcore(ctx.compute_version):
-    with tvm.transform.PassContext(config={"tir.UnrollLoop": {
-        "auto_max_step": 16
-    }}):
-        func = tvm.build(s, [A, W, Conv], 'cuda')
+    with tvm.transform.PassContext(config={"tir.UnrollLoop": {"auto_max_step": 16}}):
+        func = tvm.build(s, [A, W, Conv], "cuda")
     a_np = np.random.uniform(size=data_shape).astype(A.dtype)
     w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
     a = tvm.nd.array(a_np, ctx)
     w = tvm.nd.array(w_np, ctx)
     c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx)
     evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
-    print('conv2d with tensor core: %f ms' % (evaluator(a, w, c).mean * 1e3))
+    print("conv2d with tensor core: %f ms" % (evaluator(a, w, c).mean * 1e3))
 
 ###############################################################################
 # Summary
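
A detail that recurs in all three intrinsics above: `elem_offset // 256` converts a buffer element offset into a WMMA fragment index, because each 16x16 fragment holds exactly 256 elements (which is also why the buffers declare `offset_factor=256`). The arithmetic, as a tiny self-contained check:

    n = 16
    fragment_elems = n * n            # 256 elements per 16x16 fragment
    elem_offset = 3 * fragment_elems  # hypothetical offset: start of the fourth fragment
    assert elem_offset // fragment_elems == 3
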
diff --git a/docs/_downloads/e41367a7f459e4f4dca82180009c1539/tune_relay_mobile_gpu.py b/docs/_downloads/e41367a7f459e4f4dca82180009c1539/tune_relay_mobile_gpu.py
index 5f5e523..19fa601 100644
--- a/docs/_downloads/e41367a7f459e4f4dca82180009c1539/tune_relay_mobile_gpu.py
+++ b/docs/_downloads/e41367a7f459e4f4dca82180009c1539/tune_relay_mobile_gpu.py
@@ -76,31 +76,41 @@ import tvm.contrib.graph_runtime as runtime
 # We can load some pre-defined networks from :code:`relay.testing`.
 # We can also load models from MXNet, ONNX and TensorFlow.
 
+
 def get_network(name, batch_size):
     """Get the symbol definition and random weight of a network"""
     input_shape = (batch_size, 3, 224, 224)
     output_shape = (batch_size, 1000)
 
     if "resnet" in name:
-        n_layer = int(name.split('-')[1])
-        mod, params = relay.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
+        n_layer = int(name.split("-")[1])
+        mod, params = relay.testing.resnet.get_workload(
+            num_layers=n_layer, batch_size=batch_size, dtype=dtype
+        )
     elif "vgg" in name:
-        n_layer = int(name.split('-')[1])
-        mod, params = relay.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
-    elif name == 'mobilenet':
+        n_layer = int(name.split("-")[1])
+        mod, params = relay.testing.vgg.get_workload(
+            num_layers=n_layer, batch_size=batch_size, dtype=dtype
+        )
+    elif name == "mobilenet":
         mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype)
-    elif name == 'squeezenet_v1.1':
-        mod, params = relay.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1', dtype=dtype)
-    elif name == 'inception_v3':
+    elif name == "squeezenet_v1.1":
+        mod, params = relay.testing.squeezenet.get_workload(
+            batch_size=batch_size, version="1.1", dtype=dtype
+        )
+    elif name == "inception_v3":
         input_shape = (1, 3, 299, 299)
         mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
-    elif name == 'mxnet':
+    elif name == "mxnet":
         # an example of an MXNet model
         from mxnet.gluon.model_zoo.vision import get_model
-        block = get_model('resnet18_v1', pretrained=True)
-        mod, params = relay.frontend.from_mxnet(block, shape={'data': input_shape}, dtype=dtype)
+
+        block = get_model("resnet18_v1", pretrained=True)
+        mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype)
         net = mod["main"]
-        net = relay.Function(net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs)
+        net = relay.Function(
+            net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs
+        )
         mod = tvm.IRModule.from_expr(net)
     else:
         raise ValueError("Unsupported network: " + name)
@@ -187,35 +197,34 @@ def get_network(name, batch_size):
 
 #### DEVICE CONFIG ####
 
-target = tvm.target.create('opencl -device=mali')
+target = tvm.target.Target("opencl -device=mali")
 
 # Replace "aarch64-linux-gnu" with the correct target of your board.
 # This target host is used for cross compilation. You can query it by :code:`gcc -v` on your device.
-target_host = 'llvm -mtriple=aarch64-linux-gnu'
+target_host = "llvm -mtriple=aarch64-linux-gnu"
 
 # Also replace this with the device key in your tracker
-device_key = 'rk3399'
+device_key = "rk3399"
 
 # Set this to True if you use android phone
 use_android = False
 
 #### TUNING OPTION ####
-network = 'resnet-18'
+network = "resnet-18"
 log_file = "%s.%s.log" % (device_key, network)
-dtype = 'float32'
+dtype = "float32"
 
 tuning_option = {
-    'log_filename': log_file,
-
-    'tuner': 'xgb',
-    'n_trial': 1000,
-    'early_stopping': 450,
-
-    'measure_option': autotvm.measure_option(
-        builder=autotvm.LocalBuilder(
-            build_func='ndk' if use_android else 'default'),
+    "log_filename": log_file,
+    "tuner": "xgb",
+    "n_trial": 1000,
+    "early_stopping": 450,
+    "measure_option": autotvm.measure_option(
+        builder=autotvm.LocalBuilder(build_func="ndk" if use_android else "default"),
         runner=autotvm.RPCRunner(
-            device_key, host='0.0.0.0', port=9190,
+            device_key,
+            host="0.0.0.0",
+            port=9190,
             number=10,
             timeout=5,
         ),
@@ -242,29 +251,31 @@ tuning_option = {
 # We will introduce a more sophisticated tuning scheduler in the future.
 
 # You can skip the implementation of this function for this tutorial.
-def tune_tasks(tasks,
-               measure_option,
-               tuner='xgb',
-               n_trial=1000,
-               early_stopping=None,
-               log_filename='tuning.log',
-               use_transfer_learning=True):
+def tune_tasks(
+    tasks,
+    measure_option,
+    tuner="xgb",
+    n_trial=1000,
+    early_stopping=None,
+    log_filename="tuning.log",
+    use_transfer_learning=True,
+):
     # create tmp log file
     tmp_log_file = log_filename + ".tmp"
     if os.path.exists(tmp_log_file):
         os.remove(tmp_log_file)
 
     for i, tsk in enumerate(reversed(tasks)):
-        prefix = "[Task %2d/%2d] " % (i+1, len(tasks))
+        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
 
         # create tuner
-        if tuner == 'xgb' or tuner == 'xgb-rank':
-            tuner_obj = XGBTuner(tsk, loss_type='rank')
-        elif tuner == 'ga':
+        if tuner == "xgb" or tuner == "xgb-rank":
+            tuner_obj = XGBTuner(tsk, loss_type="rank")
+        elif tuner == "ga":
             tuner_obj = GATuner(tsk, pop_size=50)
-        elif tuner == 'random':
+        elif tuner == "random":
             tuner_obj = RandomTuner(tsk)
-        elif tuner == 'gridsearch':
+        elif tuner == "gridsearch":
             tuner_obj = GridSearchTuner(tsk)
         else:
             raise ValueError("Invalid tuner: " + tuner)
@@ -275,13 +286,15 @@ def tune_tasks(tasks,
 
         # do tuning
         tsk_trial = min(n_trial, len(tsk.config_space))
-        tuner_obj.tune(n_trial=tsk_trial,
-                       early_stopping=early_stopping,
-                       measure_option=measure_option,
-                       callbacks=[
-                           autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
-                           autotvm.callback.log_to_file(tmp_log_file)
-                       ])
+        tuner_obj.tune(
+            n_trial=tsk_trial,
+            early_stopping=early_stopping,
+            measure_option=measure_option,
+            callbacks=[
+                autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
+                autotvm.callback.log_to_file(tmp_log_file),
+            ],
+        )
 
     # pick best records to a cache file
     autotvm.record.pick_best(tmp_log_file, log_filename)
@@ -291,15 +304,18 @@ def tune_tasks(tasks,
 ########################################################################
 # Finally, we launch tuning jobs and evaluate the end-to-end performance.
 
+
 def tune_and_evaluate(tuning_opt):
     # extract workloads from relay program
     print("Extract tasks...")
     mod, params, input_shape, _ = get_network(network, batch_size=1)
-    tasks = autotvm.task.extract_from_program(mod["main"],
-                                              target=target,
-                                              target_host=target_host,
-                                              params=params,
-                                              ops=(relay.op.get("nn.conv2d"),))
+    tasks = autotvm.task.extract_from_program(
+        mod["main"],
+        target=target,
+        target_host=target_host,
+        params=params,
+        ops=(relay.op.get("nn.conv2d"),),
+    )
 
     # run tuning tasks
     print("Tuning...")
@@ -309,12 +325,14 @@ def tune_and_evaluate(tuning_opt):
     with autotvm.apply_history_best(log_file):
         print("Compile...")
         with tvm.transform.PassContext(opt_level=3):
-            graph, lib, params = relay.build_module.build(
-                mod, target=target, params=params, target_host=target_host)
+            lib = relay.build_module.build(
+                mod, target=target, params=params, target_host=target_host
+            )
         # export library
         tmp = tempdir()
         if use_android:
             from tvm.contrib import ndk
+
             filename = "net.so"
             lib.export_library(tmp.relpath(filename), ndk.create_shared)
         else:
@@ -323,24 +341,25 @@ def tune_and_evaluate(tuning_opt):
 
         # upload module to device
         print("Upload...")
-        remote = autotvm.measure.request_remote(device_key, '0.0.0.0', 9190,
-                                                timeout=10000)
+        remote = autotvm.measure.request_remote(device_key, "0.0.0.0", 9190, timeout=10000)
         remote.upload(tmp.relpath(filename))
         rlib = remote.load_module(filename)
 
         # upload parameters to device
         ctx = remote.context(str(target), 0)
-        module = runtime.create(graph, rlib, ctx)
+        module = runtime.GraphModule(rlib["default"](ctx))
         data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
-        module.set_input('data', data_tvm)
-        module.set_input(**params)
+        module.set_input("data", data_tvm)
 
         # evaluate
         print("Evaluate inference time cost...")
         ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=30)
         prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
-        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
-              (np.mean(prof_res), np.std(prof_res)))
+        print(
+            "Mean inference time (std dev): %.2f ms (%.2f ms)"
+            % (np.mean(prof_res), np.std(prof_res))
+        )
+
 
 # We do not run the tuning on our web server since it takes too long.
 # Uncomment the following line to run it by yourself.
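
Once tuning has produced `log_file`, the same `apply_history_best` context shown in `tune_and_evaluate` can rebuild the tuned library later without re-tuning. A minimal sketch, assuming `mod`, `params`, `target`, `target_host`, and `log_file` as defined in the tutorial (the output filename is hypothetical):

    with autotvm.apply_history_best(log_file):
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build_module.build(
                mod, target=target, params=params, target_host=target_host
            )
    lib.export_library("net_tuned.tar")  # pass ndk.create_shared here when targeting Android
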
diff --git a/docs/_downloads/e732d71c83de9fd8c6c7a72184d3ee0a/from_coreml.py b/docs/_downloads/e732d71c83de9fd8c6c7a72184d3ee0a/from_coreml.py
index f5db0f5..4e3f391 100644
--- a/docs/_downloads/e732d71c83de9fd8c6c7a72184d3ee0a/from_coreml.py
+++ b/docs/_downloads/e732d71c83de9fd8c6c7a72184d3ee0a/from_coreml.py
@@ -47,9 +47,9 @@ from PIL import Image
 # ----------------------------
 # We will download and load a pretrained mobilenet classification network
 # provided by Apple in this example.
-model_url = 'https://docs-assets.developer.apple.com/coreml/models/MobileNet.mlmodel'
-model_file = 'mobilenet.mlmodel'
-model_path = download_testdata(model_url, model_file, module='coreml')
+model_url = "https://docs-assets.developer.apple.com/coreml/models/MobileNet.mlmodel"
+model_file = "mobilenet.mlmodel"
+model_path = download_testdata(model_url, model_file, module="coreml")
 # Now you have mobilenet.mlmodel on disk
 mlmodel = cm.models.MLModel(model_path)
 
@@ -57,19 +57,19 @@ mlmodel = cm.models.MLModel(model_path)
 # Load a test image
 # ------------------
 # A single cat dominates the examples!
-img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
-img_path = download_testdata(img_url, 'cat.png', module='data')
+img_url = "https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true"
+img_path = download_testdata(img_url, "cat.png", module="data")
 img = Image.open(img_path).resize((224, 224))
 # Mobilenet.mlmodel's input is in BGR format
-img_bgr = np.array(img)[:,:,::-1]
+img_bgr = np.array(img)[:, :, ::-1]
 x = np.transpose(img_bgr, (2, 0, 1))[np.newaxis, :]
 
 ######################################################################
 # Compile the model on Relay
 # ---------------------------
 # We should be familiar with the process by now.
-target = 'llvm'
-shape_dict = {'image': x.shape}
+target = "llvm"
+shape_dict = {"image": x.shape}
 
 # Parse CoreML model and convert into Relay computation graph
 mod, params = relay.frontend.from_coreml(mlmodel, shape_dict)
@@ -82,11 +82,12 @@ with tvm.transform.PassContext(opt_level=3):
 # -------------------
 # The process is no different from the other examples
 from tvm.contrib import graph_runtime
+
 ctx = tvm.cpu(0)
-dtype = 'float32'
-m = graph_runtime.GraphModule(lib['default'](ctx))
+dtype = "float32"
+m = graph_runtime.GraphModule(lib["default"](ctx))
 # set inputs
-m.set_input('image', tvm.nd.array(x.astype(dtype)))
+m.set_input("image", tvm.nd.array(x.astype(dtype)))
 # execute
 m.run()
 # get outputs
@@ -97,13 +98,17 @@ top1 = np.argmax(tvm_output.asnumpy()[0])
 # Look up synset name
 # -------------------
 # Look up the prediction's top-1 index in the 1000-class synset.
-synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
-                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',
-                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',
-                      'imagenet1000_clsid_to_human.txt'])
-synset_name = 'imagenet1000_clsid_to_human.txt'
-synset_path = download_testdata(synset_url, synset_name, module='data')
+synset_url = "".join(
+    [
+        "https://gist.githubusercontent.com/zhreshold/",
+        "4d0b62f3d01426887599d4f7ede23ee5/raw/",
+        "596b27d23537e5a1b5751d2b0481ef172f58b539/",
+        "imagenet1000_clsid_to_human.txt",
+    ]
+)
+synset_name = "imagenet1000_clsid_to_human.txt"
+synset_path = download_testdata(synset_url, synset_name, module="data")
 with open(synset_path) as f:
     synset = eval(f.read())
 # You should see the following result: Top-1 id 282 class name tiger cat
-print('Top-1 id', top1, 'class name', synset[top1])
+print("Top-1 id", top1, "class name", synset[top1])
diff --git a/docs/_downloads/e87c21d127b0b825efcf978b9f8e2cd7/low_level_custom_pass.ipynb b/docs/_downloads/e87c21d127b0b825efcf978b9f8e2cd7/low_level_custom_pass.ipynb
index f8b0ee5..dabf31b 100644
--- a/docs/_downloads/e87c21d127b0b825efcf978b9f8e2cd7/low_level_custom_pass.ipynb
+++ b/docs/_downloads/e87c21d127b0b825efcf978b9f8e2cd7/low_level_custom_pass.ipynb
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "n = tvm.tir.const(128, \"int32\")\na = te.placeholder((n, ), name=\"a\")\nb = te.placeholder((n, ), name=\"b\")\nc = te.compute((n, ), lambda i: a[i] + b[i], name='c')\n\nsch = te.create_schedule(c.op)\nir  = tvm.lower(sch, [a, b, c])\nprint(ir)"
+        "n = tvm.tir.const(128, \"int32\")\na = te.placeholder((n,), name=\"a\")\nb = te.placeholder((n,), name=\"b\")\nc = te.compute((n,), lambda i: a[i] + b[i], name=\"c\")\n\nsch = te.create_schedule(c.op)\nir = tvm.lower(sch, [a, b, c])\nprint(ir)"
       ]
     },
     {
@@ -69,7 +69,7 @@
       },
       "outputs": [],
       "source": [
-        "loops = []\ndef find_width8(op):\n    \"\"\" Find all the 'tir.For' nodes whose extent can be divided by 8. \"\"\"\n    if isinstance(op, tvm.tir.For):\n        if isinstance(op.extent, tvm.tir.IntImm):\n            if op.extent.value % 8 == 0:\n                loops.append(op)"
+        "loops = []\n\n\ndef find_width8(op):\n    \"\"\" Find all the 'tir.For' nodes whose extent can be divided by 8. \"\"\"\n    if isinstance(op, tvm.tir.For):\n        if isinstance(op.extent, tvm.tir.IntImm):\n            if op.extent.value % 8 == 0:\n                loops.append(op)"
       ]
     },
     {
@@ -87,7 +87,7 @@
       },
       "outputs": [],
       "source": [
-        "def vectorize8(op):\n    \"\"\" Split can vectorize the loops found in `find_width8`. \"\"\"\n    if op in loops:\n        extent = op.extent.value\n        name = op.loop_var.name\n        lo, li = te.var(name + '.outer'), te.var(name + '.inner')\n        body = tvm.tir.stmt_functor.substitute(op.body, {op.loop_var: lo * 8 + li})\n        body = tvm.tir.For(li, 0, 8, tvm.tir.For.Vectorized, 0, body)\n        body = tvm.tir.For(lo, 0, extent // 8, tvm.tir.For.Serial, 0, body)\n  [...]
+        "def vectorize8(op):\n    \"\"\" Split can vectorize the loops found in `find_width8`. \"\"\"\n    if op in loops:\n        extent = op.extent.value\n        name = op.loop_var.name\n        lo, li = te.var(name + \".outer\"), te.var(name + \".inner\")\n        body = tvm.tir.stmt_functor.substitute(op.body, {op.loop_var: lo * 8 + li})\n        body = tvm.tir.For(li, 0, 8, tvm.tir.For.Vectorized, 0, body)\n        body = tvm.tir.For(lo, 0, extent // 8, tvm.tir.For.Serial, 0, body [...]
       ]
     },
     {
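
The `find_width8` and `vectorize8` helpers in these cells only collect and rewrite `For` nodes; driving them over the lowered module uses the visitor utilities in `tvm.tir.stmt_functor`. A minimal sketch of that driver, assuming `ir` from the first cell and both helpers in scope (the API names are as recalled for this TVM version; treat them as an assumption):

    func = ir["main"]
    # First pass: record every loop whose extent is divisible by 8.
    tvm.tir.stmt_functor.post_order_visit(func.body, find_width8)
    # Second pass: rewrite only 'tir.For' nodes via the post-order callback.
    new_body = tvm.tir.stmt_functor.ir_transform(func.body, None, vectorize8, ["tir.For"])
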
diff --git a/docs/_downloads/e92c7219a1cd7838e61f9683f4228a7f/from_onnx.ipynb b/docs/_downloads/e92c7219a1cd7838e61f9683f4228a7f/from_onnx.ipynb
index af31db1..5ba58a7 100644
--- a/docs/_downloads/e92c7219a1cd7838e61f9683f4228a7f/from_onnx.ipynb
+++ b/docs/_downloads/e92c7219a1cd7838e61f9683f4228a7f/from_onnx.ipynb
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "model_url = ''.join(['https://gist.github.com/zhreshold/',\n                     'bcda4716699ac97ea44f791c24310193/raw/',\n                     '93672b029103648953c4e5ad3ac3aadf346a4cdc/',\n                     'super_resolution_0.2.onnx'])\nmodel_path = download_testdata(model_url, 'super_resolution.onnx', module='onnx')\n# now you have super_resolution.onnx on disk\nonnx_model = onnx.load(model_path)"
+        "model_url = \"\".join(\n    [\n        \"https://gist.github.com/zhreshold/\",\n        \"bcda4716699ac97ea44f791c24310193/raw/\",\n        \"93672b029103648953c4e5ad3ac3aadf346a4cdc/\",\n        \"super_resolution_0.2.onnx\",\n    ]\n)\nmodel_path = download_testdata(model_url, \"super_resolution.onnx\", module=\"onnx\")\n# now you have super_resolution.onnx on disk\nonnx_model = onnx.load(model_path)"
       ]
     },
     {
@@ -62,7 +62,7 @@
       },
       "outputs": [],
       "source": [
-        "from PIL import Image\nimg_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'\nimg_path = download_testdata(img_url, 'cat.png', module='data')\nimg = Image.open(img_path).resize((224, 224))\nimg_ycbcr = img.convert(\"YCbCr\")  # convert to YCbCr\nimg_y, img_cb, img_cr = img_ycbcr.split()\nx = np.array(img_y)[np.newaxis, np.newaxis, :, :]"
+        "from PIL import Image\n\nimg_url = \"https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true\"\nimg_path = download_testdata(img_url, \"cat.png\", module=\"data\")\nimg = Image.open(img_path).resize((224, 224))\nimg_ycbcr = img.convert(\"YCbCr\")  # convert to YCbCr\nimg_y, img_cb, img_cr = img_ycbcr.split()\nx = np.array(img_y)[np.newaxis, np.newaxis, :, :]"
       ]
     },
     {
@@ -80,7 +80,7 @@
       },
       "outputs": [],
       "source": [
-        "target = 'llvm'\n\ninput_name = '1'\nshape_dict = {input_name: x.shape}\nmod, params = relay.frontend.from_onnx(onnx_model, shape_dict)\n\nwith tvm.transform.PassContext(opt_level=1):\n    intrp = relay.build_module.create_executor('graph', mod, tvm.cpu(0), target)"
+        "target = \"llvm\"\n\ninput_name = \"1\"\nshape_dict = {input_name: x.shape}\nmod, params = relay.frontend.from_onnx(onnx_model, shape_dict)\n\nwith tvm.transform.PassContext(opt_level=1):\n    intrp = relay.build_module.create_executor(\"graph\", mod, tvm.cpu(0), target)"
       ]
     },
     {
@@ -98,7 +98,7 @@
       },
       "outputs": [],
       "source": [
-        "dtype = 'float32'\ntvm_output = intrp.evaluate()(tvm.nd.array(x.astype(dtype)), **params).asnumpy()"
+        "dtype = \"float32\"\ntvm_output = intrp.evaluate()(tvm.nd.array(x.astype(dtype)), **params).asnumpy()"
       ]
     },
     {
@@ -116,7 +116,7 @@
       },
       "outputs": [],
       "source": [
-        "from matplotlib import pyplot as plt\nout_y = Image.fromarray(np.uint8((tvm_output[0, 0]).clip(0, 255)), mode='L')\nout_cb = img_cb.resize(out_y.size, Image.BICUBIC)\nout_cr = img_cr.resize(out_y.size, Image.BICUBIC)\nresult = Image.merge('YCbCr', [out_y, out_cb, out_cr]).convert('RGB')\ncanvas = np.full((672, 672*2, 3), 255)\ncanvas[0:224, 0:224, :] = np.asarray(img)\ncanvas[:, 672:, :] = np.asarray(result)\nplt.imshow(canvas.astype(np.uint8))\nplt.show()"
+        "from matplotlib import pyplot as plt\n\nout_y = Image.fromarray(np.uint8((tvm_output[0, 0]).clip(0, 255)), mode=\"L\")\nout_cb = img_cb.resize(out_y.size, Image.BICUBIC)\nout_cr = img_cr.resize(out_y.size, Image.BICUBIC)\nresult = Image.merge(\"YCbCr\", [out_y, out_cb, out_cr]).convert(\"RGB\")\ncanvas = np.full((672, 672 * 2, 3), 255)\ncanvas[0:224, 0:224, :] = np.asarray(img)\ncanvas[:, 672:, :] = np.asarray(result)\nplt.imshow(canvas.astype(np.uint8))\nplt.show()"
       ]
     }
   ],
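
The post-processing cell above runs the network only on the luminance (Y) plane; the chrominance planes are upscaled with plain bicubic resampling and merged back. A self-contained PIL sketch of just that merge step (a 4x4 toy image and a 2x upscale stand in for the real model output):

    import numpy as np
    from PIL import Image

    img = Image.fromarray(np.random.randint(0, 255, (4, 4, 3), dtype=np.uint8)).convert("YCbCr")
    img_y, img_cb, img_cr = img.split()
    out_y = img_y.resize((8, 8), Image.BICUBIC)  # stand-in for the super-resolved Y plane
    out_cb = img_cb.resize(out_y.size, Image.BICUBIC)
    out_cr = img_cr.resize(out_y.size, Image.BICUBIC)
    result = Image.merge("YCbCr", [out_y, out_cb, out_cr]).convert("RGB")
    assert result.size == (8, 8)
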
diff --git a/docs/_downloads/ea0c81cab71096d16b825a33fd276c58/from_mxnet.py b/docs/_downloads/ea0c81cab71096d16b825a33fd276c58/from_mxnet.py
index d75ec00..d81b211 100644
--- a/docs/_downloads/ea0c81cab71096d16b825a33fd276c58/from_mxnet.py
+++ b/docs/_downloads/ea0c81cab71096d16b825a33fd276c58/from_mxnet.py
@@ -49,31 +49,38 @@ from tvm.contrib.download import download_testdata
 from mxnet.gluon.model_zoo.vision import get_model
 from PIL import Image
 from matplotlib import pyplot as plt
-block = get_model('resnet18_v1', pretrained=True)
-img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
-img_name = 'cat.png'
-synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
-                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',
-                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',
-                      'imagenet1000_clsid_to_human.txt'])
-synset_name = 'imagenet1000_clsid_to_human.txt'
-img_path = download_testdata(img_url, 'cat.png', module='data')
-synset_path = download_testdata(synset_url, synset_name, module='data')
+
+block = get_model("resnet18_v1", pretrained=True)
+img_url = "https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true"
+img_name = "cat.png"
+synset_url = "".join(
+    [
+        "https://gist.githubusercontent.com/zhreshold/",
+        "4d0b62f3d01426887599d4f7ede23ee5/raw/",
+        "596b27d23537e5a1b5751d2b0481ef172f58b539/",
+        "imagenet1000_clsid_to_human.txt",
+    ]
+)
+synset_name = "imagenet1000_clsid_to_human.txt"
+img_path = download_testdata(img_url, "cat.png", module="data")
+synset_path = download_testdata(synset_url, synset_name, module="data")
 with open(synset_path) as f:
     synset = eval(f.read())
 image = Image.open(img_path).resize((224, 224))
 plt.imshow(image)
 plt.show()
 
+
 def transform_image(image):
-    image = np.array(image) - np.array([123., 117., 104.])
+    image = np.array(image) - np.array([123.0, 117.0, 104.0])
     image /= np.array([58.395, 57.12, 57.375])
     image = image.transpose((2, 0, 1))
     image = image[np.newaxis, :]
     return image
 
+
 x = transform_image(image)
-print('x', x.shape)
+print("x", x.shape)
 
 ######################################################################
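
The constants in `transform_image` above are the familiar ImageNet per-channel means (~123.68, 116.78, 103.94, rounded here) and standard deviations (0.229/0.224/0.225 scaled by 255); the function maps an HWC image to a normalized NCHW float batch. A self-contained shape check:

    import numpy as np

    def transform_image(image):
        image = np.array(image) - np.array([123.0, 117.0, 104.0])
        image /= np.array([58.395, 57.12, 57.375])
        return image.transpose((2, 0, 1))[np.newaxis, :]

    dummy = np.zeros((224, 224, 3), dtype=np.uint8)
    assert transform_image(dummy).shape == (1, 3, 224, 224)
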
... 224060 lines suppressed ...