Posted to commits@tvm.apache.org by tq...@apache.org on 2020/03/30 18:38:55 UTC

[incubator-tvm-site] branch asf-site updated: Docs build at Mon Mar 30 11:38:43 PDT 2020

This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/incubator-tvm-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
     new 6695369  Docs build at Mon Mar 30 11:38:43 PDT 2020
6695369 is described below

commit 6695369c33e3d432a18a0adfe7fc77c55f898168
Author: tqchen <tq...@octoml.ai>
AuthorDate: Mon Mar 30 11:38:43 2020 -0700

    Docs build at Mon Mar 30 11:38:43 PDT 2020
---
 .../tuple_inputs.ipynb                             |   122 +
 .../from_tflite.py                                 |   199 +
 .../tune_simple_template.py                        |   327 +
 .../from_tflite.ipynb                              |   151 +
 .../tune_simple_template.ipynb                     |   190 +
 .../opt_matmul_auto_tensorcore.py                  |   531 +
 .../matrix_multiply.py                             |   482 +
 .../143c743c62f58570eabd77fd3395ca8c/scan.py       |   153 +
 .../tensor_expr_get_started.ipynb                  |   287 +
 .../tutorials_jupyter.zip                          |   Bin 0 -> 473912 bytes
 .../tune_conv2d_cuda.ipynb                         |   115 +
 .../tune_relay_cuda.py                             |   378 +
 .../relay_pass_infra.ipynb                         |   252 +
 .../tune_relay_mobile_gpu.ipynb                    |   168 +
 .../tune_conv2d_cuda.py                            |   237 +
 .../matrix_multiply_opt.ipynb                      |   176 +
 .../from_keras.ipynb                               |   144 +
 .../from_coreml.ipynb                              |   144 +
 .../from_caffe2.ipynb                              |   133 +
 .../deploy_model_on_android.ipynb                  |   190 +
 .../deploy_model_on_android.py                     |   360 +
 .../relay_quick_start.py                           |   160 +
 .../opt_gemm.ipynb                                 |   309 +
 .../using_external_lib.py                          |   561 +
 .../tune_relay_vta.ipynb                           |   186 +
 .../cross_compilation_and_rpc.py                   |   263 +
 .../4e9540fc014621d8d3bd14869c1ab227/scan.ipynb    |   169 +
 .../deploy_quantized.ipynb                         |   144 +
 .../from_tensorflow.py                             |   240 +
 .../relay_pass_infra.py                            |   248 +
 .../intro_topi.ipynb                               |   230 +
 .../deploy_quantized.py                            |   159 +
 .../5bd1bb9c6505ea40407fa19f01579414/reduction.py  |   196 +
 .../intrin_math.ipynb                              |   169 +
 .../tune_relay_vta.py                              |   494 +
 .../schedule_primitives.ipynb                      |   284 +
 .../tensorize.ipynb                                |   241 +
 .../deploy_detection.ipynb                         |   169 +
 .../opt_conv_cuda.ipynb                            |   151 +
 .../matrix_multiply_opt.py                         |   391 +
 .../tuple_inputs.py                                |   120 +
 .../696dd37904ef92773435ca321ff41bfb/from_onnx.py  |    99 +
 .../tutorials_python.zip                           |   Bin 0 -> 116302 bytes
 .../using_external_lib.ipynb                       |   147 +
 .../from_pytorch.ipynb                             |   162 +
 .../70d345c5409f99cb5de9dc44f147ff6f/build_gcn.py  |   361 +
 .../from_caffe2.py                                 |   133 +
 .../tune_relay_cuda.ipynb                          |   172 +
 .../cross_compilation_and_rpc.ipynb                |   179 +
 .../7ece74acc230c7d55086182cc8884b09/extern_op.py  |   126 +
 .../deploy_ssd_gluoncv.ipynb                       |   144 +
 .../from_darknet.ipynb                             |   144 +
 .../836dc3852acf09662e9eb37c4c5e1e1b/opt_gemm.py   |   380 +
 .../deploy_model_on_rasp.py                        |   226 +
 .../tune_relay_x86.py                              |   240 +
 .../extern_op.ipynb                                |   133 +
 .../opt_matmul_auto_tensorcore.ipynb               |   111 +
 .../schedule_primitives.py                         |   208 +
 .../opt_conv_tensorcore.ipynb                      |   165 +
 .../9a950897eeef498440fbe2f0afe2601f/tedd.py       |   160 +
 .../9b0365fd5723f7c4d4e996637ab9a487/intro_topi.py |   148 +
 .../low_level_custom_pass.py                       |   173 +
 .../relay_quick_start.ipynb                        |   144 +
 .../a2f661bf234a167b5458fa28d8fafedc/tedd.ipynb    |   165 +
 .../from_darknet.py                                |   206 +
 .../opt_conv_cuda.py                               |   246 +
 .../tune_relay_x86.ipynb                           |   115 +
 .../baa4de13ce6d932de43e0eb5c4cb8f16/tensorize.py  |   310 +
 .../tune_relay_arm.py                              |   400 +
 .../vta_get_started.py                             |   404 +
 .../deploy_model_on_rasp.ipynb                     |   201 +
 .../build_gcn.ipynb                                |   223 +
 .../deploy_classification.ipynb                    |   133 +
 .../convolution_opt.ipynb                          |   194 +
 .../deploy_ssd_gluoncv.py                          |   118 +
 .../tutorials_python.zip                           |   Bin 0 -> 409257 bytes
 .../convolution_opt.py                             |   455 +
 .../matrix_multiply.ipynb                          |   311 +
 .../tensor_expr_get_started.py                     |   318 +
 .../tutorials_jupyter.zip                          |   Bin 0 -> 131891 bytes
 .../from_tensorflow.ipynb                          |   216 +
 .../opt_conv_tensorcore.py                         |   348 +
 .../tune_relay_mobile_gpu.py                       |   401 +
 .../from_coreml.py                                 |   112 +
 .../low_level_custom_pass.ipynb                    |   158 +
 .../from_onnx.ipynb                                |   144 +
 .../ea0c81cab71096d16b825a33fd276c58/from_mxnet.py |   138 +
 .../reduction.ipynb                                |   248 +
 .../deploy_detection.py                            |   330 +
 .../deploy_classification.py                       |   290 +
 .../from_mxnet.ipynb                               |   162 +
 .../f59fd8b968f7dcde34ed872c8527c192/from_keras.py |   108 +
 .../vta_get_started.ipynb                          |   373 +
 .../from_pytorch.py                                |   166 +
 .../tune_relay_arm.ipynb                           |   168 +
 .../intrin_math.py                                 |   168 +
 docs/_images/sphx_glr_build_gcn_thumb.png          |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_convolution_opt_thumb.png    |   Bin 0 -> 26786 bytes
 .../sphx_glr_cross_compilation_and_rpc_thumb.png   |   Bin 0 -> 26786 bytes
 .../_images/sphx_glr_deploy_classification_001.png |   Bin 0 -> 149470 bytes
 .../sphx_glr_deploy_classification_thumb.png       |   Bin 0 -> 106745 bytes
 docs/_images/sphx_glr_deploy_detection_001.png     |   Bin 0 -> 330958 bytes
 docs/_images/sphx_glr_deploy_detection_thumb.png   |   Bin 0 -> 125541 bytes
 .../sphx_glr_deploy_model_on_android_thumb.png     |   Bin 0 -> 26786 bytes
 .../sphx_glr_deploy_model_on_rasp_thumb.png        |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_deploy_quantized_thumb.png   |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_deploy_ssd_gluoncv_001.png   |   Bin 0 -> 283871 bytes
 docs/_images/sphx_glr_deploy_ssd_gluoncv_thumb.png |   Bin 0 -> 108007 bytes
 docs/_images/sphx_glr_extern_op_thumb.png          |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_from_caffe2_001.png          |   Bin 0 -> 149470 bytes
 docs/_images/sphx_glr_from_caffe2_thumb.png        |   Bin 0 -> 106745 bytes
 docs/_images/sphx_glr_from_coreml_thumb.png        |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_from_darknet_001.png         |   Bin 0 -> 351484 bytes
 docs/_images/sphx_glr_from_darknet_thumb.png       |   Bin 0 -> 136338 bytes
 docs/_images/sphx_glr_from_keras_001.png           |   Bin 0 -> 149470 bytes
 docs/_images/sphx_glr_from_keras_thumb.png         |   Bin 0 -> 106745 bytes
 docs/_images/sphx_glr_from_mxnet_001.png           |   Bin 0 -> 149470 bytes
 docs/_images/sphx_glr_from_mxnet_thumb.png         |   Bin 0 -> 106745 bytes
 docs/_images/sphx_glr_from_onnx_001.png            |   Bin 0 -> 173547 bytes
 docs/_images/sphx_glr_from_onnx_thumb.png          |   Bin 0 -> 66996 bytes
 docs/_images/sphx_glr_from_pytorch_thumb.png       |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_from_tensorflow_thumb.png    |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_from_tflite_001.png          |   Bin 0 -> 149470 bytes
 docs/_images/sphx_glr_from_tflite_thumb.png        |   Bin 0 -> 106745 bytes
 docs/_images/sphx_glr_intrin_math_thumb.png        |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_intro_topi_thumb.png         |   Bin 0 -> 26786 bytes
 .../sphx_glr_low_level_custom_pass_thumb.png       |   Bin 0 -> 26786 bytes
 .../_images/sphx_glr_matrix_multiply_opt_thumb.png |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_matrix_multiply_thumb.png    |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_opt_conv_cuda_thumb.png      |   Bin 0 -> 26786 bytes
 .../_images/sphx_glr_opt_conv_tensorcore_thumb.png |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_opt_gemm_thumb.png           |   Bin 0 -> 26786 bytes
 .../sphx_glr_opt_matmul_auto_tensorcore_thumb.png  |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_reduction_thumb.png          |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_relay_pass_infra_thumb.png   |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_relay_quick_start_thumb.png  |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_scan_thumb.png               |   Bin 0 -> 26786 bytes
 .../_images/sphx_glr_schedule_primitives_thumb.png |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_tedd_thumb.png               |   Bin 0 -> 26786 bytes
 .../sphx_glr_tensor_expr_get_started_thumb.png     |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_tensorize_thumb.png          |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_tune_conv2d_cuda_thumb.png   |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_tune_relay_arm_thumb.png     |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_tune_relay_cuda_thumb.png    |   Bin 0 -> 26786 bytes
 .../sphx_glr_tune_relay_mobile_gpu_thumb.png       |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_tune_relay_vta_thumb.png     |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_tune_relay_x86_thumb.png     |   Bin 0 -> 26786 bytes
 .../sphx_glr_tune_simple_template_thumb.png        |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_tuple_inputs_thumb.png       |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_using_external_lib_thumb.png |   Bin 0 -> 26786 bytes
 docs/_images/sphx_glr_vta_get_started_thumb.png    |   Bin 0 -> 26786 bytes
 docs/_sources/api/python/autotvm.rst.txt           |    90 +
 docs/_sources/api/python/contrib.rst.txt           |   134 +
 docs/_sources/api/python/driver.rst.txt            |    24 +
 docs/_sources/api/python/error.rst.txt             |    23 +
 docs/_sources/api/python/graph_runtime.rst.txt     |    21 +
 docs/_sources/api/python/hybrid.rst.txt            |    23 +
 docs/_sources/api/python/index.rst.txt             |    39 +
 docs/_sources/api/python/ir.rst.txt                |    23 +
 docs/_sources/api/python/ndarray.rst.txt           |    27 +
 docs/_sources/api/python/relay/analysis.rst.txt    |    25 +
 docs/_sources/api/python/relay/backend.rst.txt     |    33 +
 docs/_sources/api/python/relay/frontend.rst.txt    |    38 +
 docs/_sources/api/python/relay/image.rst.txt       |    25 +
 docs/_sources/api/python/relay/index.rst.txt       |    48 +
 docs/_sources/api/python/relay/nn.rst.txt          |    23 +
 docs/_sources/api/python/relay/op.rst.txt          |    24 +
 docs/_sources/api/python/relay/testing.rst.txt     |    21 +
 docs/_sources/api/python/relay/transform.rst.txt   |    25 +
 docs/_sources/api/python/relay/vision.rst.txt      |    25 +
 docs/_sources/api/python/rpc.rst.txt               |    23 +
 docs/_sources/api/python/runtime.rst.txt           |    50 +
 docs/_sources/api/python/target.rst.txt            |    23 +
 docs/_sources/api/python/te.rst.txt                |    25 +
 docs/_sources/api/python/tir.rst.txt               |    33 +
 docs/_sources/api/python/topi.rst.txt              |   234 +
 docs/_sources/api/python/vta/index.rst.txt         |    45 +
 docs/_sources/api_links.rst.txt                    |    25 +
 docs/_sources/contribute/code_guide.rst.txt        |    58 +
 docs/_sources/contribute/code_review.rst.txt       |   105 +
 docs/_sources/contribute/committer_guide.rst.txt   |   103 +
 docs/_sources/contribute/community.rst.txt         |    50 +
 docs/_sources/contribute/document.rst.txt          |   119 +
 docs/_sources/contribute/error_handling.rst.txt    |   122 +
 docs/_sources/contribute/git_howto.rst.txt         |   137 +
 docs/_sources/contribute/index.rst.txt             |    51 +
 docs/_sources/contribute/pull_request.rst.txt      |   120 +
 docs/_sources/contribute/release_process.rst.txt   |   182 +
 docs/_sources/deploy/android.md.txt                |    39 +
 docs/_sources/deploy/aocl_fpga.md.txt              |   109 +
 docs/_sources/deploy/aws_fpga.md.txt               |   170 +
 docs/_sources/deploy/cpp_deploy.md.txt             |    52 +
 docs/_sources/deploy/index.rst.txt                 |    71 +
 docs/_sources/deploy/integrate.md.txt              |    67 +
 docs/_sources/dev/benchmark.rst.txt                |   137 +
 docs/_sources/dev/codebase_walkthrough.rst.txt     |   229 +
 docs/_sources/dev/convert_layout.rst.txt           |   242 +
 docs/_sources/dev/debugger.rst.txt                 |   172 +
 docs/_sources/dev/hybrid_script.rst.txt            |    93 +
 docs/_sources/dev/index.rst.txt                    |    42 +
 docs/_sources/dev/inferbound.rst.txt               |   765 ++
 .../introduction_to_module_serialization.rst.txt   |   226 +
 docs/_sources/dev/relay_add_op.rst.txt             |   268 +
 docs/_sources/dev/relay_add_pass.rst.txt           |   406 +
 .../dev/relay_bring_your_own_codegen.rst.txt       |   960 ++
 docs/_sources/dev/relay_intro.rst.txt              |   206 +
 docs/_sources/dev/relay_op_strategy.rst.txt        |   282 +
 docs/_sources/dev/relay_pass_infra.rst.txt         |   666 ++
 docs/_sources/dev/runtime.rst.txt                  |   295 +
 docs/_sources/dev/security.rst.txt                 |    43 +
 docs/_sources/dev/virtual_machine.rst.txt          |   404 +
 docs/_sources/faq.rst.txt                          |    69 +
 docs/_sources/frontend/tensorflow.rst.txt          |   249 +
 docs/_sources/genindex.rst.txt                     |    19 +
 docs/_sources/index.rst.txt                        |    61 +
 docs/_sources/install/docker.rst.txt               |    74 +
 docs/_sources/install/from_source.rst.txt          |   241 +
 docs/_sources/install/index.rst.txt                |    32 +
 docs/_sources/install/nnpack.md.txt                |    98 +
 docs/_sources/langref/hybrid_script.rst.txt        |   234 +
 docs/_sources/langref/index.rst.txt                |    60 +
 docs/_sources/langref/relay_adt.rst.txt            |   533 +
 docs/_sources/langref/relay_expr.rst.txt           |   693 ++
 docs/_sources/langref/relay_op.rst.txt             |   221 +
 docs/_sources/langref/relay_type.rst.txt           |   398 +
 .../tutorials/autotvm/sg_execution_times.rst.txt   |    15 +
 .../tutorials/autotvm/tune_conv2d_cuda.rst.txt     |   345 +
 .../tutorials/autotvm/tune_relay_arm.rst.txt       |   462 +
 .../tutorials/autotvm/tune_relay_cuda.rst.txt      |   449 +
 .../autotvm/tune_relay_mobile_gpu.rst.txt          |   463 +
 .../tutorials/autotvm/tune_relay_x86.rst.txt       |   300 +
 .../tutorials/autotvm/tune_simple_template.rst.txt |   439 +
 .../tutorials/cross_compilation_and_rpc.rst.txt    |   345 +
 .../tutorials/dev/low_level_custom_pass.rst.txt    |   291 +
 .../tutorials/dev/relay_pass_infra.rst.txt         |   552 +
 .../tutorials/dev/sg_execution_times.rst.txt       |    11 +
 docs/_sources/tutorials/frontend/build_gcn.rst.txt |   513 +
 .../frontend/deploy_model_on_android.rst.txt       |   477 +
 .../frontend/deploy_model_on_rasp.rst.txt          |   342 +
 .../tutorials/frontend/deploy_quantized.rst.txt    |   254 +
 .../tutorials/frontend/deploy_ssd_gluoncv.rst.txt  |   318 +
 .../tutorials/frontend/from_caffe2.rst.txt         |   222 +
 .../tutorials/frontend/from_coreml.rst.txt         |   232 +
 .../tutorials/frontend/from_darknet.rst.txt        |   325 +
 .../_sources/tutorials/frontend/from_keras.rst.txt |   216 +
 .../_sources/tutorials/frontend/from_mxnet.rst.txt |   245 +
 docs/_sources/tutorials/frontend/from_onnx.rst.txt |   218 +
 .../tutorials/frontend/from_pytorch.rst.txt        |   272 +
 .../tutorials/frontend/from_tensorflow.rst.txt     |   404 +
 .../tutorials/frontend/from_tflite.rst.txt         |   307 +
 .../tutorials/frontend/sg_execution_times.rst.txt  |    24 +
 .../tutorials/frontend/using_external_lib.rst.txt  |   647 ++
 docs/_sources/tutorials/index.rst.txt              |   896 ++
 docs/_sources/tutorials/language/extern_op.rst.txt |   203 +
 .../tutorials/language/intrin_math.rst.txt         |   345 +
 docs/_sources/tutorials/language/reduction.rst.txt |   483 +
 docs/_sources/tutorials/language/scan.rst.txt      |   317 +
 .../tutorials/language/schedule_primitives.rst.txt |   543 +
 .../tutorials/language/sg_execution_times.rst.txt  |    17 +
 docs/_sources/tutorials/language/tedd.rst.txt      |   235 +
 docs/_sources/tutorials/language/tensorize.rst.txt |   502 +
 .../tutorials/language/tuple_inputs.rst.txt        |   238 +
 .../tutorials/optimize/opt_conv_cuda.rst.txt       |   331 +
 .../tutorials/optimize/opt_conv_tensorcore.rst.txt |   610 ++
 docs/_sources/tutorials/optimize/opt_gemm.rst.txt  |   794 ++
 .../optimize/opt_matmul_auto_tensorcore.rst.txt    |   577 +
 .../tutorials/optimize/sg_execution_times.rst.txt  |    13 +
 docs/_sources/tutorials/relay_quick_start.rst.txt  |   375 +
 docs/_sources/tutorials/sg_execution_times.rst.txt |    12 +
 .../tutorials/tensor_expr_get_started.rst.txt      |   481 +
 docs/_sources/tutorials/topi/intro_topi.rst.txt    |   674 ++
 .../tutorials/topi/sg_execution_times.rst.txt      |    10 +
 docs/_sources/vta/dev/config.rst.txt               |    74 +
 docs/_sources/vta/dev/hardware.rst.txt             |   300 +
 docs/_sources/vta/dev/index.rst.txt                |    31 +
 docs/_sources/vta/index.rst.txt                    |    55 +
 docs/_sources/vta/install.md.txt                   |   419 +
 .../tutorials/autotvm/sg_execution_times.rst.txt   |    10 +
 .../vta/tutorials/autotvm/tune_relay_vta.rst.txt   |   585 +
 .../frontend/deploy_classification.rst.txt         |   395 +
 .../tutorials/frontend/deploy_detection.rst.txt    |   463 +
 .../tutorials/frontend/sg_execution_times.rst.txt  |    11 +
 docs/_sources/vta/tutorials/index.rst.txt          |   218 +
 .../_sources/vta/tutorials/matrix_multiply.rst.txt |   753 ++
 .../vta/tutorials/optimize/convolution_opt.rst.txt |   900 ++
 .../tutorials/optimize/matrix_multiply_opt.rst.txt |   720 ++
 .../tutorials/optimize/sg_execution_times.rst.txt  |    11 +
 .../vta/tutorials/sg_execution_times.rst.txt       |    11 +
 .../_sources/vta/tutorials/vta_get_started.rst.txt |   643 ++
 docs/_static/basic.css                             |   768 ++
 docs/_static/broken_example.png                    |   Bin 0 -> 21404 bytes
 docs/_static/css/badge_only.css                    |     1 +
 docs/_static/css/theme.css                         |     6 +
 docs/_static/css/tvm_theme.css                     |    43 +
 docs/_static/doctools.js                           |   315 +
 docs/_static/documentation_options.js              |    11 +
 docs/_static/file.png                              |   Bin 0 -> 286 bytes
 docs/_static/fonts/Inconsolata-Bold.ttf            |   Bin 0 -> 109948 bytes
 docs/_static/fonts/Inconsolata-Regular.ttf         |   Bin 0 -> 96964 bytes
 docs/_static/fonts/Inconsolata.ttf                 |   Bin 0 -> 63184 bytes
 docs/_static/fonts/Lato-Bold.ttf                   |   Bin 0 -> 656544 bytes
 docs/_static/fonts/Lato-Regular.ttf                |   Bin 0 -> 656568 bytes
 docs/_static/fonts/Lato/lato-bold.eot              |   Bin 0 -> 256056 bytes
 docs/_static/fonts/Lato/lato-bold.ttf              |   Bin 0 -> 600856 bytes
 docs/_static/fonts/Lato/lato-bold.woff             |   Bin 0 -> 309728 bytes
 docs/_static/fonts/Lato/lato-bold.woff2            |   Bin 0 -> 184912 bytes
 docs/_static/fonts/Lato/lato-bolditalic.eot        |   Bin 0 -> 266158 bytes
 docs/_static/fonts/Lato/lato-bolditalic.ttf        |   Bin 0 -> 622572 bytes
 docs/_static/fonts/Lato/lato-bolditalic.woff       |   Bin 0 -> 323344 bytes
 docs/_static/fonts/Lato/lato-bolditalic.woff2      |   Bin 0 -> 193308 bytes
 docs/_static/fonts/Lato/lato-italic.eot            |   Bin 0 -> 268604 bytes
 docs/_static/fonts/Lato/lato-italic.ttf            |   Bin 0 -> 639388 bytes
 docs/_static/fonts/Lato/lato-italic.woff           |   Bin 0 -> 328412 bytes
 docs/_static/fonts/Lato/lato-italic.woff2          |   Bin 0 -> 195704 bytes
 docs/_static/fonts/Lato/lato-regular.eot           |   Bin 0 -> 253461 bytes
 docs/_static/fonts/Lato/lato-regular.ttf           |   Bin 0 -> 607720 bytes
 docs/_static/fonts/Lato/lato-regular.woff          |   Bin 0 -> 309192 bytes
 docs/_static/fonts/Lato/lato-regular.woff2         |   Bin 0 -> 182708 bytes
 docs/_static/fonts/RobotoSlab-Bold.ttf             |   Bin 0 -> 170616 bytes
 docs/_static/fonts/RobotoSlab-Regular.ttf          |   Bin 0 -> 169064 bytes
 .../fonts/RobotoSlab/roboto-slab-v7-bold.eot       |   Bin 0 -> 79520 bytes
 .../fonts/RobotoSlab/roboto-slab-v7-bold.ttf       |   Bin 0 -> 170616 bytes
 .../fonts/RobotoSlab/roboto-slab-v7-bold.woff      |   Bin 0 -> 87624 bytes
 .../fonts/RobotoSlab/roboto-slab-v7-bold.woff2     |   Bin 0 -> 67312 bytes
 .../fonts/RobotoSlab/roboto-slab-v7-regular.eot    |   Bin 0 -> 78331 bytes
 .../fonts/RobotoSlab/roboto-slab-v7-regular.ttf    |   Bin 0 -> 169064 bytes
 .../fonts/RobotoSlab/roboto-slab-v7-regular.woff   |   Bin 0 -> 86288 bytes
 .../fonts/RobotoSlab/roboto-slab-v7-regular.woff2  |   Bin 0 -> 66444 bytes
 docs/_static/fonts/fontawesome-webfont.eot         |   Bin 0 -> 165742 bytes
 docs/_static/fonts/fontawesome-webfont.svg         |  2671 +++++
 docs/_static/fonts/fontawesome-webfont.ttf         |   Bin 0 -> 165548 bytes
 docs/_static/fonts/fontawesome-webfont.woff        |   Bin 0 -> 98024 bytes
 docs/_static/fonts/fontawesome-webfont.woff2       |   Bin 0 -> 77160 bytes
 docs/_static/gallery.css                           |   192 +
 docs/_static/img/README                            |     2 +
 docs/_static/img/tvm-logo-small.png                |   Bin 0 -> 6683 bytes
 docs/_static/img/tvm-logo-square.png               |   Bin 0 -> 3453 bytes
 docs/_static/jquery-3.4.1.js                       | 10598 +++++++++++++++++++
 docs/_static/jquery.js                             |     2 +
 docs/_static/js/modernizr.min.js                   |     4 +
 docs/_static/js/theme.js                           |     3 +
 docs/_static/language_data.js                      |   297 +
 docs/_static/minus.png                             |   Bin 0 -> 90 bytes
 docs/_static/no_image.png                          |   Bin 0 -> 4315 bytes
 docs/_static/plus.png                              |   Bin 0 -> 90 bytes
 docs/_static/pygments.css                          |    65 +
 docs/_static/searchtools.js                        |   506 +
 docs/_static/tvm-logo-small.png                    |   Bin 0 -> 6683 bytes
 docs/_static/tvm-logo-square.png                   |   Bin 0 -> 3453 bytes
 docs/_static/underscore-1.3.1.js                   |   999 ++
 docs/_static/underscore.js                         |    31 +
 docs/api/python/autotvm.html                       |  2441 +++++
 docs/api/python/contrib.html                       |  1424 +++
 docs/api/python/driver.html                        |   358 +
 docs/api/python/error.html                         |   422 +
 docs/api/python/graph_runtime.html                 |   448 +
 docs/api/python/hybrid.html                        |   436 +
 docs/api/python/index.html                         |   338 +
 docs/api/python/ir.html                            |  1433 +++
 docs/api/python/ndarray.html                       |   417 +
 docs/api/python/relay/analysis.html                |   856 ++
 docs/api/python/relay/backend.html                 |   707 ++
 docs/api/python/relay/frontend.html                |   475 +
 docs/api/python/relay/image.html                   |   400 +
 docs/api/python/relay/index.html                   |  3918 +++++++
 docs/api/python/relay/nn.html                      |  2015 ++++
 docs/api/python/relay/op.html                      |  3230 ++++++
 docs/api/python/relay/testing.html                 |   285 +
 docs/api/python/relay/transform.html               |  1372 +++
 docs/api/python/relay/vision.html                  |   602 ++
 docs/api/python/rpc.html                           |   760 ++
 docs/api/python/runtime.html                       |  1491 +++
 docs/api/python/target.html                        |   824 ++
 docs/api/python/te.html                            |  2401 +++++
 docs/api/python/tir.html                           |  3022 ++++++
 docs/api/python/topi.html                          |  2588 +++++
 docs/api/python/vta/index.html                     |   403 +
 docs/api_links.html                                |   251 +
 docs/contribute/code_guide.html                    |   298 +
 docs/contribute/code_review.html                   |   348 +
 docs/contribute/committer_guide.html               |   342 +
 docs/contribute/community.html                     |   284 +
 docs/contribute/document.html                      |   351 +
 docs/contribute/error_handling.html                |   356 +
 docs/contribute/git_howto.html                     |   364 +
 docs/contribute/index.html                         |   348 +
 docs/contribute/pull_request.html                  |   361 +
 docs/contribute/release_process.html               |   395 +
 docs/deploy/android.html                           |   276 +
 docs/deploy/aocl_fpga.html                         |   357 +
 docs/deploy/aws_fpga.html                          |   415 +
 docs/deploy/cpp_deploy.html                        |   285 +
 docs/deploy/index.html                             |   318 +
 docs/deploy/integrate.html                         |   301 +
 docs/dev/benchmark.html                            |   451 +
 docs/dev/codebase_walkthrough.html                 |   433 +
 docs/dev/convert_layout.html                       |   468 +
 docs/dev/debugger.html                             |   417 +
 docs/dev/hybrid_script.html                        |   338 +
 docs/dev/index.html                                |   366 +
 docs/dev/inferbound.html                           |   882 ++
 docs/dev/introduction_to_module_serialization.html |   449 +
 docs/dev/relay_add_op.html                         |   492 +
 docs/dev/relay_add_pass.html                       |   606 ++
 docs/dev/relay_bring_your_own_codegen.html         |  1092 ++
 docs/dev/relay_intro.html                          |   428 +
 docs/dev/relay_op_strategy.html                    |   491 +
 docs/dev/relay_pass_infra.html                     |   827 ++
 docs/dev/runtime.html                              |   490 +
 docs/dev/security.html                             |   286 +
 docs/dev/virtual_machine.html                      |   604 ++
 docs/faq.html                                      |   296 +
 docs/frontend/tensorflow.html                      |   478 +
 docs/genindex.html                                 |  3294 ++++++
 docs/index.html                                    |   330 +
 docs/install/docker.html                           |   294 +
 docs/install/from_source.html                      |   470 +
 docs/install/index.html                            |   270 +
 docs/install/nnpack.html                           |   338 +
 .../javadoc/org/apache/tvm/class-use/Function.html |    12 +-
 docs/langref/hybrid_script.html                    |   433 +
 docs/langref/index.html                            |   337 +
 docs/langref/relay_adt.html                        |   714 ++
 docs/langref/relay_expr.html                       |   828 ++
 docs/langref/relay_op.html                         |   718 ++
 docs/langref/relay_type.html                       |   566 +
 docs/objects.inv                                   |   Bin 0 -> 13806 bytes
 docs/py-modindex.html                              |   553 +
 docs/search.html                                   |   249 +
 docs/searchindex.js                                |     1 +
 docs/tutorials/autotvm/sg_execution_times.html     |   243 +
 docs/tutorials/autotvm/tune_conv2d_cuda.html       |   531 +
 docs/tutorials/autotvm/tune_relay_arm.html         |   636 ++
 docs/tutorials/autotvm/tune_relay_cuda.html        |   613 ++
 docs/tutorials/autotvm/tune_relay_mobile_gpu.html  |   638 ++
 docs/tutorials/autotvm/tune_relay_x86.html         |   496 +
 docs/tutorials/autotvm/tune_simple_template.html   |   592 ++
 docs/tutorials/cross_compilation_and_rpc.html      |   510 +
 docs/tutorials/dev/low_level_custom_pass.html      |   452 +
 docs/tutorials/dev/relay_pass_infra.html           |   635 ++
 docs/tutorials/dev/sg_execution_times.html         |   239 +
 docs/tutorials/frontend/build_gcn.html             |   666 ++
 .../frontend/deploy_model_on_android.html          |   621 ++
 docs/tutorials/frontend/deploy_model_on_rasp.html  |   497 +
 docs/tutorials/frontend/deploy_quantized.html      |   439 +
 docs/tutorials/frontend/deploy_ssd_gluoncv.html    |   492 +
 docs/tutorials/frontend/from_caffe2.html           |   419 +
 docs/tutorials/frontend/from_coreml.html           |   416 +
 docs/tutorials/frontend/from_darknet.html          |   510 +
 docs/tutorials/frontend/from_keras.html            |   402 +
 docs/tutorials/frontend/from_mxnet.html            |   422 +
 docs/tutorials/frontend/from_onnx.html             |   402 +
 docs/tutorials/frontend/from_pytorch.html          |   454 +
 docs/tutorials/frontend/from_tensorflow.html       |   559 +
 docs/tutorials/frontend/from_tflite.html           |   483 +
 docs/tutorials/frontend/sg_execution_times.html    |   252 +
 docs/tutorials/frontend/using_external_lib.html    |   841 ++
 docs/tutorials/index.html                          |   566 +
 docs/tutorials/language/extern_op.html             |   396 +
 docs/tutorials/language/intrin_math.html           |   491 +
 docs/tutorials/language/reduction.html             |   584 +
 docs/tutorials/language/scan.html                  |   479 +
 docs/tutorials/language/schedule_primitives.html   |   622 ++
 docs/tutorials/language/sg_execution_times.html    |   245 +
 docs/tutorials/language/tedd.html                  |   402 +
 docs/tutorials/language/tensorize.html             |   611 ++
 docs/tutorials/language/tuple_inputs.html          |   429 +
 docs/tutorials/optimize/opt_conv_cuda.html         |   484 +
 docs/tutorials/optimize/opt_conv_tensorcore.html   |   778 ++
 docs/tutorials/optimize/opt_gemm.html              |   832 ++
 .../optimize/opt_matmul_auto_tensorcore.html       |   785 ++
 docs/tutorials/optimize/sg_execution_times.html    |   241 +
 docs/tutorials/relay_quick_start.html              |   577 +
 docs/tutorials/sg_execution_times.html             |   240 +
 docs/tutorials/tensor_expr_get_started.html        |   579 +
 docs/tutorials/topi/intro_topi.html                |   787 ++
 docs/tutorials/topi/sg_execution_times.html        |   238 +
 docs/vta/dev/config.html                           |   350 +
 docs/vta/dev/hardware.html                         |   526 +
 docs/vta/dev/index.html                            |   274 +
 docs/vta/index.html                                |   276 +
 docs/vta/install.html                              |   662 ++
 docs/vta/tutorials/autotvm/sg_execution_times.html |   238 +
 docs/vta/tutorials/autotvm/tune_relay_vta.html     |   821 ++
 .../tutorials/frontend/deploy_classification.html  |   581 +
 docs/vta/tutorials/frontend/deploy_detection.html  |   616 ++
 .../vta/tutorials/frontend/sg_execution_times.html |   239 +
 docs/vta/tutorials/index.html                      |   332 +
 docs/vta/tutorials/matrix_multiply.html            |   851 ++
 docs/vta/tutorials/optimize/convolution_opt.html   |  1027 ++
 .../tutorials/optimize/matrix_multiply_opt.html    |   860 ++
 .../vta/tutorials/optimize/sg_execution_times.html |   239 +
 docs/vta/tutorials/sg_execution_times.html         |   239 +
 docs/vta/tutorials/vta_get_started.html            |   712 ++
 494 files changed, 160966 insertions(+), 6 deletions(-)

diff --git a/docs/_downloads/00a1355fcb7c30e9e70fc8fefc708f98/tuple_inputs.ipynb b/docs/_downloads/00a1355fcb7c30e9e70fc8fefc708f98/tuple_inputs.ipynb
new file mode 100644
index 0000000..54dd8a5
--- /dev/null
+++ b/docs/_downloads/00a1355fcb7c30e9e70fc8fefc708f98/tuple_inputs.ipynb
@@ -0,0 +1,122 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\nCompute and Reduce with Tuple Inputs\n=======================================\n**Author**: `Ziheng Jiang <https://github.com/ZihengJiang>`_\n\nOften we want to compute multiple outputs with the same shape within\na single loop or perform reduction that involves multiple values like\n:code:`argmax`. These problems can be addressed by tuple inputs.\n\nIn this tutorial, we will introduce the usage of tuple inputs in TVM.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from __future__ import absolute_import, print_function\n\nimport tvm\nfrom tvm import te\nimport numpy as np"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Describe Batchwise Computation\n------------------------------\nFor operators which have the same shape, we can put them together as\nthe inputs of :any:`te.compute`, if we want them to be scheduled\ntogether in the next schedule procedure.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "n = te.var(\"n\")\nm = te.var(\"m\")\nA0 = te.placeholder((m, n), name='A0')\nA1 = te.placeholder((m, n), name='A1')\nB0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] + 2, A1[i, j] * 3), name='B')\n\n# The generated IR code would be:\ns = te.create_schedule(B0.op)\nprint(tvm.lower(s, [A0, A1, B0, B1], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\nDescribe Reduction with Collaborative Inputs\n--------------------------------------------\nSometimes, we require multiple inputs to express some reduction\noperators, and the inputs will collaborate together, e.g. :code:`argmax`.\nIn the reduction procedure, :code:`argmax` need to compare the value of\noperands, also need to keep the index of operand. It can be expressed\nwith :py:func:`te.comm_reducer` as below:\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# x and y are the operands of reduction, both of them is a tuple of index\n# and value.\ndef fcombine(x, y):\n    lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])\n    rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])\n    return lhs, rhs\n\n# our identity element also need to be a tuple, so `fidentity` accepts\n# two types as inputs.\ndef fidentity(t0, t1):\n    return tvm.tir.const(-1, t0), tvm.te.min_value(t1)\n\nargmax = te.comm_reducer(fcombine, fidentity, name='argmax')\n\n [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "<div class=\"alert alert-info\"><h4>Note</h4><p>For ones who are not familiar with reduction, please refer to\n  `general-reduction`.</p></div>\n\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Schedule Operation with Tuple Inputs\n------------------------------------\nIt is worth mentioning that although you will get multiple outputs\nwith one batch operation, but they can only be scheduled together\nin terms of operation.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "n = te.var(\"n\")\nm = te.var(\"m\")\nA0 = te.placeholder((m, n), name='A0')\nB0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] + 2, A0[i, j] * 3), name='B')\nA1 = te.placeholder((m, n), name='A1')\nC = te.compute((m, n), lambda i, j: A1[i, j] + B0[i, j], name='C')\n\ns = te.create_schedule(C.op)\ns[B0].compute_at(s[C], C.op.axis[0])\n# as you can see in the below generated IR code:\nprint(tvm.lower(s, [A0, A1, C], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Summary\n-------\nThis tutorial introduces the usage of tuple inputs operation.\n\n- Describe normal batchwise computation.\n- Describe reduction operation with tuple inputs.\n- Notice that you can only schedule computation in terms of operation instead of tensor.\n\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/02fc8627299fa0b05eb017773b471bfa/from_tflite.py b/docs/_downloads/02fc8627299fa0b05eb017773b471bfa/from_tflite.py
new file mode 100644
index 0000000..3273855
--- /dev/null
+++ b/docs/_downloads/02fc8627299fa0b05eb017773b471bfa/from_tflite.py
@@ -0,0 +1,199 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Compile TFLite Models
+=====================
+**Author**: `Zhao Wu <https://github.com/FrozenGene>`_
+
+This article is an introductory tutorial to deploy TFLite models with Relay.
+
+To get started, the Flatbuffers and TFLite packages need to be installed as prerequisites.
+A quick solution is to install Flatbuffers via pip:
+
+.. code-block:: bash
+
+    pip install flatbuffers --user
+
+
+To install the TFLite package, you could use our prebuilt wheel:
+
+.. code-block:: bash
+
+    # For python3:
+    wget https://github.com/FrozenGene/tflite/releases/download/v1.13.1/tflite-1.13.1-py3-none-any.whl
+    pip3 install -U tflite-1.13.1-py3-none-any.whl --user
+
+    # For python2:
+    wget https://github.com/FrozenGene/tflite/releases/download/v1.13.1/tflite-1.13.1-py2-none-any.whl
+    pip install -U tflite-1.13.1-py2-none-any.whl --user
+
+
+or you could generate the TFLite package yourself. The steps are as follows:
+
+.. code-block:: bash
+
+    # Get the flatc compiler.
+    # Please refer to https://github.com/google/flatbuffers for details
+    # and make sure it is properly installed.
+    flatc --version
+
+    # Get the TFLite schema.
+    wget https://raw.githubusercontent.com/tensorflow/tensorflow/r1.13/tensorflow/lite/schema/schema.fbs
+
+    # Generate TFLite package.
+    flatc --python schema.fbs
+
+    # Add current folder (which contains generated tflite module) to PYTHONPATH.
+    export PYTHONPATH=${PYTHONPATH:+$PYTHONPATH:}$(pwd)
+
+
+Now please check that the TFLite package is installed successfully: ``python -c "import tflite"``
+
+Below you can find an example of how to compile a TFLite model using TVM.
+"""
+######################################################################
+# Utils for downloading and extracting zip files
+# ----------------------------------------------
+import os
+
+def extract(path):
+    import tarfile
+    if path.endswith("tgz") or path.endswith("gz"):
+        dir_path = os.path.dirname(path)
+        tar = tarfile.open(path)
+        tar.extractall(path=dir_path)
+        tar.close()
+    else:
+        raise RuntimeError('Could not decompress the file: ' + path)
+
+
+######################################################################
+# Load pretrained TFLite model
+# ----------------------------
+# Load mobilenet V1 TFLite model provided by Google
+from tvm.contrib.download import download_testdata
+
+model_url = "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz"
+
+# Download model tar file and extract it to get mobilenet_v1_1.0_224.tflite
+model_path = download_testdata(model_url, "mobilenet_v1_1.0_224.tgz", module=['tf', 'official'])
+model_dir = os.path.dirname(model_path)
+extract(model_path)
+
+# Now we can open mobilenet_v1_1.0_224.tflite
+tflite_model_file = os.path.join(model_dir, "mobilenet_v1_1.0_224.tflite")
+tflite_model_buf = open(tflite_model_file, "rb").read()
+
+# Get TFLite model from buffer
+try:
+    import tflite
+    tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
+except AttributeError:
+    import tflite.Model
+    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
+
+######################################################################
+# Load a test image
+# -----------------
+# A single cat dominates the examples!
+from PIL import Image
+from matplotlib import pyplot as plt
+import numpy as np
+
+image_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
+image_path = download_testdata(image_url, 'cat.png', module='data')
+resized_image = Image.open(image_path).resize((224, 224))
+plt.imshow(resized_image)
+plt.show()
+image_data = np.asarray(resized_image).astype("float32")
+
+# Add a dimension to the image so that we have NHWC format layout
+image_data = np.expand_dims(image_data, axis=0)
+
+# Preprocess image as described here:
+# https://github.com/tensorflow/models/blob/edb6ed22a801665946c63d650ab9a0b23d98e1b1/research/slim/preprocessing/inception_preprocessing.py#L243
+image_data[:, :, :, 0] = 2.0 / 255.0 * image_data[:, :, :, 0] - 1
+image_data[:, :, :, 1] = 2.0 / 255.0 * image_data[:, :, :, 1] - 1
+image_data[:, :, :, 2] = 2.0 / 255.0 * image_data[:, :, :, 2] - 1
+print('input', image_data.shape)
+
+######################################################################
+# Compile the model with relay
+# ----------------------------
+
+# TFLite input tensor name, shape and type
+input_tensor = "input"
+input_shape = (1, 224, 224, 3)
+input_dtype = "float32"
+
+# Parse TFLite model and convert it to a Relay module
+from tvm import relay
+mod, params = relay.frontend.from_tflite(tflite_model,
+                                         shape_dict={input_tensor: input_shape},
+                                         dtype_dict={input_tensor: input_dtype})
+
+# Build the module for the x86 CPU target
+target = "llvm"
+with relay.build_config(opt_level=3):
+    graph, lib, params = relay.build(mod, target, params=params)
+
+######################################################################
+# Execute on TVM
+# --------------
+import tvm
+from tvm import te
+from tvm.contrib import graph_runtime as runtime
+
+# Create a runtime executor module
+module = runtime.create(graph, lib, tvm.cpu())
+
+# Feed input data
+module.set_input(input_tensor, tvm.nd.array(image_data))
+
+# Feed related params
+module.set_input(**params)
+
+# Run
+module.run()
+
+# Get output
+tvm_output = module.get_output(0).asnumpy()
+
+######################################################################
+# Display results
+# ---------------
+
+# Load label file
+label_file_url = ''.join(['https://raw.githubusercontent.com/',
+                          'tensorflow/tensorflow/master/tensorflow/lite/java/demo/',
+                          'app/src/main/assets/',
+                          'labels_mobilenet_quant_v1_224.txt'])
+label_file = "labels_mobilenet_quant_v1_224.txt"
+label_path = download_testdata(label_file_url, label_file, module='data')
+
+# List of 1001 classes
+with open(label_path) as f:
+    labels = f.readlines()
+
+# Convert result to 1D data
+predictions = np.squeeze(tvm_output)
+
+# Get top 1 prediction
+prediction = np.argmax(predictions)
+
+# Convert id to class name and show the result
+print("The image prediction result is: id " + str(prediction) + " name: " + labels[prediction])
diff --git a/docs/_downloads/0bb862dbb3a4c434477f93fe2c147fbb/tune_simple_template.py b/docs/_downloads/0bb862dbb3a4c434477f93fe2c147fbb/tune_simple_template.py
new file mode 100644
index 0000000..c5a3843
--- /dev/null
+++ b/docs/_downloads/0bb862dbb3a4c434477f93fe2c147fbb/tune_simple_template.py
@@ -0,0 +1,327 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Writing a Tunable Template and Using the Auto-tuner
+===================================================
+**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_
+
+This is an introductory tutorial to the auto-tuning module in TVM.
+
+There are two steps in auto-tuning.
+The first step is defining a search space.
+The second step is running a search algorithm to explore this space.
+In this tutorial, you can learn how to perform these two steps in TVM.
+The whole workflow is illustrated by a matrix multiplication example.
+"""
+
+######################################################################
+# Install dependencies
+# --------------------
+# To use the autotvm package in TVM, we need to install some extra dependencies.
+# This step (installing xgboost) can be skipped if you will not use the XGBoost tuner
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user psutil xgboost
+#
+# To make TVM run faster during tuning, it is recommended to use Cython
+# as the FFI of TVM. In the root directory of TVM, execute
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user cython
+#   sudo make cython3
+#
+# Now return to the Python code and import the packages.
+
+import logging
+import sys
+
+import numpy as np
+import tvm
+from tvm import te
+
+# the module is called `autotvm`
+from tvm import autotvm
+
+######################################################################
+# Step 1:  Define the search space
+# --------------------------------
+# In this section, we will rewrite deterministic TVM schedule code into a
+# tunable schedule template. You can regard the process of defining the search space
+# as parameterizing our existing schedule code.
+#
+# To begin with, here is how we implement a blocked matrix multiplication in TVM.
+
+# Matmul V0: Constant tiling factor
+def matmul_v0(N, L, M, dtype):
+    A = te.placeholder((N, L), name='A', dtype=dtype)
+    B = te.placeholder((L, M), name='B', dtype=dtype)
+
+    k = te.reduce_axis((0, L), name='k')
+    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C')
+    s = te.create_schedule(C.op)
+
+    # schedule
+    y, x = s[C].op.axis
+    k = s[C].op.reduce_axis[0]
+
+    yo, yi = s[C].split(y, 8)
+    xo, xi = s[C].split(x, 8)
+
+    s[C].reorder(yo, xo, k, yi, xi)
+
+    return s, [A, B, C]
+
+#####################################################################
+# Parametrize the schedule
+# ^^^^^^^^^^^^^^^^^^^^^^^^
+# In the previous schedule code, we use a constant "8" as the tiling factor.
+# However, it might not be the best one because the best tiling factor depends
+# on the real hardware environment and the input shape.
+#
+# If you want the schedule code to be portable across a wider range of input shapes
+# and target hardware, it is better to define a set of candidate values and
+# pick the best one according to the measurement results on target hardware.
+#
+# In autotvm, we can define a tunable parameter, or a "knob", for such a value.
+
+# Matmul V1: List candidate values
+@autotvm.template("tutorial/matmul_v1")  # 1. use a decorator
+def matmul_v1(N, L, M, dtype):
+    A = te.placeholder((N, L), name='A', dtype=dtype)
+    B = te.placeholder((L, M), name='B', dtype=dtype)
+
+    k = te.reduce_axis((0, L), name='k')
+    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C')
+    s = te.create_schedule(C.op)
+
+    # schedule
+    y, x = s[C].op.axis
+    k = s[C].op.reduce_axis[0]
+
+    # 2. get the config object
+    cfg = autotvm.get_config()
+
+    # 3. define search space
+    cfg.define_knob("tile_y", [1, 2, 4, 8, 16])
+    cfg.define_knob("tile_x", [1, 2, 4, 8, 16])
+
+    # 4. schedule according to config
+    yo, yi = s[C].split(y, cfg['tile_y'].val)
+    xo, xi = s[C].split(x, cfg['tile_x'].val)
+
+    s[C].reorder(yo, xo, k, yi, xi)
+
+    return s, [A, B, C]
+
+###############################################################################
+# Here we make four modifications to the previous schedule code and get
+# a tunable "template". We can explain the modifications one by one.
+#
+# 1. Use a decorator to mark this function as a simple template.
+# 2. Get a config object:
+#    You can regard this :code:`cfg` as an argument of this function, but
+#    we obtain it in a different way. With this argument, this function is no longer
+#    deterministic schedule code. Instead, we can pass different configurations to
+#    this function and get different schedules, so this function is a "template".
+#
+#    To make the template function more compact, we do two things in a single function.
+#    (1) define a search space and (2) schedule according to an entity in this space.
+#    To achieve this, we make :code:`cfg` be either
+#    a :any:`ConfigSpace` or a :any:`ConfigEntity` object.
+#
+#    When it is a :any:`ConfigSpace`, it will collect all tunable knobs in this function and
+#    build the search space.
+#    When it is a :any:`ConfigEntity`, it will ignore all space definition APIs
+#    (namely, :code:`cfg.define_XXXXX(...)`). Instead, it stores deterministic values for
+#    all tunable knobs, and we schedule according to these values.
+#
+#    During auto-tuning, we will first call this template with a :any:`ConfigSpace`
+#    object to build the search space. Then we call this template with different :any:`ConfigEntity`
+#    objects from the built space to get different schedules. Finally we will measure the code generated
+#    by the different schedules and pick the best one (a short sketch follows this list).
+#
+# 3. Define two tunable knobs. The first one is :code:`tile_y` with
+#    5 possible values. The second one is :code:`tile_x` with the same
+#    list of possible values. These two knobs are independent, so they
+#    span a search space of size 5x5 = 25.
+# 4. Schedule according to the deterministic values in :code:`cfg`.
+#
+
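+######################################################################
+# As a minimal sketch of this two-phase behaviour (``task_v1`` below is a
+# hypothetical name; the 512x512x512 shapes simply match the rest of this
+# tutorial), creating a task calls the template once with a :any:`ConfigSpace`
+# to collect the knobs, and concrete :any:`ConfigEntity` objects can then be
+# drawn from that space:
+#
+# .. code-block:: python
+#
+#   task_v1 = autotvm.task.create("tutorial/matmul_v1",
+#                                 args=(512, 512, 512, 'float32'), target='llvm')
+#   print(len(task_v1.config_space))    # 25 candidate configs (5 x 5 knob values)
+#   print(task_v1.config_space.get(0))  # one concrete ConfigEntity from the space
+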
+#####################################################################
+# Use better space definition API
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+# In the previous template, we manually list all possible values for a knob.
+# This is the lowest-level API to define the space.
+# However, we also provide another set of APIs to make the space definition
+# easier and smarter. It is recommended to use this set of high-level APIs.
+#
+# In the following example, we use :any:`ConfigSpace.define_split` to define a split
+# knob. It will enumerate all the possible ways to split an axis and construct
+# the space.
+#
+# We also have :any:`ConfigSpace.define_reorder` for reorder knobs and
+# :any:`ConfigSpace.define_annotate` for annotations like unrolling, vectorization,
+# and thread binding.
+# When the high-level API cannot meet your requirements, you can always fall
+# back to the low-level API.
+
+@autotvm.template("tutorial/matmul")
+def matmul(N, L, M, dtype):
+    A = te.placeholder((N, L), name='A', dtype=dtype)
+    B = te.placeholder((L, M), name='B', dtype=dtype)
+
+    k = te.reduce_axis((0, L), name='k')
+    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C')
+    s = te.create_schedule(C.op)
+
+    # schedule
+    y, x = s[C].op.axis
+    k = s[C].op.reduce_axis[0]
+
+    ##### define space begin #####
+    cfg = autotvm.get_config()
+    cfg.define_split("tile_y", y, num_outputs=2)
+    cfg.define_split("tile_x", x, num_outputs=2)
+    ##### define space end #####
+
+    # schedule according to config
+    yo, yi = cfg["tile_y"].apply(s, C, y)
+    xo, xi = cfg["tile_x"].apply(s, C, x)
+
+    s[C].reorder(yo, xo, k, yi, xi)
+
+    return s, [A, B, C]
+
+######################################################################
+# .. note:: More Explanation on :code:`cfg.define_split`
+#
+#  In this template, :code:`cfg.define_split("tile_y", y, num_outputs=2)` will enumerate
+#  all possible combinations that can split axis y into two axes with factors of the length of y.
+#  For example, if the length of y is 32 and we want to split it into two axes
+#  using factors of 32, then there are 6 possible values for the
+#  (length of outer axis, length of inner axis) pair, namely
+#  (32, 1), (16, 2), (8, 4), (4, 8), (2, 16) or (1, 32).
+#  They are just the 6 possible values of `tile_y`.
+#
+#  During scheduling, :code:`cfg["tile_y"]` is a :code:`SplitEntity` object.
+#  It stores the lengths of the outer and inner axes in :code:`cfg['tile_y'].size`
+#  (a tuple with two elements).
+#  In this template, we apply it by using :code:`yo, yi = cfg['tile_y'].apply(s, C, y)`.
+#  Actually, this is equivalent to
+#  :code:`yo, yi = s[C].split(y, cfg["tile_y"].size[1])`
+#  or :code:`yo, yi = s[C].split(y, nparts=cfg['tile_y'].size[0])`
+#
+#  The advantage of using the cfg.apply API is that it makes multi-level splits
+#  (when num_outputs >= 3) easier.
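+#
+#  As a minimal sketch (this three-level split is not used elsewhere in this
+#  tutorial), a multi-level split only changes ``num_outputs`` and the unpacking:
+#
+#  .. code-block:: python
+#
+#    cfg.define_split("tile_y", y, num_outputs=3)
+#    yo, ym, yi = cfg["tile_y"].apply(s, C, y)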
+
+######################################################################
+# Step 2:  Search through the space
+# ---------------------------------
+# In step 1, we build the search space by extending our old schedule code
+# into a template. The next step is to pick a tuner and explore in this space.
+#
+# Auto-tuners in TVM
+# ^^^^^^^^^^^^^^^^^^
+# The job of a tuner can be described by the following pseudo code:
+#
+#   .. code-block:: c
+#
+#    ct = 0
+#    while ct < max_number_of_trials:
+#        propose a batch of configs
+#        measure this batch of configs on real hardware and get results
+#        ct += batch_size
+#
+# When proposing the next batch of configs, the tuner can take different strategies. We
+# provide four tuners with different strategies in autotvm.
+#
+# * :any:`RandomTuner`: Enumerates the space in a random order
+# * :any:`GridSearchTuner`: Enumerates the space in grid search order
+# * :any:`GATuner`: Uses a genetic algorithm to search through the space
+# * :any:`XGBTuner`: Uses a model-based method. It trains an XGBoost model to predict the speed of the lowered IR and picks the next batch according to the prediction.
+#
+# You can choose the tuner according to the size of your space, your time budget, and other factors.
+# For example, if your space is very small (less than 1000), a grid search tuner or a
+# random tuner is good enough. If your space is at the level of 10^9 (this is the space
+# size of a conv2d operator on a CUDA GPU), :any:`XGBTuner` can explore it more efficiently
+# and find better configs.
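+#
+# As a minimal sketch (assuming a :code:`task` and :code:`measure_option`
+# like the ones created below), switching to the XGBoost-based tuner is a
+# one-line change:
+#
+# .. code-block:: python
+#
+#     tuner = autotvm.tuner.XGBTuner(task)
+#     tuner.tune(n_trial=20,
+#                measure_option=measure_option,
+#                callbacks=[autotvm.callback.log_to_file('matmul.log')])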
+
+################################################################
+# Begin tuning
+# ^^^^^^^^^^^^
+# Here we continue our matrix multiplication example.
+# First we should create a tuning task.
+# We can also inspect the initialized search space.
+# In this case, for a 512x512 square matrix multiplication, the space size
+# is 10x10=100.
+N, L, M = 512, 512, 512
+task = autotvm.task.create("tutorial/matmul", args=(N, L, M, 'float32'), target='llvm')
+print(task.config_space)
+
+################################################################
+# Then we need to define how to measure the generated code and pick a tuner.
+# Since our space is small, a random tuner is just okay.
+#
+# We only run 10 trials in this tutorial for demonstration. In practice,
+# you can run more trials according to your time budget.
+# We will log the tuning results into a log file. This file can be
+# used to get the best config later.
+
+# logging config (for printing tuning log to the screen)
+logging.getLogger('autotvm').setLevel(logging.DEBUG)
+logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))
+
+# There are two steps for measuring a config: build and run.
+# By default, we use all CPU cores to compile the programs, then measure them sequentially.
+# We measure each config 5 times and take the average to reduce variance.
+measure_option = autotvm.measure_option(
+    builder='local',
+    runner=autotvm.LocalRunner(number=5))
+
+# Begin tuning with RandomTuner, log records to file `matmul.log`
+# You can use alternatives like XGBTuner.
+tuner = autotvm.tuner.RandomTuner(task)
+tuner.tune(n_trial=10,
+           measure_option=measure_option,
+           callbacks=[autotvm.callback.log_to_file('matmul.log')])
+
+#########################################################################
+# Finally we apply the history best from the log file and check its correctness.
+# We can call the function :code:`matmul` directly under the
+# :any:`autotvm.apply_history_best` context. When we call this function,
+# it will query the dispatch context with its arguments and get the best config
+# for the same arguments.
+
+# apply history best from log file
+with autotvm.apply_history_best('matmul.log'):
+    with tvm.target.create("llvm"):
+        s, arg_bufs = matmul(N, L, M, 'float32')
+        func = tvm.build(s, arg_bufs)
+
+# check correctness
+a_np = np.random.uniform(size=(N, L)).astype(np.float32)
+b_np = np.random.uniform(size=(L, M)).astype(np.float32)
+c_np = a_np.dot(b_np)
+
+c_tvm = tvm.nd.empty(c_np.shape)
+func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)
+
+tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
diff --git a/docs/_downloads/0c30ce88b67b0e8d46494348ab36c9fb/from_tflite.ipynb b/docs/_downloads/0c30ce88b67b0e8d46494348ab36c9fb/from_tflite.ipynb
new file mode 100644
index 0000000..3364e3a
--- /dev/null
+++ b/docs/_downloads/0c30ce88b67b0e8d46494348ab36c9fb/from_tflite.ipynb
@@ -0,0 +1,151 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\nCompile TFLite Models\n=====================\n**Author**: `Zhao Wu <https://github.com/FrozenGene>`_\n\nThis article is an introductory tutorial to deploy TFLite models with Relay.\n\nTo get started, Flatbuffers and TFLite package needs to be installed as prerequisites.\nA quick solution is to install Flatbuffers via pip\n\n.. code-block:: bash\n\n    pip install flatbuffers --user\n\n\nTo install TFlite packages, you could use our prebuilt wheel:\n\n.. code-block:: bash\n\n   [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Utils for downloading and extracting zip files\n----------------------------------------------\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import os\n\ndef extract(path):\n    import tarfile\n    if path.endswith(\"tgz\") or path.endswith(\"gz\"):\n        dir_path = os.path.dirname(path)\n        tar = tarfile.open(path)\n        tar.extractall(path=dir_path)\n        tar.close()\n    else:\n        raise RuntimeError('Could not decompress the file: ' + path)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Load pretrained TFLite model\n----------------------------\nLoad mobilenet V1 TFLite model provided by Google\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from tvm.contrib.download import download_testdata\n\nmodel_url = \"http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz\"\n\n# Download model tar file and extract it to get mobilenet_v1_1.0_224.tflite\nmodel_path = download_testdata(model_url, \"mobilenet_v1_1.0_224.tgz\", module=['tf', 'official'])\nmodel_dir = os.path.dirname(model_path)\nextract(model_path)\n\n# Now we can open mobilenet_v1_1.0_224.tflite\ntflite_model_file = os.path.join(mo [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Load a test image\n-----------------\nA single cat dominates the examples!\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from PIL import Image\nfrom matplotlib import pyplot as plt\nimport numpy as np\n\nimage_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'\nimage_path = download_testdata(image_url, 'cat.png', module='data')\nresized_image = Image.open(image_path).resize((224, 224))\nplt.imshow(resized_image)\nplt.show()\nimage_data = np.asarray(resized_image).astype(\"float32\")\n\n# Add a dimension to the image so that we have NHWC format layout\nimage_data = np.expand [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Compile the model with relay\n----------------------------\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# TFLite input tensor name, shape and type\ninput_tensor = \"input\"\ninput_shape = (1, 224, 224, 3)\ninput_dtype = \"float32\"\n\n# Parse TFLite model and convert it to a Relay module\nfrom tvm import relay\nmod, params = relay.frontend.from_tflite(tflite_model,\n                                         shape_dict={input_tensor: input_shape},\n                                         dtype_dict={input_tensor: input_dtype})\n\n# Build the module against to x86 CPU\ntarget = \"ll [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Execute on TVM\n--------------\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import tvm\nfrom tvm import te\nfrom tvm.contrib import graph_runtime as runtime\n\n# Create a runtime executor module\nmodule = runtime.create(graph, lib, tvm.cpu())\n\n# Feed input data\nmodule.set_input(input_tensor, tvm.nd.array(image_data))\n\n# Feed related params\nmodule.set_input(**params)\n\n# Run\nmodule.run()\n\n# Get output\ntvm_output = module.get_output(0).asnumpy()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Display results\n---------------\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Load label file\nlabel_file_url = ''.join(['https://raw.githubusercontent.com/',\n                          'tensorflow/tensorflow/master/tensorflow/lite/java/demo/',\n                          'app/src/main/assets/',\n                          'labels_mobilenet_quant_v1_224.txt'])\nlabel_file = \"labels_mobilenet_quant_v1_224.txt\"\nlabel_path = download_testdata(label_file_url, label_file, module='data')\n\n# List of 1001 classes\nwith open(label_path) as f:\n    labels = f. [...]
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/0d95a85fc279fdff660608ef305b9107/tune_simple_template.ipynb b/docs/_downloads/0d95a85fc279fdff660608ef305b9107/tune_simple_template.ipynb
new file mode 100644
index 0000000..03aa884
--- /dev/null
+++ b/docs/_downloads/0d95a85fc279fdff660608ef305b9107/tune_simple_template.ipynb
@@ -0,0 +1,190 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\nWriting tunable template and Using auto-tuner\n=============================================\n**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_\n\nThis is an introduction tutorial to the auto-tuning module in TVM.\n\nThere are two steps in auto-tuning.\nThe first step is defining a search space.\nThe second step is running a search algorithm to explore through this space.\nIn this tutorial, you can learn how to perform these two steps in TVM.\nThe whole workflow is  [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Install dependencies\n--------------------\nTo use autotvm package in TVM, we need to install some extra dependencies.\nThis step (installing xgboost) can be skipped as it doesn't need XGBoost\n(change \"3\" to \"2\" if you use python2):\n\n.. code-block:: bash\n\n  pip3 install --user psutil xgboost\n\nTo make TVM run faster in tuning, it is recommended to use cython\nas FFI of TVM. In the root directory of TVM, execute\n(change \"3\" to \"2\" if you use python2):\n\n.. code-bl [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import logging\nimport sys\n\nimport numpy as np\nimport tvm\nfrom tvm import te\n\n# the module is called `autotvm`\nfrom tvm import autotvm"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Step 1:  Define the search space\n--------------------------------\nIn this section, we will rewrite a deterministic TVM schedule code to a\ntunable schedule template. You can regard the process of search space definition\nas the parameterization of our existing schedule code.\n\nTo begin with, here is how we implement a blocked matrix multiplication in TVM.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Matmul V0: Constant tiling factor\ndef matmul_v0(N, L, M, dtype):\n    A = te.placeholder((N, L), name='A', dtype=dtype)\n    B = te.placeholder((L, M), name='B', dtype=dtype)\n\n    k = te.reduce_axis((0, L), name='k')\n    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C')\n    s = te.create_schedule(C.op)\n\n    # schedule\n    y, x = s[C].op.axis\n    k = s[C].op.reduce_axis[0]\n\n    yo, yi = s[C].split(y, 8)\n    xo, xi = s[C].split(x, 8)\n\ [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Parametrize the schedule\n^^^^^^^^^^^^^^^^^^^^^^^^\nIn the previous schedule code, we use a constant \"8\" as tiling factor.\nHowever, it might not be the best one because the best tiling factor depends\non real hardware environment and input shape.\n\nIf you want the schedule code to be portable across a wider range of input shapes\nand target hardware, it is better to define a set of candidate values and\npick the best one according to the measurement results on target hardwar [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Matmul V1: List candidate values\n@autotvm.template(\"tutorial/matmul_v1\")  # 1. use a decorator\ndef matmul_v1(N, L, M, dtype):\n    A = te.placeholder((N, L), name='A', dtype=dtype)\n    B = te.placeholder((L, M), name='B', dtype=dtype)\n\n    k = te.reduce_axis((0, L), name='k')\n    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C')\n    s = te.create_schedule(C.op)\n\n    # schedule\n    y, x = s[C].op.axis\n    k = s[C].op.reduce_axis[0]\n\ [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Here we make four modifications to the previous schedule code and get\na tunable \"template\". We can explain the modifications one by one.\n\n1. Use a decorator to mark this function as a simple template.\n2. Get a config object:\n   You can regard this :code:`cfg` as an argument of this function but\n   we obtain it in a different way. With this argument, this function is no longer\n   a deterministic schedule code. Instead, we can pass different configurations to\n   this fun [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Use better space definition API\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nIn the previous template, we manually list all possible values for a knob.\nThis is the lowest level API to define the space.\nHowever, we also provide another set of API to make the space definition\neasier and smarter. It is recommended to use this set of high level API.\n\nIn the following example, we use :any:`ConfigSpace.define_split` to define a split\nknob. It will enumerate all the possible ways to split a [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "@autotvm.template(\"tutorial/matmul\")\ndef matmul(N, L, M, dtype):\n    A = te.placeholder((N, L), name='A', dtype=dtype)\n    B = te.placeholder((L, M), name='B', dtype=dtype)\n\n    k = te.reduce_axis((0, L), name='k')\n    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C')\n    s = te.create_schedule(C.op)\n\n    # schedule\n    y, x = s[C].op.axis\n    k = s[C].op.reduce_axis[0]\n\n    ##### define space begin #####\n    cfg = autotvm.get_confi [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "<div class=\"alert alert-info\"><h4>Note</h4><p>More Explanation on :code:`cfg.defile_split`</p></div>\n\n In this template, :code:`cfg.define_split(\"tile_y\", y, num_outputs=2)` will enumerate\n all possible combinations that can split axis y into two axes with factors of the length of y.\n For example, if the length of y is 32 and we want to split it into two axes\n using factors of 32, then there are 6 possible values for\n (length of outer axis, length of inner axis) pair,  [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Step 2:  Search through the space\n---------------------------------\nIn step 1, we build the search space by extending our old schedule code\ninto a template. The next step is to pick a tuner and explore in this space.\n\nAuto-tuners in TVM\n^^^^^^^^^^^^^^^^^^\nThe job for a tuner can be described by following pseudo code\n\n  .. code-block:: c\n\n   ct = 0\n   while ct < max_number_of_trials:\n       propose a batch of configs\n       measure this batch of configs on real hard [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Begin tuning\n^^^^^^^^^^^^\nHere we continue our matrix multiplication example.\nFirst we should create a tuning task.\nWe can also inspect the initialized search space.\nIn this case, for a 512x512 square matrix multiplication, the space size\nis 10x10=100\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "N, L, M = 512, 512, 512\ntask = autotvm.task.create(\"tutorial/matmul\", args=(N, L, M, 'float32'), target='llvm')\nprint(task.config_space)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Then we need to define how to measure the generated code and pick a tuner.\nSince our space is small, a random tuner is just okay.\n\nWe only make 10 trials in this tutorial for demonstration. In practice,\nyou can do more trials according to your time budget.\nWe will log the tuning results into a log file. This file can be\nused to get the best config later.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# logging config (for printing tuning log to the screen)\nlogging.getLogger('autotvm').setLevel(logging.DEBUG)\nlogging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))\n\n# There are two steps for measuring a config: build and run.\n# By default, we use all CPU cores to compile program. Then measure them sequentially.\n# We measure 5 times and take average to reduce variance.\nmeasure_option = autotvm.measure_option(\n    builder='local',\n    runner=autotvm.L [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Finally we apply history best from the cache file and check its correctness.\nWe can call the function :code:`matmul` directly under the\n:any:`autotvm.apply_history_best` context. When we call this function,\nit will query the dispatch context with its argument and get the best config\nwith the same argument.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# apply history best from log file\nwith autotvm.apply_history_best('matmul.log'):\n    with tvm.target.create(\"llvm\"):\n        s, arg_bufs = matmul(N, L, M, 'float32')\n        func = tvm.build(s, arg_bufs)\n\n# check correctness\na_np = np.random.uniform(size=(N, L)).astype(np.float32)\nb_np = np.random.uniform(size=(L, M)).astype(np.float32)\nc_np = a_np.dot(b_np)\n\nc_tvm = tvm.nd.empty(c_np.shape)\nfunc(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)\n\ntvm.testing.assert [...]
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/10e16681be542cc483fa89e9b4678a27/opt_matmul_auto_tensorcore.py b/docs/_downloads/10e16681be542cc483fa89e9b4678a27/opt_matmul_auto_tensorcore.py
new file mode 100644
index 0000000..aae1333
--- /dev/null
+++ b/docs/_downloads/10e16681be542cc483fa89e9b4678a27/opt_matmul_auto_tensorcore.py
@@ -0,0 +1,531 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+.. _opt-matmul-auto-tensorcore:
+
+How to optimize matmul with Auto TensorCore CodeGen
+===================================================
+**Author**: `Minmin Sun <https://github.com/minminsun>`_, \
+            `Lanbo Li <https://github.com/Orion34C>`_, \
+            `Chenfan Jia <https://github.com/jcf94>`_, \
+            `Jun Yang <https://github.com/yangjunpro>`_
+
+In this tutorial, we will demonstrate how to write a high performance matmul
+schedule on Volta/Turing GPUs with TVM Auto TensorCore CodeGen.
+This is a transparent solution that generates TensorCore kernels
+with most of the transformations done in IR passes.
+Users can also write schedules with tensorization to generate TensorCore code.
+Both solutions use the same TensorCore intrinsics.
+Please refer to :ref:`opt-conv-tensorcore` tutorial for more details.
+"""
+
+################################################################
+# Preparation and Algorithm
+# -------------------------
+# Two kinds of input data types are supported: float16 and int8.
+# For float16, the accumulator is float32.
+# For int8, the accumulator is int32.
+# For data layouts, 'N' means non-transposed while 'T' means transposed.
+
+import logging
+import sys
+
+import numpy as np
+import tvm
+from tvm import te
+
+from tvm import autotvm
+from tvm.contrib import nvcc
+
+def matmul_nn(A, B, L, dtype='float16', layout='NN'):
+    k = te.reduce_axis((0, L), name='k')
+    if dtype == 'float16':
+      out_type = 'float'
+    elif dtype == 'int8':
+      out_type = 'int'
+    elif dtype == 'int4' or dtype == 'int1':
+      out_type = 'int'
+    if (layout == 'NN'):
+      return te.compute((N, M), lambda i, j: te.sum(A[i, k].astype(out_type) * B[k, j].astype(out_type), axis=k))
+    if (layout == 'NT'):
+      return te.compute((N, M), lambda i, j: te.sum(A[k, i].astype(out_type) * B[k, j].astype(out_type), axis=k))
+    if (layout == 'TN'):
+      return te.compute((N, M), lambda i, j: te.sum(A[i, k].astype(out_type) * B[j, k].astype(out_type), axis=k))
+    if (layout == 'TT'):
+      return te.compute((N, M), lambda i, j: te.sum(A[k, i].astype(out_type) * B[j, k].astype(out_type), axis=k))
+
+###############################################################################
+# Scheduling the Computation
+# --------------------------
+# This schedule is no different than a non-tensorcore matmul schedule on GPU.
+# Please refer to :ref:`opt-gemm` tutorial for basics of optimizing matmul schedule.
+# When the "tensor_core" pragma is set, the "rewrite for tensorcore" ir pass
+# will automatically transform the schedule for tensorcore codegen,
+# otherwise normal CUDA code, with lower performance but equal functionality, will be generated.
+#
+# .. note::
+#
+#   *Requirements of TensorCore*
+#
+#   Note that in the following 2 cases, even though the "tensor_core" pragma is set, TVM will still fall back to normal CUDA codegen:
+#   (1) The m, n or k of the input matrices is not a multiple of 16;
+#   (2) The warp tile size is not 16x16x16 on CUDA 9, or not one of {16x16x16, 32x8x16, 8x32x16} on CUDA version >= 10.0.
+#
+# In this schedule, storage_align is used to reduce bank conflicts in shared memory. Please refer to this
+# `doc <https://docs.tvm.ai/api/python/schedule.html#tvm.te.schedule.Stage.storage_align>`_
+# for the usage of the storage_align primitive. In short, we need to add an offset to some shared memory buffers
+# to reduce bank conflicts.
+# According to the `wmma doc <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#wmma-description>`_,
+# the stride of load_matrix_sync must be a multiple of 16 bytes,
+# so we choose 8 as offset for float16 and 16 as offset for int8.
+#
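+# For reference, the relevant calls in the schedule below take the following
+# form (a minimal sketch; :code:`factor` and :code:`offset` are chosen per
+# data type as described above):
+#
+# .. code-block:: python
+#
+#     AA = s.cache_read(A, "shared", [C])
+#     # pad each row of the shared buffer by `offset` elements so that
+#     # accesses are staggered across shared memory banks
+#     s[AA].storage_align(AA.op.axis[0], factor, offset)
+#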
+# We use AutoTVM to search for best configurations in this schedule.
+
+@autotvm.template("tutorial/auto_tensorcore/test_gemm")
+def test_gemm(N, L, M, dtype, layout):
+    if (layout == "NN"):
+      shape_a = (N, L)
+      shape_b = (L, M)
+    elif (layout == "NT"):
+      shape_a = (L, N)
+      shape_b = (L, M)
+    elif (layout == "TN"):
+      shape_a = (N, L)
+      shape_b = (M, L)
+    elif (layout == "TT"):
+      shape_a = (L, N)
+      shape_b = (M, L)
+    else:
+      print ("Unsupported layout:", layout)
+      sys.exit(1);
+    A = te.placeholder(shape_a, name='A', dtype=dtype)
+    B = te.placeholder(shape_b, name='B', dtype=dtype)
+    C = matmul_nn(A, B, L, dtype, layout)
+
+    s = te.create_schedule(C.op)
+    y, x = s[C].op.axis
+    k = s[C].op.reduce_axis[0]
+
+    # storage_align params
+    factor = 16
+    offset = 8
+    if dtype == 'int8':
+      factor = 32
+      offset = 16
+    elif dtype == 'int4':
+      factor = 64
+      offset = 32
+    elif dtype == 'int1':
+      factor = 256
+      offset = 128
+
+    # create cache stages
+    AA = s.cache_read(A, "shared", [C])
+    if (layout == "NN" or layout == "TN"):
+      s[AA].storage_align(AA.op.axis[0], factor, offset)
+    AL = s.cache_read(AA, "local", [C])
+    BB = s.cache_read(B, "shared", [C])
+    if (layout == "TT" or layout == "NT"):
+      s[BB].storage_align(BB.op.axis[0], factor, offset)
+    BL = s.cache_read(BB, "local", [C])
+    CL = s.cache_write(C, "local")
+
+    # autotvm search space definition
+    cfg = autotvm.get_config()
+
+    cfg.define_knob("bx", [2, 4, 8])
+    cfg.define_knob("by", [8, 16, 32, 64])
+    cfg.define_knob("step_k", [1, 2, 4, 8, 16, 32])
+    cfg.define_knob("v", [4, 8, 16, 32])
+    by = cfg['by'].val
+    bx = cfg['bx'].val
+    step_k = cfg['step_k'].val
+    v = cfg['v'].val
+
+    # thread tile
+    TX = 8
+    TY = 1
+    if dtype == 'int4' or dtype == 'int1':
+      TX = 2
+    # warp tile
+    warp_tile_m = 16 # it could also be 8 or 32 on CUDA version >= 10.0
+    warp_tile_k = 16 # it must be 16 for fp16/int8 data type
+    if dtype == 'int4':
+      warp_tile_m = 8
+      warp_tile_k = 32
+    elif dtype == 'int1':
+      warp_tile_m = 8
+      warp_tile_k = 128
+    # block tile
+    tile_x = bx * TX
+    tile_y = by * TY
+
+    yo, ty = s[C].split(y, tile_y)
+    ty, yi = s[C].split(ty, TY)
+
+    # schedule for C stage
+    xo, xi = s[C].split(x, tile_x)
+    WX = min(warp_tile_m, tile_x)
+    tz, xi = s[C].split(xi, WX)
+    tx, xi = s[C].split(xi, TX)
+    s[C].reorder(yo, xo, tz, ty, tx, yi, xi)
+    s[C].bind(yo, te.thread_axis("blockIdx.y"))
+    s[C].bind(xo, te.thread_axis("blockIdx.x"))
+    s[C].bind(ty, te.thread_axis("threadIdx.y"))
+    s[C].bind(tz, te.thread_axis("threadIdx.z"))
+    s[C].bind(tx, te.thread_axis("threadIdx.x"))
+
+    # schedule for CL stage
+    ko, ki = s[CL].split(k, step_k * warp_tile_k)
+    kl, ki = s[CL].split(ki, warp_tile_k)
+    s[CL].compute_at(s[C], tx)
+    yo, xo = CL.op.axis
+    s[CL].reorder(ko, kl, ki, yo, xo)
+
+    # schedule for AA stage
+    s[AA].compute_at(s[CL], ko)
+    xo, xi = s[AA].split(s[AA].op.axis[1], factor=bx*v)
+    tz, tx = s[AA].split(xi, factor=(WX//TX)*v)
+    tx, vec = s[AA].split(tx, factor=v)
+    fused = s[AA].fuse(s[AA].op.axis[0], xo)
+    _, ty = s[AA].split(fused, factor=by)
+    s[AA].bind(ty, te.thread_axis("threadIdx.y"))
+    s[AA].bind(tz, te.thread_axis("threadIdx.z"))
+    s[AA].bind(tx, te.thread_axis("threadIdx.x"))
+    # vectorization is very important for float16/int8 inputs
+    s[AA].vectorize(vec)
+
+    # schedule for BB stage
+    s[BB].compute_at(s[CL], ko)
+    xo, xi = s[BB].split(s[BB].op.axis[1], factor=bx*v)
+    tz, tx = s[BB].split(xi, factor=(WX//TX)*v)
+    tx, vec = s[BB].split(tx, factor=v)
+    fused = s[BB].fuse(s[BB].op.axis[0], xo)
+    _, ty = s[BB].split(fused, factor=by)
+    s[BB].bind(ty, te.thread_axis("threadIdx.y"))
+    s[BB].bind(tz, te.thread_axis("threadIdx.z"))
+    s[BB].bind(tx, te.thread_axis("threadIdx.x"))
+    s[BB].vectorize(vec)
+
+    s[AL].compute_at(s[CL], kl)
+    s[BL].compute_at(s[CL], kl)
+
+    # set the 'tensor_core' pragma for tensorcore codegen
+    s[CL].pragma(ko, 'tensor_core')
+
+    return s, [A, B, C]
+
+###############################################################################
+# AutoTune and Test
+# -----------------
+# Finally we use a tuner to tune the schedule, generate code with best config
+# and run the kernel to compare with numpy to check whether the results are correct.
+
+# check whether the gpu has tensorcore
+if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
+  print("skip because cuda is not enabled..")
+  sys.exit(0)
+
+ctx = tvm.gpu()
+if not nvcc.have_tensorcore(ctx.compute_version):
+  print('the gpu has no tensorcore, skipping...')
+  sys.exit(0)
+
+M, N, L = 512, 32, 512
+dtype = 'float16'
+layout = 'NN'
+if len(sys.argv) >= 4:
+  M, N, L = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3])
+if len(sys.argv) >= 5:
+  dtype = sys.argv[4]
+if len(sys.argv) >= 6:
+  layout = sys.argv[5]
+
+# check whether the current gpu arch supports wmma codegen for the current dtype
+cuda_compute_capability = tvm.runtime._ffi_api.GetDeviceAttr(2, 0, 4)
+major, minor = nvcc.parse_compute_version(cuda_compute_capability)
+if dtype == 'int8':
+  assert(major == 7 and minor >= 2)
+elif dtype == 'int4' or dtype == 'int1':
+  # int4/int1 only support layout TN
+  assert(major == 7 and minor == 5 and layout == 'TN')
+
+def tune_and_evaluate(M, N, L, dtype, layout):
+  task = autotvm.task.create("tutorial/auto_tensorcore/test_gemm", args=(N, L, M, dtype, layout),
+                             target='cuda')
+  print(task.config_space)
+
+  logging.getLogger('autotvm').setLevel(logging.DEBUG)
+  logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))
+
+  measure_option = autotvm.measure_option(
+    builder='local',
+    runner=autotvm.LocalRunner(number=5))
+
+  tuner = autotvm.tuner.XGBTuner(task)
+  tuner.tune(n_trial=1000,
+             measure_option=measure_option,
+             callbacks=[autotvm.callback.log_to_file('matmul.log')])
+
+  dispatch_context = autotvm.apply_history_best("matmul.log")
+  best_config = dispatch_context.query(task.target, task.workload)
+  print("\nBest config:")
+  print(best_config)
+  with autotvm.apply_history_best('matmul.log'):
+    with tvm.target.create("cuda"):
+        with tvm.target.build_config():
+            s, arg_bufs = test_gemm(N, L, M, dtype, layout)
+            print(tvm.lower(s, arg_bufs, simple_mode=True))
+            func = tvm.build(s, arg_bufs)
+  dev_module = func.imported_modules[0]
+  print(dev_module.get_source())
+
+  # check correctness
+  if (layout == "NN"):
+    shape_a = (N, L)
+    shape_b = (L, M)
+  elif (layout == "NT"):
+    shape_a = (L, N)
+    shape_b = (L, M)
+  elif (layout == "TN"):
+    shape_a = (N, L)
+    shape_b = (M, L)
+  elif (layout == "TT"):
+    shape_a = (L, N)
+    shape_b = (M, L)
+
+  a_np = None
+  b_np = None
+  c_np = None
+  c_np_type = None
+  if dtype == 'float16':
+    c_np_type = np.float32
+    a_np = np.random.uniform(size=shape_a).astype(np.float16)
+    b_np = np.random.uniform(size=shape_b).astype(np.float16)
+    if (layout == "NN"):
+      c_np = np.dot(a_np, b_np)
+    elif (layout == "NT"):
+      c_np = np.dot(a_np.T, b_np)
+    elif (layout == "TN"):
+      c_np = np.dot(a_np, b_np.T)
+    elif (layout == "TT"):
+      c_np = np.dot(a_np.T, b_np.T)
+  elif dtype == 'int8':
+    c_np_type = np.int32
+    a_np = np.random.randint(low=-128, high=127, size=shape_a).astype(np.int8)
+    b_np = np.random.randint(low=-128, high=127, size=shape_b).astype(np.int8)
+    if (layout == "NN"):
+      c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32))
+    elif (layout == "NT"):
+      c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32))
+    elif (layout == "TN"):
+      c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32).T)
+    elif (layout == "TT"):
+      c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32).T)
+  elif dtype == 'int4':
+    c_np_type = np.int32
+    a_np_int = np.random.randint(low=-8, high=7, size=shape_a).astype(np.int32)
+    b_np_int = np.random.randint(low=-8, high=7, size=shape_b).astype(np.int32)
+    # "TN"
+    c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T)
+    a_np = np.zeros(shape=(N, int(L/8)), dtype = np.int32)
+    b_np = np.zeros(shape=(M, int(L/8)), dtype = np.int32)
+    # a_np --> col_major
+    for i in range(N):
+      for j in range(int(L/8)):
+        for k in range(8):
+          a_np[i, j] = a_np[i, j] | ((a_np_int[i, j * 8 + k] & 0xf) << ((7 - k) * 4))
+
+    # b_np --> row_major
+    for i in range(M):
+      for j in range(int(L/8)):
+        for k in range(8):
+          b_np[i, j] = b_np[i, j] | ((b_np_int[i, j * 8 + k] & 0xf) << ((7 - k) * 4))
+  elif dtype == 'int1':
+    c_np_type = np.int32
+    # np.random.randint's upper bound is exclusive; use high=2 to draw values in {0, 1}
+    a_np_int = np.random.randint(low=0, high=2, size=shape_a).astype(np.int32)
+    b_np_int = np.random.randint(low=0, high=2, size=shape_b).astype(np.int32)
+    # "TN"
+    c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T)
+    a_np = np.zeros(shape=(N, int(L/32)), dtype = np.int32)
+    b_np = np.zeros(shape=(M, int(L/32)), dtype = np.int32)
+    for i in range(N):
+      for j in range(int(L/32)):
+        for k in range(32):
+          a_np[i, j] = a_np[i, j] | ((a_np_int[i, j * 32 + k] & 0xf) << (31 - k))
+
+    for i in range(M):
+      for j in range(int(L/32)):
+        for k in range(32):
+          b_np[i, j] = b_np[i, j] | ((b_np_int[i, j * 32 + k] & 0xf) << (31 - k))
+
+  c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np_type), ctx=ctx)
+  a_tvm = tvm.nd.array(a_np, ctx=ctx)
+  b_tvm = tvm.nd.array(b_np, ctx=ctx)
+  func(a_tvm, b_tvm, c_tvm)
+
+  tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-3)
+
+  evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
+  print('Time cost of this operator: %f' % evaluator(a_tvm, b_tvm, c_tvm).mean)
+
+# We do not run the tuning on our webpage server since it takes some time.
+# Uncomment the following line to run it yourself.
+
+# tune_and_evaluate(M, N, L, dtype, layout)
+
+######################################################################
+# Sample Output
+# -------------
+# .. code-block:: bash
+#
+#    Best config:
+#    [('bx', 4), ('by', 32), ('step_k', 16), ('v', 8)],,None,40
+#    Finish loading 162 records
+#    produce compute {
+#      // attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = 1
+#      // attr [compute.local] storage_scope = "wmma.accumulator"
+#      allocate compute.local[float32 * 256]
+#      // attr [A.shared] storage_scope = "shared"
+#      allocate A.shared[float16 * 8448]
+#      // attr [B.shared] storage_scope = "shared"
+#      allocate B.shared[float16 * 8192]
+#      // attr [A.shared.local] storage_scope = "wmma.matrix_b"
+#      allocate A.shared.local[float16 * 256]
+#      // attr [B.shared.local] storage_scope = "wmma.matrix_a"
+#      allocate B.shared.local[float16 * 256]
+#      // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 16
+#      // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 2
+#      // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 32
+#      // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 2
+#      produce compute.local {
+#        for (j.c.init, 0, 1) {
+#          tvm_fill_fragment(compute.local, 16, 16, 16, 0, 0f)
+#        }
+#        // attr [iter_var(k.outer, )] pragma_tensor_core = 1
+#        for (k.outer, 0, 2) {
+#          produce A.shared {
+#            for (ax0.ax1.outer.fused.outer, 0, 8) {
+#              // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 32
+#              // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 2
+#              // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 2
+#              A.shared[ramp((((((ax0.ax1.outer.fused.outer*1056) + (floordiv(threadIdx.y, 8)*264)) + (floormod(threadIdx.y, 8)*32)) + (threadIdx.z*16)) + (threadIdx.x*8)), 1, 8)] = A[ramp(((((((ax0.ax1.outer.fused.outer*2048) + (floordiv(threadIdx.y, 8)*512)) + (k.outer*256)) + (floormod(threadIdx.y, 8)*32)) + (threadIdx.z*16)) + (threadIdx.x*8)), 1, 8)]
+#            }
+#          }
+#          produce B.shared {
+#            for (ax0.ax1.outer.fused.outer, 0, 8) {
+#              // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 32
+#              // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 2
+#              // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 2
+#              B.shared[ramp(((((ax0.ax1.outer.fused.outer*1024) + (threadIdx.y*32)) + (threadIdx.z*16)) + (threadIdx.x*8)), 1, 8)] = B[ramp(((((((k.outer*131072) + (ax0.ax1.outer.fused.outer*16384)) + (threadIdx.y*512)) + (blockIdx.x*32)) + (threadIdx.z*16)) + (threadIdx.x*8)), 1, 8)]
+#            }
+#          }
+#          for (k.inner.outer, 0, 16) {
+#            produce A.shared.local {
+#              for (ax1, 0, 1) {
+#                tvm_load_matrix_sync(A.shared.local, 16, 16, 16, 0, &(A.shared[(((threadIdx.y/16)*4224) + (k.inner.outer*16))]), 264, "col_major")
+#              }
+#            }
+#            produce B.shared.local {
+#              for (ax0, 0, 1) {
+#                for (ax1, 0, 1) {
+#                  tvm_load_matrix_sync(B.shared.local, 16, 16, 16, 0, &(B.shared[((k.inner.outer*512) + (threadIdx.z*16))]), 32, "col_major")
+#                }
+#              }
+#            }
+#            for (k.inner.inner, 0, 1) {
+#              for (j.c, 0, 1) {
+#                tvm_mma_sync(compute.local, 0, B.shared.local, 0, A.shared.local, 0, compute.local, 0)
+#              }
+#            }
+#          }
+#        }
+#      }
+#      for (j.inner.inner.inner, 0, 1) {
+#        tvm_store_matrix_sync(compute.local, 16, 16, 16, 0, &(compute[((((threadIdx.y/16)*8192) + (blockIdx.x*32)) + (threadIdx.z*16))]), 512, "col_major")
+#      }
+#    }
+#
+#    #include <cuda_fp16.h>
+#    __device__ half max(const half a, const half b)
+#    {
+#      return __hgt(__half(a), __half(b)) ? a : b;
+#    }
+#    __device__ half min(const half a, const half b)
+#    {
+#      return __hlt(__half(a), __half(b)) ? a : b;
+#    }
+#    __device__ half operator+(const volatile __half &a,  const volatile __half &b)
+#    {
+#      return __hadd(a, b);
+#    }
+#    __device__ half operator<=(const volatile __half &a,  const volatile __half &b)
+#    {
+#      return __hlt(a, b);
+#    }
+#    __device__ half operator*(const volatile __half &a,  const volatile __half &b)
+#    {
+#      return __hmul(a, b);
+#    }
+#    #include <mma.h>
+#    extern "C" __global__ void default_function_kernel0( half* __restrict__ A,  half* __restrict__ B,  float* __restrict__ compute) {
+#      nvcuda::wmma::fragment<nvcuda::wmma::accumulator, 16, 16, 16, float> compute_local[1];
+#      __shared__ half A_shared[8448];
+#      __shared__ half B_shared[8192];
+#      nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, 16, 16, 16, half, nvcuda::wmma::col_major> A_shared_local[1];
+#      nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, 16, 16, 16, half, nvcuda::wmma::col_major> B_shared_local[1];
+#      for (int j_c_init = 0; j_c_init < 1; ++j_c_init) {
+#        (void)nvcuda::wmma::fill_fragment(compute_local[0], 0.000000e+00f);
+#      }
+#      for (int k_outer = 0; k_outer < 2; ++k_outer) {
+#        __syncthreads();
+#        for (int ax0_ax1_outer_fused_outer = 0; ax0_ax1_outer_fused_outer < 8; ++ax0_ax1_outer_fused_outer) {
+#          ((__shared__ float4*)(A_shared + (((((ax0_ax1_outer_fused_outer * 1056) + ((((int)threadIdx.y) >> 3) * 264)) + ((((int)threadIdx.y) & 7) * 32)) + (((int)threadIdx.z) * 16)) + (((int)threadIdx.x) * 8))))[0] = (( float4*)(A + ((((((ax0_ax1_outer_fused_outer * 2048) + ((((int)threadIdx.y) >> 3) * 512)) + (k_outer * 256)) + ((((int)threadIdx.y) & 7) * 32)) + (((int)threadIdx.z) * 16)) + (((int)threadIdx.x) * 8))))[0];
+#        }
+#        for (int ax0_ax1_outer_fused_outer1 = 0; ax0_ax1_outer_fused_outer1 < 8; ++ax0_ax1_outer_fused_outer1) {
+#          ((__shared__ float4*)(B_shared + ((((ax0_ax1_outer_fused_outer1 * 1024) + (((int)threadIdx.y) * 32)) + (((int)threadIdx.z) * 16)) + (((int)threadIdx.x) * 8))))[0] = (( float4*)(B + ((((((k_outer * 131072) + (ax0_ax1_outer_fused_outer1 * 16384)) + (((int)threadIdx.y) * 512)) + (((int)blockIdx.x) * 32)) + (((int)threadIdx.z) * 16)) + (((int)threadIdx.x) * 8))))[0];
+#        }
+#        __syncthreads();
+#        for (int k_inner_outer = 0; k_inner_outer < 16; ++k_inner_outer) {
+#          for (int ax1 = 0; ax1 < 1; ++ax1) {
+#            (void)nvcuda::wmma::load_matrix_sync(A_shared_local[0], &(A_shared[(((((int)threadIdx.y) / 16) * 4224) + (k_inner_outer * 16))]), 264);
+#          }
+#          for (int ax0 = 0; ax0 < 1; ++ax0) {
+#            for (int ax11 = 0; ax11 < 1; ++ax11) {
+#              (void)nvcuda::wmma::load_matrix_sync(B_shared_local[0], &(B_shared[((k_inner_outer * 512) + (((int)threadIdx.z) * 16))]), 32);
+#            }
+#          }
+#          for (int k_inner_inner = 0; k_inner_inner < 1; ++k_inner_inner) {
+#            for (int j_c = 0; j_c < 1; ++j_c) {
+#              (void)nvcuda::wmma::mma_sync(compute_local[0], B_shared_local[0], A_shared_local[0], compute_local[0]);
+#            }
+#          }
+#        }
+#      }
+#      for (int j_inner_inner_inner = 0; j_inner_inner_inner < 1; ++j_inner_inner_inner) {
+#        (void)nvcuda::wmma::store_matrix_sync(&(compute[((((((int)threadIdx.y) / 16) * 8192) + (((int)blockIdx.x) * 32)) + (((int)threadIdx.z) * 16))]), compute_local[0], 512, nvcuda::wmma::mem_col_major);
+#      }
+#    }
+#
+#
+#    Time cost of this operator: 0.000008
+
+###############################################################################
+# Summary
+# -------
+# This tutorial demonstrates how to use TVM's Auto TensorCore CodeGen
+# to generate TensorCore kernels.
diff --git a/docs/_downloads/1195277fef6a622db64b78b4ea799ed4/matrix_multiply.py b/docs/_downloads/1195277fef6a622db64b78b4ea799ed4/matrix_multiply.py
new file mode 100644
index 0000000..227144e
--- /dev/null
+++ b/docs/_downloads/1195277fef6a622db64b78b4ea799ed4/matrix_multiply.py
@@ -0,0 +1,482 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+.. _basic-mat-mult:
+
+Simple Matrix Multiply
+======================
+**Author**: `Thierry Moreau <https://homes.cs.washington.edu/~moreau/>`_
+
+In this tutorial, we will build on top of the :ref:`vta-get-started` tutorial
+and introduce additional concepts required to implement matrix multiplication
+on VTA with the TVM workflow.
+"""
+
+######################################################################
+# RPC Setup
+# ---------
+# We start by programming the Pynq's FPGA and building its RPC runtime
+# as we did in the VTA introductory tutorial.
+
+from __future__ import absolute_import, print_function
+
+import os
+import tvm
+from tvm import te
+import vta
+import numpy as np
+from tvm import rpc
+from tvm.contrib import util
+from vta.testing import simulator
+
+# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
+env = vta.get_env()
+
+# We read the Pynq RPC host IP address and port number from the OS environment
+host = os.environ.get("VTA_RPC_HOST", "192.168.2.99")
+port = int(os.environ.get("VTA_RPC_PORT", "9091"))
+
+# We configure both the bitstream and the runtime system on the Pynq
+# to match the VTA configuration specified by the vta_config.json file.
+if env.TARGET == "pynq" or env.TARGET == "de10nano":
+
+    # Make sure that TVM was compiled with RPC=1
+    assert tvm.runtime.enabled("rpc")
+    remote = rpc.connect(host, port)
+
+    # Reconfigure the JIT runtime
+    vta.reconfig_runtime(remote)
+
+    # Program the FPGA with a pre-compiled VTA bitstream.
+    # You can program the FPGA with your own custom bitstream
+    # by passing the path to the bitstream file instead of None.
+    vta.program_fpga(remote, bitstream=None)
+
+# In simulation mode, host the RPC server locally.
+elif env.TARGET in ["sim", "tsim"]:
+    remote = rpc.LocalSession()
+
+######################################################################
+# Computation Declaration
+# -----------------------
+# In this example we describe a simple matrix multiplication, which
+# requires multiple computation stages, as shown in the dataflow diagram below.
+# First we describe the input tensors :code:`A` and :code:`B` that are living
+# in main memory.
+# Second, we need to declare intermediate tensors :code:`A_buf` and
+# :code:`B_buf`, which will live in VTA's on-chip buffers.
+# Having this extra computational stage allows us to explicitly
+# stage cached reads and writes.
+# Third, we describe the matrix multiplication computation over
+# :code:`A_buf` and :code:`B_buf` to produce the product matrix :code:`C_buf`.
+# The last operation is a cast and copy back to DRAM, into results tensor
+# :code:`C`.
+#
+# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/gemm_dataflow.png
+#      :align: center
+
+######################################################################
+# Data Layout
+# ~~~~~~~~~~~
+# We describe the placeholder tensors :code:`A`, and :code:`B` in a tiled data
+# format to match the data layout requirements imposed by the VTA tensor core.
+
+######################################################################
+# .. note::
+#
+#   **Data Tiling**
+#
+#   One source of complexity when targeting accelerators is to make sure
+#   that the data layout matches the layout imposed by the accelerator design.
+#   VTA is designed around a *tensor core* that performs one matrix-matrix
+#   operation per cycle between an activation matrix and a weight matrix,
+#   adding the result matrix to an accumulator matrix, as shown in the
+#   figure below.
+#
+#   .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/tensor_core.png
+#        :align: center
+#        :width: 480px
+#
+#   The dimensions of that matrix-matrix multiplication are specified in
+#   the :code:`vta_config.json` configuration file.
+#   The activation matrix has a :code:`(BATCH, BLOCK_IN)` shape
+#   and the transposed weight matrix has a :code:`(BLOCK_OUT, BLOCK_IN)` shape,
+#   thus inferring that the resulting output matrix has a
+#   :code:`(BATCH, BLOCK_OUT)` shape.
+#   Consequently, input and output tensors processed by VTA need to be
+#   tiled according to these aforementioned dimensions.
+#
+#   The diagram below shows the impact of data tiling on a matrix that is
+#   originally of shape (4, 8).
+#   Tiling by a (2, 2) tile shape ensures that data within each tile is
+#   contiguous.
+#   The resulting tiled tensor has a shape of (2, 4, 2, 2); the same
+#   re-indexing is shown as a short NumPy sketch right after this note.
+#
+#   .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/data_tiling.png
+#        :align: center
+#        :width: 480px
+#
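+# The re-indexing described in the note above can be written down with plain
+# NumPy (a short sketch for illustration only; it is not used by the rest of
+# this tutorial):
+#
+# .. code-block:: python
+#
+#     import numpy as np
+#
+#     # a (4, 8) matrix tiled by a (2, 2) tile shape becomes (2, 4, 2, 2):
+#     # element (i, j) ends up at (i // 2, j // 2, i % 2, j % 2)
+#     x = np.arange(32).reshape(4, 8)
+#     x_tiled = x.reshape(2, 2, 4, 2).transpose(0, 2, 1, 3)
+#     print(x_tiled.shape)  # (2, 4, 2, 2)
+#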
+# We first define the variables :code:`m`, :code:`n`, :code:`o` to represent
+# the shape of the matrix multiplication. These variables are multiplicative
+# factors over the :code:`BLOCK_OUT`, :code:`BLOCK_IN`, and :code:`BATCH`
+# tensor dimensions respectively.
+# By default, the configuration file sets :code:`BATCH`, :code:`BLOCK_IN`, and
+# :code:`BLOCK_OUT` to be 1, 16 and 16 respectively (:code:`BATCH` being set to
+# 1 implies that our compute building block is vector-matrix multiply).
+#
+
+######################################################################
+# .. note::
+#
+#   **Data Types**
+#
+#   It's important to not only match the inner-tile
+#   dimension of VTA's tensor core, but also to match the specific data types
+#   expected by VTA.
+#   VTA for now only supports fixed-point data types, whose integer width is
+#   specified in the :code:`vta_config.json` file by :code:`INP_WIDTH` and
+#   :code:`WGT_WIDTH` for the activations and weights data types respectively.
+#   In addition, the accumulator data type integer width is specified by
+#   :code:`ACC_WIDTH`.
+#
+# By default, the configuration file sets :code:`INP_WIDTH`
+# and :code:`WGT_WIDTH` to 8.
+# The accumulator width :code:`ACC_WIDTH` is set to 32, in order to avoid
+# overflow during accumulation.
+# As a result, :code:`env.inp_dtype` and :code:`env.wgt_dtype` are both
+# narrow 8-bit integers, while :code:`env.acc_dtype` is a standard 32-bit
+# integer.
+
+# Output channel factor m - total 16x16=256 output channels
+m = 16
+# Input channel factor n - total 16x16=256 input channels
+n = 16
+# Batch factor o (we use single batch inference)
+o = 1
+# A placeholder tensor in tiled data format
+A = te.placeholder((o, n, env.BATCH, env.BLOCK_IN), name="A", dtype=env.inp_dtype)
+# B placeholder tensor in tiled data format
+B = te.placeholder((m, n, env.BLOCK_OUT, env.BLOCK_IN), name="B", dtype=env.wgt_dtype)
+# A copy buffer
+A_buf = te.compute((o, n, env.BATCH, env.BLOCK_IN), lambda *i: A(*i), "A_buf")
+# B copy buffer
+B_buf = te.compute((m, n, env.BLOCK_OUT, env.BLOCK_IN), lambda *i: B(*i), "B_buf")
+
+######################################################################
+# Matrix Multiplication
+# ~~~~~~~~~~~~~~~~~~~~~
+# Now we're ready to describe the matrix multiplication result tensor :code:`C`,
+# with another compute operation.
+# The compute function takes the shape of the tensor, as well as a lambda
+# function that describes the computation rule for each position of the tensor.
+#
+# In order to implement matrix multiplication, the lambda function needs to
+# include a reduction formula over the input channel dimension axes.
+# To create a reduction formula, we can declare a reduction axis using
+# :code:`te.reduce_axis`, which takes in the range of reductions.
+# :code:`te.sum` takes in the expression to be reduced as well as
+# the reduction axes to compute the sum of values over all k in the declared
+# range.
+#
+# Note that the reduction needs to be performed over 32-bit :code:`env.acc_dtype`
+# accumulator data types.
+#
+# No computation happens during this phase, as we are only declaring how
+# the computation should be done.
+
+# Outer input feature reduction axis
+ko = te.reduce_axis((0, n), name="ko")
+# Inner input feature reduction axis
+ki = te.reduce_axis((0, env.BLOCK_IN), name="ki")
+# Describe the in-VTA matrix multiplication
+C_buf = te.compute(
+    (o, m, env.BATCH, env.BLOCK_OUT),
+    lambda bo, co, bi, ci:
+        te.sum(A_buf[bo, ko, bi, ki].astype(env.acc_dtype) *
+                B_buf[co, ko, ci, ki].astype(env.acc_dtype),
+                axis=[ko, ki]),
+    name="C_buf")
+
+######################################################################
+# Casting the Results
+# ~~~~~~~~~~~~~~~~~~~
+# After the computation is done, we'll need to send the results computed by VTA
+# back to main memory.
+
+######################################################################
+# .. note::
+#
+#   **Memory Store Restrictions**
+#
+#   One specificity of VTA is that it only supports DRAM stores in the narrow
+#   :code:`env.inp_dtype` data type format.
+#   This lets us reduce the data footprint for memory transfers, but also lets
+#   us quantize the wide accumulator data type down to a data format that
+#   matches the input activation data type.
+#   This means that in the context of neural network inference, the outputs
+#   of a given layer after activation can be consumed directly by the next
+#   layer.
+#
+# We perform one last typecast operation to the narrow
+# input activation data format.
+
+# Cast to output type, and send to main memory
+C = te.compute(
+    (o, m, env.BATCH, env.BLOCK_OUT),
+    lambda *i: C_buf(*i).astype(env.inp_dtype),
+    name="C")
+
+######################################################################
+# This concludes the computation declaration part of this tutorial.
+
+######################################################################
+# Scheduling the Computation
+# --------------------------
+# While the above lines describe the computation rule, we can obtain
+# :code:`C` in many ways.
+# TVM asks the user to provide an implementation of the computation called
+# a *schedule*.
+#
+# A schedule is a set of transformations to an original computation that
+# transforms the implementation of the computation without affecting
+# correctness.
+# This simple VTA programming tutorial aims to demonstrate basic schedule
+# transformations that will map the original schedule down to VTA hardware
+# primitives.
+
+
+######################################################################
+# Default Schedule
+# ~~~~~~~~~~~~~~~~
+# After we construct the schedule, by default the schedule computes
+# :code:`C` in the following way:
+
+# Let's take a look at the generated schedule
+s = te.create_schedule(C.op)
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+######################################################################
+# Although this schedule makes sense, it won't compile to VTA.
+# In order to obtain correct code generation, we need to apply scheduling
+# primitives and code annotation that will transform the schedule into
+# one that can be directly lowered onto VTA hardware intrinsics.
+# Those include:
+#
+#  - DMA copy operations which will take globally-scoped tensors and copy
+#    those into locally-scoped tensors.
+#  - Tensor operations that will perform the matrix multiplication.
+
+######################################################################
+# Buffer Scopes
+# ~~~~~~~~~~~~~
+# First, we set the scope of the buffers to tell TVM that these buffers
+# will be living in the VTA's on-chip SRAM caches.
+# Below, we tell TVM that :code:`A_buf`, :code:`B_buf`, :code:`C_buf`
+# will respectively live in VTA's on-chip input, weight and accumulator
+# memory.
+
+######################################################################
+# .. note::
+#
+#   **VTA's On-Chip SRAMs**
+#
+#   VTA has three different memory scopes, each corresponding to different
+#   on-chip SRAM buffers.
+#
+#    - :code:`env.inp_scope`: Input buffer, which is a read-only SRAM buffer
+#      that stores input matrices of shape :code:`(env.BATCH, env.BLOCK_IN)`
+#      of type :code:`env.inp_dtype`. The input buffer contains
+#      `2 ^ LOG_INP_BUFF_SIZE` matrix elements (as specified in the
+#      :code:`vta_config.json` file).
+#    - :code:`env.wgt_scope`: Weight buffer, which is a read-only SRAM buffer
+#      that stores weight matrices of shape :code:`(env.BLOCK_OUT, env.BLOCK_IN)`
+#      of type :code:`env.wgt_dtype`. The weight buffer contains
+#      `2 ^ LOG_WGT_BUFF_SIZE` matrix elements.
+#    - :code:`env.acc_scope`: Accumulator buffer, which is a read/write SRAM
+#      buffer that stores accumulator matrices of shape
+#      :code:`(env.BATCH, env.BLOCK_OUT)` of type :code:`env.acc_dtype`.
+#      The accumulator buffer is VTA's general purpose register file: it holds
+#      both intermediate results of convolutions and matrix multiplications
+#      as well as intermediate results of pooling, batch normalization, and
+#      activation layers. The accumulator buffer contains
+#      `2 ^ LOG_ACC_BUFF_SIZE` matrix elements.
+
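+######################################################################
+# As an optional sanity check (an addition to this walkthrough, using only
+# attributes already referenced above), we can print the tile shapes and
+# data types behind the three memory scopes described in the note.
+print("input  tile: (%d, %d) %s" % (env.BATCH, env.BLOCK_IN, env.inp_dtype))
+print("weight tile: (%d, %d) %s" % (env.BLOCK_OUT, env.BLOCK_IN, env.wgt_dtype))
+print("acc    tile: (%d, %d) %s" % (env.BATCH, env.BLOCK_OUT, env.acc_dtype))
+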
+# Set the intermediate tensor's scope to VTA's on-chip buffers
+s[A_buf].set_scope(env.inp_scope)
+s[B_buf].set_scope(env.wgt_scope)
+s[C_buf].set_scope(env.acc_scope)
+
+######################################################################
+# DMA Transfers
+# ~~~~~~~~~~~~~
+# We need to schedule DMA transfers to move data living in DRAM to
+# and from the VTA on-chip buffers.
+# This can be achieved using the :code:`compute_at` schedule primitive
+# which nests the copying of the buffers into the computation loop
+# that performs the matrix multiplication.
+#
+# We insert :code:`dma_copy` pragmas to indicate to the compiler
+# that the copy operations will be performed in bulk via DMA,
+# which is common in hardware accelerators.
+# Finally, we print the temporary schedule to observe the effects of
+# moving the copy operations into the matrix multiplication loop.
+
+# Move buffer copy into matrix multiply loop
+s[A_buf].compute_at(s[C_buf], ko)
+s[B_buf].compute_at(s[C_buf], ko)
+
+# Tag the buffer copies with the DMA pragma to insert a DMA transfer
+s[A_buf].pragma(s[A_buf].op.axis[0], env.dma_copy)
+s[B_buf].pragma(s[B_buf].op.axis[0], env.dma_copy)
+s[C].pragma(s[C].op.axis[0], env.dma_copy)
+
+# Let's take a look at the transformed schedule
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+######################################################################
+# Tensorization
+# ~~~~~~~~~~~~~
+# The last step of the schedule transformation consists in applying
+# *tensorization* to our schedule.
+# Tensorization is analogous to vectorization, but extends the concept
+# to a higher-dimensional unit of computation.
+# Consequently, tensorization imposes data layout constraints as discussed
+# earlier when declaring the data layout of the input placeholders.
+# We've already arranged our tensors in a tiled format, so the next thing
+# we need to do is loop reordering to accommodate tensorization.
+#
+# Here we choose to move the outermost reduction axis all the way out.
+# This dictates that we first iterate over input channels, then batch
+# dimensions, and finally output channels.
+# Lastly, we apply the tensorization scheduling primitive :code:`tensorize`
+# along the outer axis of the inner-most matrix-matrix multiplication tensor
+# block.
+# We print the finalized schedule that is ready for code-generation
+# by the VTA runtime JIT compiler.
+
+s[C_buf].reorder(
+    ko,
+    s[C_buf].op.axis[0],
+    s[C_buf].op.axis[1],
+    s[C_buf].op.axis[2],
+    s[C_buf].op.axis[3],
+    ki)
+s[C_buf].tensorize(s[C_buf].op.axis[2], env.gemm)
+
+# Let's take a look at the finalized schedule
+print(vta.lower(s, [A, B, C], simple_mode=True))
+
+######################################################################
+# This concludes the scheduling portion of this tutorial.
+
+######################################################################
+# TVM Compilation
+# ---------------
+# After we have finished specifying the schedule, we can compile it
+# into a TVM function.
+
+# Build GEMM VTA kernel
+my_gemm = vta.build(s, [A, B, C], "ext_dev", env.target_host, name="my_gemm")
+
+# Write the compiled module into an object file.
+temp = util.tempdir()
+my_gemm.save(temp.relpath("gemm.o"))
+
+# Send the executable over RPC
+remote.upload(temp.relpath("gemm.o"))
+
+# Load the compiled module
+f = remote.load_module("gemm.o")
+
+######################################################################
+# Running the Function
+# --------------------
+# The compiled TVM function exposes a concise C API and can be invoked from
+# any language.
+#
+# TVM provides an array API in python to aid quick testing and prototyping.
+# The array API is based on the `DLPack <https://github.com/dmlc/dlpack>`_ standard.
+#
+# - We first create a remote context (for remote execution on the Pynq).
+# - Then :code:`tvm.nd.array` formats the data accordingly.
+# - :code:`f()` runs the actual computation.
+# - :code:`asnumpy()` copies the result array back into a format that can be
+#   interpreted.
+#
+
+# Get the remote device context
+ctx = remote.ext_dev(0)
+
+# Initialize the A and B arrays randomly in the int range of [-128, 128)
+A_orig = np.random.randint(
+    -128, 128, size=(o * env.BATCH, n * env.BLOCK_IN)).astype(A.dtype)
+B_orig = np.random.randint(
+    -128, 128, size=(m * env.BLOCK_OUT, n * env.BLOCK_IN)).astype(B.dtype)
+
+# Apply packing to the A and B arrays from a 2D to a 4D packed layout
+A_packed = A_orig.reshape(
+    o, env.BATCH, n, env.BLOCK_IN).transpose((0, 2, 1, 3))
+B_packed = B_orig.reshape(
+    m, env.BLOCK_OUT, n, env.BLOCK_IN).transpose((0, 2, 1, 3))
+
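+# Optional sanity check (added for illustration, not required by the flow):
+# the packing above is reversible; transposing back and reshaping recovers
+# the original 2D arrays exactly.
+assert np.array_equal(
+    A_packed.transpose((0, 2, 1, 3)).reshape(o * env.BATCH, n * env.BLOCK_IN),
+    A_orig)
+assert np.array_equal(
+    B_packed.transpose((0, 2, 1, 3)).reshape(m * env.BLOCK_OUT, n * env.BLOCK_IN),
+    B_orig)
+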
+# Format the input/output arrays with tvm.nd.array to the DLPack standard
+A_nd = tvm.nd.array(A_packed, ctx)
+B_nd = tvm.nd.array(B_packed, ctx)
+C_nd = tvm.nd.array(np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(C.dtype), ctx)
+
+# Clear stats
+if env.TARGET in ["sim", "tsim"]:
+    simulator.clear_stats()
+
+# Invoke the module to perform the computation
+f(A_nd, B_nd, C_nd)
+
+######################################################################
+# Verifying Correctness
+# ---------------------
+# Compute the reference result with numpy and assert that the output of the
+# matrix multiplication is indeed correct.
+
+# Compute reference result with numpy
+C_ref = np.dot(A_orig.astype(env.acc_dtype),
+               B_orig.T.astype(env.acc_dtype)).astype(C.dtype)
+C_ref = C_ref.reshape(
+    o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))
+np.testing.assert_equal(C_ref, C_nd.asnumpy())
+
+# Print stats
+if env.TARGET in ["sim", "tsim"]:
+    sim_stats = simulator.stats()
+    print("Execution statistics:")
+    for k, v in sim_stats.items():
+        print("\t{:<16}: {:>16}".format(k, v))
+
+print("Successful matrix multiply test!")
+
+######################################################################
+# Summary
+# -------
+# This tutorial showcases the TVM workflow to implement a simple matrix
+# multiplication example on VTA.
+# The general workflow includes:
+#
+# - Programming the FPGA with the VTA bitstream over RPC.
+# - Describing matrix multiplication via a series of computations.
+# - Describing how we want to perform the computation using schedule primitives.
+# - Compiling the function to the VTA target.
+# - Running the compiled module and verifying it against a numpy implementation.
+#
diff --git a/docs/_downloads/143c743c62f58570eabd77fd3395ca8c/scan.py b/docs/_downloads/143c743c62f58570eabd77fd3395ca8c/scan.py
new file mode 100644
index 0000000..73790da
--- /dev/null
+++ b/docs/_downloads/143c743c62f58570eabd77fd3395ca8c/scan.py
@@ -0,0 +1,153 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Scan and Recurrent Kernel
+=========================
+**Author**: `Tianqi Chen <https://tqchen.github.io>`_
+
+This is introductory material on how to do recurrent computing in TVM.
+Recurrent computing is a typical pattern in neural networks.
+"""
+from __future__ import absolute_import, print_function
+
+import tvm
+from tvm import te
+import numpy as np
+
+######################################################################
+# TVM supports a scan operator to describe symbolic loops.
+# The following scan op computes cumsum over columns of X.
+#
+# The scan is carried over the highest dimension of the tensor.
+# :code:`s_state` is a placeholder that describes the transition state of the scan.
+# :code:`s_init` describes how we can initialize the first k timesteps.
+# Here, since s_init's first dimension is 1, it describes how we initialize
+# the state at the first timestep.
+#
+# :code:`s_update` describes how to update the value at timestep t. The update
+# value can refer back to the values of the previous timestep via the state placeholder.
+# Note that it is invalid to refer to :code:`s_state` at the current or a later timestep.
+#
+# The scan takes in the state placeholder, the initial value and the update description.
+# It is also recommended (although not necessary) to list the inputs to the scan cell.
+# The result of the scan is a tensor, giving the result of :code:`s_state` after the
+# update over the time domain.
+#
+m = te.var("m")
+n = te.var("n")
+X = te.placeholder((m, n), name="X")
+s_state = te.placeholder((m, n))
+s_init = te.compute((1, n), lambda _, i: X[0, i])
+s_update = te.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i])
+s_scan = tvm.te.scan(s_init, s_update, s_state, inputs=[X])
+
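+######################################################################
+# As a plain-numpy sketch (added for illustration), the recurrence encoded
+# above is simply state[0] = X[0] and state[t] = state[t-1] + X[t], which is
+# exactly what the np.cumsum check in the verification step below exercises.
+def scan_reference(x):
+    """Reference implementation of the cumsum recurrence described above."""
+    state = np.empty_like(x)
+    state[0] = x[0]
+    for t in range(1, x.shape[0]):
+        state[t] = state[t - 1] + x[t]
+    return state
+
+# For example, scan_reference(np.ones((3, 2), "float32")) yields rows 1, 2, 3.
+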
+######################################################################
+# Schedule the Scan Cell
+# ----------------------
+# We can schedule the body of the scan by scheduling the update and
+# init parts separately. Note that it is invalid to schedule the
+# first iteration dimension of the update part.
+# To split on the time iteration, users can schedule on scan_op.scan_axis instead.
+#
+s = te.create_schedule(s_scan.op)
+num_thread = 256
+block_x = te.thread_axis("blockIdx.x")
+thread_x = te.thread_axis("threadIdx.x")
+xo, xi = s[s_init].split(s_init.op.axis[1], factor=num_thread)
+s[s_init].bind(xo, block_x)
+s[s_init].bind(xi, thread_x)
+xo, xi = s[s_update].split(s_update.op.axis[1], factor=num_thread)
+s[s_update].bind(xo, block_x)
+s[s_update].bind(xi, thread_x)
+print(tvm.lower(s, [X, s_scan], simple_mode=True))
+
+######################################################################
+# Build and Verify
+# ----------------
+# We can build the scan kernel like other TVM kernels; here we use
+# numpy to verify the correctness of the result.
+#
+fscan = tvm.build(s, [X, s_scan], "cuda", name="myscan")
+ctx = tvm.gpu(0)
+n = 1024
+m = 10
+a_np = np.random.uniform(size=(m, n)).astype(s_scan.dtype)
+a = tvm.nd.array(a_np, ctx)
+b = tvm.nd.array(np.zeros((m, n), dtype=s_scan.dtype), ctx)
+fscan(a, b)
+tvm.testing.assert_allclose(b.asnumpy(), np.cumsum(a_np, axis=0))
+
+######################################################################
+# Multi-Stage Scan Cell
+# ---------------------
+# In the above example we described the scan cell using one Tensor
+# computation stage in s_update. It is possible to use multiple
+# Tensor stages in the scan cell.
+#
+# The following lines demonstrate a scan with two stage operations
+# in the scan cell.
+#
+m = te.var("m")
+n = te.var("n")
+X = te.placeholder((m, n), name="X")
+s_state = te.placeholder((m, n))
+s_init = te.compute((1, n), lambda _, i: X[0, i])
+s_update_s1 = te.compute((m, n), lambda t, i: s_state[t-1, i] * 2, name="s1")
+s_update_s2 = te.compute((m, n), lambda t, i: s_update_s1[t, i] + X[t, i], name="s2")
+s_scan = tvm.te.scan(s_init, s_update_s2, s_state, inputs=[X])
+
+######################################################################
+# These intermediate tensors can also be scheduled normally.
+# To ensure correctness, TVM creates a group constraint that forbids placing
+# the body of the scan (via compute_at) at locations outside the scan loop.
+#
+s = te.create_schedule(s_scan.op)
+xo, xi = s[s_update_s2].split(s_update_s2.op.axis[1], factor=32)
+s[s_update_s1].compute_at(s[s_update_s2], xo)
+print(tvm.lower(s, [X, s_scan], simple_mode=True))
+
+######################################################################
+# Multiple States
+# ---------------
+# For complicated applications like RNN, we might need more than one
+# recurrent state. Scan supports multiple recurrent states.
+# The following example demonstrates how we can build recurrence with two states.
+#
+m = te.var("m")
+n = te.var("n")
+l = te.var("l")
+X = te.placeholder((m, n), name="X")
+s_state1 = te.placeholder((m, n))
+s_state2 = te.placeholder((m, l))
+s_init1 = te.compute((1, n), lambda _, i: X[0, i])
+s_init2 = te.compute((1, l), lambda _, i: 0.0)
+s_update1 = te.compute((m, n), lambda t, i: s_state1[t-1, i] + X[t, i])
+s_update2 = te.compute((m, l), lambda t, i: s_state2[t-1, i] + s_state1[t-1, 0])
+s_scan1, s_scan2 = tvm.te.scan([s_init1, s_init2],
+                            [s_update1, s_update2],
+                            [s_state1, s_state2], inputs=[X])
+s = te.create_schedule(s_scan1.op)
+print(tvm.lower(s, [X, s_scan1, s_scan2], simple_mode=True))
+
+######################################################################
+# Summary
+# -------
+# This tutorial provides a walk-through of the scan primitive.
+#
+# - Describe scan with init and update.
+# - Schedule the scan cells as a normal schedule.
+# - For complicated workloads, use multiple states and steps in the scan cell.
diff --git a/docs/_downloads/157d346b9435735811846746471aebed/tensor_expr_get_started.ipynb b/docs/_downloads/157d346b9435735811846746471aebed/tensor_expr_get_started.ipynb
new file mode 100644
index 0000000..691532a
--- /dev/null
+++ b/docs/_downloads/157d346b9435735811846746471aebed/tensor_expr_get_started.ipynb
@@ -0,0 +1,287 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n\nGet Started with Tensor Expression\n==================================\n**Author**: `Tianqi Chen <https://tqchen.github.io>`_\n\nThis is an introductory tutorial to the Tensor expression language in TVM.\nTVM uses a domain specific tensor expression for efficient kernel construction.\n\nIn this tutorial, we will demonstrate the basic workflow to use\nthe tensor expression language.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from __future__ import absolute_import, print_function\n\nimport tvm\nfrom tvm import te\nimport numpy as np\n\n# Global declarations of environment.\n\ntgt_host=\"llvm\"\n# Change it to respective GPU if gpu is enabled Ex: cuda, opencl, rocm\ntgt=\"cuda\""
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Vector Add Example\n------------------\nIn this tutorial, we will use a vector addition example to demonstrate\nthe workflow.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Describe the Computation\n------------------------\nAs a first step, we need to describe our computation.\nTVM adopts tensor semantics, with each intermediate result\nrepresented as a multi-dimensional array. The user needs to describe\nthe computation rule that generates the tensors.\n\nWe first define a symbolic variable n to represent the shape.\nWe then define two placeholder Tensors, A and B, with given shape (n,)\n\nWe then describe the result tensor C, with a compute oper [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "n = te.var(\"n\")\nA = te.placeholder((n,), name='A')\nB = te.placeholder((n,), name='B')\nC = te.compute(A.shape, lambda i: A[i] + B[i], name=\"C\")\nprint(type(C))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Schedule the Computation\n------------------------\nWhile the above lines describe the computation rule, we can compute\nC in many ways since the axis of C can be computed in a data\nparallel manner.  TVM asks the user to provide a description of the\ncomputation called a schedule.\n\nA schedule is a set of transformation of computation that transforms\nthe loop of computations in the program.\n\nAfter we construct the schedule, by default the schedule computes\nC in a serial ma [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "s = te.create_schedule(C.op)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "We used the split construct to split the first axis of C,\nthis will split the original iteration axis into product of\ntwo iterations. This is equivalent to the following code.\n\n.. code-block:: c\n\n  for (int bx = 0; bx < ceil(n / 64); ++bx) {\n    for (int tx = 0; tx < 64; ++tx) {\n      int i = bx * 64 + tx;\n      if (i < n) {\n        C[i] = A[i] + B[i];\n      }\n    }\n  }\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "bx, tx = s[C].split(C.op.axis[0], factor=64)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Finally we bind the iteration axis bx and tx to threads in the GPU\ncompute grid. These are GPU specific constructs that allow us\nto generate code that runs on GPU.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "if tgt == \"cuda\" or tgt == \"rocm\" or tgt.startswith('opencl'):\n  s[C].bind(bx, te.thread_axis(\"blockIdx.x\"))\n  s[C].bind(tx, te.thread_axis(\"threadIdx.x\"))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Compilation\n-----------\nAfter we have finished specifying the schedule, we can compile it\ninto a TVM function. By default TVM compiles into a type-erased\nfunction that can be directly called from the python side.\n\nIn the following line, we use tvm.build to create a function.\nThe build function takes the schedule, the desired signature of the\nfunction (including the inputs and outputs) as well as target language\nwe want to compile to.\n\nThe result of compilation fadd is [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name=\"myadd\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Run the Function\n----------------\nThe compiled TVM function is exposes a concise C API\nthat can be invoked from any language.\n\nWe provide a minimal array API in python to aid quick testing and prototyping.\nThe array API is based on the `DLPack <https://github.com/dmlc/dlpack>`_ standard.\n\n- We first create a GPU context.\n- Then tvm.nd.array copies the data to the GPU.\n- fadd runs the actual computation.\n- asnumpy() copies the GPU array back to the CPU and we can use t [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "ctx = tvm.context(tgt, 0)\n\nn = 1024\na = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)\nb = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)\nc = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)\nfadd(a, b, c)\ntvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Inspect the Generated Code\n--------------------------\nYou can inspect the generated code in TVM. The result of tvm.build\nis a TVM Module. fadd is the host module that contains the host wrapper,\nit also contains a device module for the CUDA (GPU) function.\n\nThe following code fetches the device module and prints the content code.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "if tgt == \"cuda\" or tgt == \"rocm\" or tgt.startswith('opencl'):\n    dev_module = fadd.imported_modules[0]\n    print(\"-----GPU code-----\")\n    print(dev_module.get_source())\nelse:\n    print(fadd.get_source())"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "<div class=\"alert alert-info\"><h4>Note</h4><p>Code Specialization\n\n  As you may have noticed, the declarations of A, B and C all\n  take the same shape argument, n. TVM will take advantage of this\n  to pass only a single shape argument to the kernel, as you will find in\n  the printed device code. This is one form of specialization.\n\n  On the host side, TVM will automatically generate check code\n  that checks the constraints in the parameters. So if you pass\n  arrays wi [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Save Compiled Module\n--------------------\nBesides runtime compilation, we can save the compiled modules into\na file and load them back later. This is called ahead of time compilation.\n\nThe following code first performs the following steps:\n\n- It saves the compiled host module into an object file.\n- Then it saves the device module into a ptx file.\n- cc.create_shared calls a compiler (gcc) to create a shared library\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from tvm.contrib import cc\nfrom tvm.contrib import util\n\ntemp = util.tempdir()\nfadd.save(temp.relpath(\"myadd.o\"))\nif tgt == \"cuda\":\n    fadd.imported_modules[0].save(temp.relpath(\"myadd.ptx\"))\nif tgt == \"rocm\":\n    fadd.imported_modules[0].save(temp.relpath(\"myadd.hsaco\"))\nif tgt.startswith('opencl'):\n    fadd.imported_modules[0].save(temp.relpath(\"myadd.cl\"))\ncc.create_shared(temp.relpath(\"myadd.so\"), [temp.relpath(\"myadd.o\")])\nprint(temp.listdir())"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "<div class=\"alert alert-info\"><h4>Note</h4><p>Module Storage Format\n\n  The CPU (host) module is directly saved as a shared library (.so).\n  There can be multiple customized formats of the device code.\n  In our example, the device code is stored in ptx, as well as a meta\n  data json file. They can be loaded and linked separately via import.</p></div>\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Load Compiled Module\n--------------------\nWe can load the compiled module from the file system and run the code.\nThe following code loads the host and device module separately and\nre-links them together. We can verify that the newly loaded function works.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "fadd1 = tvm.runtime.load_module(temp.relpath(\"myadd.so\"))\nif tgt == \"cuda\":\n    fadd1_dev = tvm.runtime.load_module(temp.relpath(\"myadd.ptx\"))\n    fadd1.import_module(fadd1_dev)\n\nif tgt == \"rocm\":\n    fadd1_dev = tvm.runtime.load_module(temp.relpath(\"myadd.hsaco\"))\n    fadd1.import_module(fadd1_dev)\n\nif tgt.startswith('opencl'):\n    fadd1_dev = tvm.runtime.load_module(temp.relpath(\"myadd.cl\"))\n    fadd1.import_module(fadd1_dev)\n\nfadd1(a, b, c)\ntvm.testi [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Pack Everything into One Library\n--------------------------------\nIn the above example, we store the device and host code separately.\nTVM also supports export everything as one shared library.\nUnder the hood, we pack the device modules into binary blobs and link\nthem together with the host code.\nCurrently we support packing of Metal, OpenCL and CUDA modules.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "fadd.export_library(temp.relpath(\"myadd_pack.so\"))\nfadd2 = tvm.runtime.load_module(temp.relpath(\"myadd_pack.so\"))\nfadd2(a, b, c)\ntvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "<div class=\"alert alert-info\"><h4>Note</h4><p>Runtime API and Thread-Safety\n\n  The compiled modules of TVM do not depend on the TVM compiler.\n  Instead, they only depend on a minimum runtime library.\n  The TVM runtime library wraps the device drivers and provides\n  thread-safe and device agnostic calls into the compiled functions.\n\n  This means that you can call the compiled TVM functions from any thread,\n  on any GPUs.</p></div>\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Generate OpenCL Code\n--------------------\nTVM provides code generation features into multiple backends,\nwe can also generate OpenCL code or LLVM code that runs on CPU backends.\n\nThe following code blocks generate OpenCL code, creates array on an OpenCL\ndevice, and verifies the correctness of the code.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "if tgt.startswith('opencl'):\n    fadd_cl = tvm.build(s, [A, B, C], tgt, name=\"myadd\")\n    print(\"------opencl code------\")\n    print(fadd_cl.imported_modules[0].get_source())\n    ctx = tvm.cl(0)\n    n = 1024\n    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)\n    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)\n    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)\n    fadd_cl(a, b, c)\n    tvm.testing.assert_allclose(c.asnumpy(), a.a [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Summary\n-------\nThis tutorial provides a walk through of TVM workflow using\na vector add example. The general workflow is\n\n- Describe your computation via a series of operations.\n- Describe how we want to compute use schedule primitives.\n- Compile to the target function we want.\n- Optionally, save the function to be loaded later.\n\nYou are more than welcome to checkout other examples and\ntutorials to learn more about the supported operations, scheduling primitives\nand [...]
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/1d6c63023d70a80db938b62570b7281c/tutorials_jupyter.zip b/docs/_downloads/1d6c63023d70a80db938b62570b7281c/tutorials_jupyter.zip
new file mode 100644
index 0000000..ff3caa6
Binary files /dev/null and b/docs/_downloads/1d6c63023d70a80db938b62570b7281c/tutorials_jupyter.zip differ
diff --git a/docs/_downloads/2354a24ad8bc07194943c49f2fb48874/tune_conv2d_cuda.ipynb b/docs/_downloads/2354a24ad8bc07194943c49f2fb48874/tune_conv2d_cuda.ipynb
new file mode 100644
index 0000000..5ae5df3
--- /dev/null
+++ b/docs/_downloads/2354a24ad8bc07194943c49f2fb48874/tune_conv2d_cuda.ipynb
@@ -0,0 +1,115 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\nTuning High Performance Convolution on NVIDIA GPUs\n=========================================================================\n**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_\n\nThis is an advanced tutorial for writing high performance tunable template for\nNVIDIA GPU. By running auto-tuner on this template, we can outperform the\nvendor provided library CuDNN in many cases.\n\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Install dependencies\n--------------------\nTo use autotvm package in tvm, we need to install some extra dependencies.\n(change \"3\" to \"2\" if you use python2):\n\n.. code-block:: bash\n\n  pip3 install --user psutil xgboost tornado\n\nTo make TVM run faster in tuning, it is recommended to use cython\nas FFI of tvm. In the root directory of tvm, execute\n\n.. code-block:: bash\n\n  pip3 install --user cython\n  sudo make cython3\n\nNow return to python code. Import packages.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import logging\nimport sys\nimport numpy as np\n\nimport tvm\nfrom tvm import te\nimport topi\nfrom topi.testing import conv2d_nchw_python\n\nfrom tvm import autotvm"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Step 1:  Define the search space\n--------------------------------\nThere are plenty of useful schedule primitives in tvm. You can also find\nsome tutorials that describe them in more details, such as\n(1). `opt-conv-gpu`\n(2). `Optimizing DepthwiseConv on NVIDIA GPU <https://tvm.apache.org/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example>`_\n\nHowever, their implementations are manually tuned for some special input\nshapes. In this sectio [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "@autotvm.template(\"tutorial/conv2d_no_batching\")\ndef conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):\n    assert N == 1, \"Only consider batch_size = 1 in this template\"\n\n    data = te.placeholder((N, CI, H, W), name='data')\n    kernel = te.placeholder((CO, CI, KH, KW), name='kernel')\n    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype='float32')\n    s = te.create_schedule([conv.op])\n\n    ##### space definition begin ##### [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Step 2:  Search through the space\n---------------------------------\nWe pick the last layer on resnet as test case.\nSince our space is very large, :code:`XGBoostTuner` is most suitable\nfor our case. Here we only do 20 trials for demonstration.\nIn practice, making 1000 trials usually can find some good kernels\nfor this template\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# logging config (for printing tuning log to screen)\nlogging.getLogger('autotvm').setLevel(logging.DEBUG)\nlogging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))\n\n# the last layer in resnet\nN, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)\ntask = autotvm.task.create(\"tutorial/conv2d_no_batching\",\n                           args=(N, H, W, CO, CI, KH, KW, strides, padding),\n                           target='cuda')\npr [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Finally we can inspect the best config from log file, check correctness,\nand measure running time.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# inspect the best config\ndispatch_context = autotvm.apply_history_best(\"conv2d.log\")\nbest_config = dispatch_context.query(task.target, task.workload)\nprint(\"\\nBest config:\")\nprint(best_config)\n\n# apply history best from log file\nwith autotvm.apply_history_best('conv2d.log'):\n    with tvm.target.create(\"cuda\"):\n        s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)\n        func = tvm.build(s, arg_bufs)\n\n# check correctness\na_np =  [...]
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/272a5a893d007658546dc0eaf0a7aeed/tune_relay_cuda.py b/docs/_downloads/272a5a893d007658546dc0eaf0a7aeed/tune_relay_cuda.py
new file mode 100644
index 0000000..4195075
--- /dev/null
+++ b/docs/_downloads/272a5a893d007658546dc0eaf0a7aeed/tune_relay_cuda.py
@@ -0,0 +1,378 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Auto-tuning a convolutional network for NVIDIA GPU
+==================================================
+**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_, `Eddie Yan <https://github.com/eqy/>`_
+
+Auto-tuning for specific devices and workloads is critical for getting the
+best performance. This is a tutorial on how to tune a whole convolutional
+network for NVIDIA GPU.
+
+The operator implementation for NVIDIA GPU in TVM is written in template form.
+The template has many tunable knobs (tile factor, unrolling, etc).
+We will tune all convolution and depthwise convolution operators
+in the neural network. After tuning, we produce a log file which stores
+the best knob values for all required operators. When the TVM compiler compiles
+these operators, it will query this log file to get the best knob values.
+
+We also released pre-tuned parameters for some NVIDIA GPUs. You can go to
+`NVIDIA GPU Benchmark <https://github.com/apache/incubator-tvm/wiki/Benchmark#nvidia-gpu>`_
+to see the results.
+"""
+
+######################################################################
+# Install dependencies
+# --------------------
+# To use the autotvm package in tvm, we need to install some extra dependencies.
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user psutil xgboost tornado
+#
+# To make TVM run faster during tuning, it is recommended to use cython
+# as FFI of tvm. In the root directory of tvm, execute:
+#
+# .. code-block:: bash
+#
+#   pip3 install --user cython
+#   sudo make cython3
+#
+# Now return to python code. Import packages.
+
+import os
+
+import numpy as np
+
+import tvm
+from tvm import te
+from tvm import autotvm
+from tvm import relay
+import tvm.relay.testing
+from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
+from tvm.contrib.util import tempdir
+import tvm.contrib.graph_runtime as runtime
+
+#################################################################
+# Define Network
+# --------------
+# First we need to define the network in the relay frontend API.
+# We can load some pre-defined networks from :code:`tvm.relay.testing`.
+# We can also load models from MXNet, ONNX and TensorFlow.
+
+def get_network(name, batch_size):
+    """Get the symbol definition and random weight of a network"""
+    input_shape = (batch_size, 3, 224, 224)
+    output_shape = (batch_size, 1000)
+
+    if "resnet" in name:
+        n_layer = int(name.split('-')[1])
+        mod, params = relay.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
+    elif "vgg" in name:
+        n_layer = int(name.split('-')[1])
+        mod, params = relay.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
+    elif name == 'mobilenet':
+        mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype)
+    elif name == 'squeezenet_v1.1':
+        mod, params = relay.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1', dtype=dtype)
+    elif name == 'inception_v3':
+        input_shape = (1, 3, 299, 299)
+        mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
+    elif name == 'mxnet':
+        # an example for mxnet model
+        from mxnet.gluon.model_zoo.vision import get_model
+        block = get_model('resnet18_v1', pretrained=True)
+        mod, params = relay.frontend.from_mxnet(block, shape={'data': input_shape}, dtype=dtype)
+        net = mod["main"]
+        net = relay.Function(net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs)
+        mod = tvm.IRModule.from_expr(net)
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    return mod, params, input_shape, output_shape
+
+###########################################
+# Set Tuning Options
+# ------------------
+# Before tuning, we apply some configurations.
+
+#### DEVICE CONFIG ####
+target = tvm.target.cuda()
+
+#### TUNING OPTION ####
+network = 'resnet-18'
+log_file = "%s.log" % network
+dtype = 'float32'
+
+tuning_option = {
+    'log_filename': log_file,
+
+    'tuner': 'xgb',
+    'n_trial': 2000,
+    'early_stopping': 600,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(timeout=10),
+        #runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150),
+        runner=autotvm.RPCRunner(
+            '1080ti',  # change the device key to your key
+            '0.0.0.0', 9190,
+            number=20, repeat=3, timeout=4, min_repeat_ms=150)
+    ),
+}
+
+####################################################################
+#
+# .. note:: How to set tuning options
+#
+#   In general, the default values provided here work well.
+#
+#   If you have a large time budget, you can set :code:`n_trial` and :code:`early_stopping` larger,
+#   which makes the tuning run longer.
+#
+#   If you have multiple devices, you can use all of them for measurement to
+#   accelerate the tuning process (see the `Scale up measurement` section below).
+#
+
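+######################################################################
+# As a hedged, illustrative example (the values below are only for a quick
+# smoke test, not the settings used for the results reported later), you can
+# shrink the search budget before committing to a full run:
+
+quick_tuning_option = dict(tuning_option, n_trial=20, early_stopping=None)
+# This dict has the same keys as `tuning_option` and could be passed to
+# `tune_and_evaluate` below in place of the full configuration.
+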
+###################################################################
+# Begin Tuning
+# ------------
+# Now we can extract tuning tasks from the network and begin tuning.
+# Here, we provide a simple utility function to tune a list of tasks.
+# This function is just an initial implementation which tunes them in sequential order.
+# We will introduce a more sophisticated tuning scheduler in the future.
+
+# You can skip the implementation of this function for this tutorial.
+def tune_tasks(tasks,
+               measure_option,
+               tuner='xgb',
+               n_trial=1000,
+               early_stopping=None,
+               log_filename='tuning.log',
+               use_transfer_learning=True):
+    # create tmp log file
+    tmp_log_file = log_filename + ".tmp"
+    if os.path.exists(tmp_log_file):
+        os.remove(tmp_log_file)
+
+    for i, tsk in enumerate(reversed(tasks)):
+        prefix = "[Task %2d/%2d] " %(i+1, len(tasks))
+
+        # create tuner
+        if tuner == 'xgb' or tuner == 'xgb-rank':
+            tuner_obj = XGBTuner(tsk, loss_type='rank')
+        elif tuner == 'ga':
+            tuner_obj = GATuner(tsk, pop_size=100)
+        elif tuner == 'random':
+            tuner_obj = RandomTuner(tsk)
+        elif tuner == 'gridsearch':
+            tuner_obj = GridSearchTuner(tsk)
+        else:
+            raise ValueError("Invalid tuner: " + tuner)
+
+        if use_transfer_learning:
+            if os.path.isfile(tmp_log_file):
+                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))
+
+        # do tuning
+        tsk_trial = min(n_trial, len(tsk.config_space))
+        tuner_obj.tune(n_trial=tsk_trial,
+                       early_stopping=early_stopping,
+                       measure_option=measure_option,
+                       callbacks=[
+                           autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
+                           autotvm.callback.log_to_file(tmp_log_file)
+                       ])
+
+    # pick best records to a cache file
+    autotvm.record.pick_best(tmp_log_file, log_filename)
+    os.remove(tmp_log_file)
+
+
+########################################################################
+# Finally, we launch tuning jobs and evaluate the end-to-end performance.
+
+def tune_and_evaluate(tuning_opt):
+    # extract workloads from relay program
+    print("Extract tasks...")
+    mod, params, input_shape, out_shape = get_network(network, batch_size=1)
+    tasks = autotvm.task.extract_from_program(mod["main"], target=target,
+                                              params=params,
+                                              ops=(relay.op.get("nn.conv2d"),))
+
+    # run tuning tasks
+    print("Tuning...")
+    tune_tasks(tasks, **tuning_opt)
+
+    # compile kernels with history best records
+    with autotvm.apply_history_best(log_file):
+        print("Compile...")
+        with relay.build_config(opt_level=3):
+            graph, lib, params = relay.build_module.build(
+                mod, target=target, params=params)
+
+        # export library
+        tmp = tempdir()
+        filename = "net.tar"
+        lib.export_library(tmp.relpath(filename))
+
+        # load parameters
+        ctx = tvm.context(str(target), 0)
+        module = runtime.create(graph, lib, ctx)
+        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+        module.set_input('data', data_tvm)
+        module.set_input(**params)
+
+        # evaluate
+        print("Evaluate inference time cost...")
+        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600)
+        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
+        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
+              (np.mean(prof_res), np.std(prof_res)))
+
+# We do not run the tuning in our webpage server since it takes too long.
+# Uncomment the following line to run it by yourself.
+
+# tune_and_evaluate(tuning_option)
+
+######################################################################
+# Sample Output
+# -------------
+# The tuning needs to compile many programs and extract features from them,
+# so a high-performance CPU is recommended. One sample output is listed below.
+# It takes about 4 hours to get the following output on a 32T AMD Ryzen Threadripper.
+# The tuning target is NVIDIA 1080 Ti.
+# (You can see some errors during compilation. If the tuning is not stuck, it is okay.)
+#
+# .. code-block:: bash
+#
+#    Extract tasks...
+#    Tuning...
+#    [Task  1/12]  Current/Best:  541.83/3570.66 GFLOPS | Progress: (960/2000) | 1001.31 s Done.
+#    [Task  2/12]  Current/Best:    0.56/ 803.33 GFLOPS | Progress: (704/2000) | 608.08 s Done.
+#    [Task  3/12]  Current/Best:  103.69/1141.25 GFLOPS | Progress: (768/2000) | 702.13 s Done.
+#    [Task  4/12]  Current/Best: 2905.03/3925.15 GFLOPS | Progress: (864/2000) | 745.94 sterminate called without an active exception
+#    [Task  4/12]  Current/Best: 2789.36/3925.15 GFLOPS | Progress: (1056/2000) | 929.40 s Done.
+#    [Task  5/12]  Current/Best:   89.06/1076.24 GFLOPS | Progress: (704/2000) | 601.73 s Done.
+#    [Task  6/12]  Current/Best:   40.39/2129.02 GFLOPS | Progress: (1088/2000) | 1125.76 s Done.
+#    [Task  7/12]  Current/Best: 4090.53/5007.02 GFLOPS | Progress: (800/2000) | 903.90 s Done.
+#    [Task  8/12]  Current/Best:    4.78/1272.28 GFLOPS | Progress: (768/2000) | 749.14 s Done.
+#    [Task  9/12]  Current/Best: 1391.45/2325.08 GFLOPS | Progress: (992/2000) | 1084.87 s Done.
+#    [Task 10/12]  Current/Best: 1995.44/2383.59 GFLOPS | Progress: (864/2000) | 862.60 s Done.
+#    [Task 11/12]  Current/Best: 4093.94/4899.80 GFLOPS | Progress: (224/2000) | 240.92 sterminate called without an active exception
+#    [Task 11/12]  Current/Best: 3487.98/4909.91 GFLOPS | Progress: (480/2000) | 534.96 sterminate called without an active exception
+#    [Task 11/12]  Current/Best: 4636.84/4912.17 GFLOPS | Progress: (1184/2000) | 1381.16 sterminate called without an active exception
+#    [Task 11/12]  Current/Best:   50.12/4912.17 GFLOPS | Progress: (1344/2000) | 1602.81 s Done.
+#    [Task 12/12]  Current/Best: 3581.31/4286.30 GFLOPS | Progress: (736/2000) | 943.52 s Done.
+#    Compile...
+#    Evaluate inference time cost...
+#    Mean inference time (std dev): 1.07 ms (0.05 ms)
+#
+# As a reference baseline, the time cost of MXNet + TensorRT on resnet-18 is 1.30ms. So we are a little faster.
+
+######################################################################
+#
+# .. note:: **Experiencing Difficulties?**
+#
+#   The auto tuning module is error-prone. If you always see " 0.00/ 0.00 GFLOPS",
+#   then there must be something wrong.
+#
+#   First, make sure you set the correct configuration of your device.
+#   Then, you can print debug information by adding these lines in the beginning
+#   of the script. It will print every measurement result, where you can find useful
+#   error messages.
+#
+#   .. code-block:: python
+#
+#      import logging
+#      logging.getLogger('autotvm').setLevel(logging.DEBUG)
+#
+#   Finally, always feel free to ask our community for help on https://discuss.tvm.ai
+
+
+#################################################################
+# Scale up measurement by using multiple devices
+# ----------------------------------------------
+#
+# If you have multiple devices, you can use all of them for measurement.
+# TVM uses the RPC Tracker to manage distributed devices.
+# The RPC Tracker is a centralized master node. We can register all devices to
+# the tracker. For example, if we have 10 GPU cards, we can register all of them
+# to the tracker, and run 10 measurements in parallel, accelerating the tuning process.
+#
+# To start an RPC tracker, run this command on the host machine. The tracker is
+# required during the whole tuning process, so we need to open a new terminal for
+# this command:
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
+#
+# The expected output is
+#
+# .. code-block:: bash
+#
+#   INFO:RPCTracker:bind to 0.0.0.0:9190
+#
+# Then open another new terminal for the RPC server. We need to start one server
+# for each dedicated device. We use a string key to distinguish the types of devices.
+# You can pick a name you like.
+# (Note: for the rocm backend, there are some internal errors with the compiler,
+# so we need to add `--no-fork` to the argument list.)
+#
+# .. code-block:: bash
+#
+#     python -m tvm.exec.rpc_server --tracker=0.0.0.0:9190 --key=1080ti
+#
+# After registering devices, we can confirm this by querying the rpc_tracker:
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190
+#
+# For example, if we have four 1080ti, two titanx and one gfx900, the output can be
+#
+# .. code-block:: bash
+#
+#    Queue Status
+#    ----------------------------------
+#    key          total  free  pending
+#    ----------------------------------
+#    1080ti       4      4     0
+#    titanx       2      2     0
+#    gfx900       1      1     0
+#    ----------------------------------
+#
+# Finally, we need to change the tuning option to use RPCRunner. Use the code below
+# to replace the corresponding part above.
+
+tuning_option = {
+    'log_filename': log_file,
+
+    'tuner': 'xgb',
+    'n_trial': 2000,
+    'early_stopping': 600,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(timeout=10),
+        runner=autotvm.RPCRunner(
+            '1080ti',  # change the device key to your key
+            '0.0.0.0', 9190,
+            number=20, repeat=3, timeout=4, min_repeat_ms=150),
+    ),
+}
diff --git a/docs/_downloads/289640e7ded85c57931fc0537d443c5f/relay_pass_infra.ipynb b/docs/_downloads/289640e7ded85c57931fc0537d443c5f/relay_pass_infra.ipynb
new file mode 100644
index 0000000..c57cd2a
--- /dev/null
+++ b/docs/_downloads/289640e7ded85c57931fc0537d443c5f/relay_pass_infra.ipynb
@@ -0,0 +1,252 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n\nHow to Use Relay Pass Infra\n===========================\n**Author**: `Zhi Chen <https://github.com/zhiics>`_\n\nAs the number of optimization passes increases in Relay, it becomes intractable to\nexecute them and maintain their dependencies manually. Therefore, we have\nintroduced an infrastructure to manage the optimization passes.\n\nThe optimizations of a Relay program could be applied at various granularity,\nnamely function-level and module-level using :py:class:`tvm.r [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import numpy as np\nimport tvm\nfrom tvm import te\nimport tvm.relay as relay"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Create An Example Relay Program\n-------------------------------\nFirst of all, we create a simple Relay program for the tutorial. This program\nwill be used by various optimizations of the examples in this tutorial.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "def example():\n    shape = (1, 64, 54, 54)\n    c_data = np.empty(shape).astype(\"float32\")\n    c = relay.const(c_data)\n    weight = relay.var('weight', shape=(64, 64, 3, 3))\n    x = relay.var(\"x\", relay.TensorType((1, 64, 56, 56), \"float32\"))\n    conv = relay.nn.conv2d(x, weight)\n    y = relay.add(c, c)\n    y = relay.multiply(y, relay.const(2, \"float32\"))\n    y = relay.add(conv, y)\n    z = relay.add(y, c)\n    z1 = relay.add(y, c)\n    z2 = relay.add(z, z1)\n    [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Let us register layout alteration for a conv2d op so that we can apply the\nlayout alteration pass on the example. How alter layout pass works is out\nthe scope of this tutorial.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "@relay.op.register_alter_op_layout(\"nn.conv2d\", level=101)\ndef alter_conv2d(attrs, inputs, tinfos, out_type):\n    data, weight = inputs\n    new_attrs = dict(attrs)\n    new_attrs['data_layout'] = 'NCHW16c'\n    return relay.nn.conv2d(data, weight, **new_attrs)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Optimize the Program\n--------------------\nNow we would like to optimize the program. Relay features a host of\noptimizations. We will select some of them to apply on this example program.\n\nThere are multiple ways to optimize a Relay program. Below we will provide\nexamples for each of them.\n\nManually Apply Optimization Passes\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Let's first create a relay Module which contains one or multiple Relay\n# functions for optimization.\nf = example()\nmod = tvm.IRModule.from_expr(f)\n\n# Now we can apply constant folding on the module.\n# fold_const here is a callback that doesn't take any parameters.\nfold_const = relay.transform.FoldConstant()\n# Then, we can invoke the pass on the given module. Note that the constant\n# folding pass works at the function-level. That being said, each function in\n# the mod [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "More optimizations can be applied in the similar manner. For instance, we can\neliminate the common expressions that used by `z` and `z1`.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "mod = relay.transform.EliminateCommonSubexpr()(mod)\nprint(mod)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Some optimizations, such as fusion, are parameteric as well. For example,\nopt level 0 will not allow operators to be fused together. Users can pass the\n`fuse_opt_level` to enable this.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "mod = relay.transform.FuseOps(fuse_opt_level=0)(mod)\n\n# We can observe that the optimized module contains functions that only have\n# a signle primitive op.\nprint(mod)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Use Sequential to Apply a Sequence of Passes\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\nApplying passes as above is actually tedious and it may require users to have\nbetter understanding about the dependencies between them. For example, fusion\ncurrently doesn't work well on let bindings. Therefore, we would not be able\nto fuse operators that were fusable if :py:func:`relay.transform.ToANormalForm` is applied before\nfusion, as this pass generates let bindings for each exp [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Now let's execute some passes through :py:class:`tvm.relay.transform.Sequential`\nf = example()\nmod = tvm.IRModule.from_expr(f)\n# Glob the interested passes.\nseq = relay.transform.Sequential([relay.transform.FoldConstant(),\n                                  relay.transform.EliminateCommonSubexpr(),\n                                  relay.transform.FuseOps(fuse_opt_level=2)])\nmod1 = seq(mod)\nprint(mod1)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "From the transformed Relay program, we can see that there are still two\nidentical addition operations. This is because `EliminateCommonSubexpr`\nwas not actually performed. The reason is because only the passes that have\noptimization level less or equal to 2 will be executed by default under\n:py:class:`tvm.relay.transform.Sequential`. The pass infra,\nhowever, provides a configuration interface\nfor users to customize the optimization level that they want to execute.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "with relay.build_config(opt_level=3):\n    mod2 = seq(mod)\nprint(mod2)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Now we can see that only one of the two identical additions is kept.\n\nIn addition, users can selectively disable some passes using the\n`disabled_pass` config, which is similar to the `-fno-xxx` option used the\ngeneral purpose compilers, such as Clang and GCC. For example, we can disable\nEliminateCommonSubexpr as following. The printed module will again show two\nidentical addition operations.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "with relay.build_config(opt_level=3, disabled_pass=[\"EliminateCommonSubexpr\"]):\n    mod3 = seq(mod)\nprint(mod3)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "The passes applied so far are target independent. The pass infra also\nprovides a means to make pass target-aware. For example, the layout\nalteration pass falls in such category.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "with relay.build_config(opt_level=3):\n    mod4 = seq(mod)\nprint(mod4)\n\nseq1 = relay.transform.Sequential([relay.transform.AlterOpLayout()])\nwith relay.build_config(opt_level=3):\n    with tvm.target.create(\"llvm\"):\n        mod5 = seq1(mod)\nprint(mod5)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Implement a Pass Using Python Decorator\n------------------------------------------\nThe next example illustrates how we can orchestrate a customized optimization\npipeline through the pass infra using Python decorators. This functionality\ngreatly eases the implementation of passes. For example, users can simply\ndefine a decorated class to do function-level optimizations as the following\nexample shows. `transform_function` wraps a class to replace all constants\nwith a multip [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "@relay.transform.function_pass(opt_level=1)\nclass CustomPipeline:\n    \"\"\"Simple test function to replace one argument to another.\"\"\"\n\n    def __init__(self, multiplier):\n        self.multiplier = multiplier\n\n    # This function can define a pass.\n    def transform_function(self, func, mod, ctx):\n        obj = self\n\n        class ReplaceConstant(tvm.relay.ExprMutator):\n            def visit_const(self, c):\n                return relay.multiply(obj.multiplier, c [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Debug a Pass\n------------\nRelay provides users a plug-and-play style debugging pass that print the IR\nafter a certain pass is done. For example, we can print out the IR on the\ncompletion of constant folding and fusion by adding the debugging pass after\nthem.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "f = example()\nmod = tvm.IRModule.from_expr(f)\nseq = relay.transform.Sequential([relay.transform.FoldConstant(),\n                                  relay.transform.PrintIR(False),\n                                  relay.transform.EliminateCommonSubexpr(),\n                                  relay.transform.FuseOps(),\n                                  relay.transform.PrintIR(False)])\nwith relay.build_config(opt_level=3):\n    mod = seq(mod)\n\nprint(\"done\")"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/2c0ed53a9ebd68caf76cd8235fae2711/tune_relay_mobile_gpu.ipynb b/docs/_downloads/2c0ed53a9ebd68caf76cd8235fae2711/tune_relay_mobile_gpu.ipynb
new file mode 100644
index 0000000..ddd7c4f
--- /dev/null
+++ b/docs/_downloads/2c0ed53a9ebd68caf76cd8235fae2711/tune_relay_mobile_gpu.ipynb
@@ -0,0 +1,168 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\nAuto-tuning a convolutional network for Mobile GPU\n==================================================\n**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_, `Eddie Yan <https://github.com/eqy>`_\n\nAuto-tuning for a specific device is critical for getting the best\nperformance. This is a tutorial about how to tune a whole convolutional\nnetwork.\n\nThe operator implementation for Mobile GPU in TVM is written in template form.\nThe template has many tunable knobs (tile [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Install dependencies\n--------------------\nTo use the autotvm package in tvm, we need to install some extra dependencies.\n(change \"3\" to \"2\" if you use python2):\n\n.. code-block:: bash\n\n  pip3 install --user psutil xgboost tornado\n\nTo make TVM run faster during tuning, it is recommended to use cython\nas FFI of tvm. In the root directory of tvm, execute\n(change \"3\" to \"2\" if you use python2):\n\n.. code-block:: bash\n\n  pip3 install --user cython\n  sudo make cy [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import os\n\nimport numpy as np\n\nimport tvm\nfrom tvm import te\nfrom tvm import autotvm\nfrom tvm import relay\nimport tvm.relay.testing\nfrom tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner\nfrom tvm.contrib.util import tempdir\nimport tvm.contrib.graph_runtime as runtime"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Define network\n--------------\nFirst we need to define the network in relay frontend API.\nWe can load some pre-defined network from :code:`relay.testing`.\nWe can also load models from MXNet, ONNX and TensorFlow.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "def get_network(name, batch_size):\n    \"\"\"Get the symbol definition and random weight of a network\"\"\"\n    input_shape = (batch_size, 3, 224, 224)\n    output_shape = (batch_size, 1000)\n\n    if \"resnet\" in name:\n        n_layer = int(name.split('-')[1])\n        mod, params = relay.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)\n    elif \"vgg\" in name:\n        n_layer = int(name.split('-')[1])\n        mod, params = relay.testi [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Start RPC Tracker\n-----------------\nTVM uses RPC session to communicate with ARM boards.\nDuring tuning, the tuner will send the generated code to the board and\nmeasure the speed of code on the board.\n\nTo scale up the tuning, TVM uses RPC Tracker to manage distributed devices.\nThe RPC Tracker is a centralized master node. We can register all devices to\nthe tracker. For example, if we have 10 phones, we can register all of them\nto the tracker, and run 10 measurements in p [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Register devices to RPC Tracker\n-----------------------------------\nNow we can register our devices to the tracker. The first step is to\nbuild the TVM runtime for the ARM devices.\n\n* For Linux:\n  Follow this section `build-tvm-runtime-on-device` to build\n  the TVM runtime on the device. Then register the device to tracker by\n\n  .. code-block:: bash\n\n    python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=rk3399\n\n  (replace :code:`[HOST_IP]` with the IP addr [...]
+      ]
+    },
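+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "After the board is registered, it can help to confirm from the host that the\ntracker actually sees it before starting a long tuning run. The short sketch\nbelow is not part of the original tutorial; it assumes the tracker listens on\n0.0.0.0:9190 and that the board was registered under the key 'rk3399', so\nadjust these values to your own setup.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Optional sanity check (not part of the original tutorial).\n# NOTE: the tracker address and the device key 'rk3399' are assumptions taken\n# from the defaults used in this tutorial; adjust them to your setup.\nfrom tvm import rpc\n\ntracker = rpc.connect_tracker('0.0.0.0', 9190)\n# The registered key should appear in the queue summary, together with the\n# number of free devices available for parallel measurements.\nprint(tracker.text_summary())"
+      ]
+    },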
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Set Tuning Options\n------------------\nBefore tuning, we should apply some configurations. Here I use an RK3399 board\nas example. In your setting, you should modify the target and device_key accordingly.\nset :code:`use_android` to True if you use android phone.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "#### DEVICE CONFIG ####\n\ntarget = tvm.target.create('opencl -device=mali')\n\n# Replace \"aarch64-linux-gnu\" with the correct target of your board.\n# This target host is used for cross compilation. You can query it by :code:`gcc -v` on your device.\ntarget_host = 'llvm -target=aarch64-linux-gnu'\n\n# Also replace this with the device key in your tracker\ndevice_key = 'rk3399'\n\n# Set this to True if you use android phone\nuse_android = False\n\n#### TUNING OPTION ####\nnetw [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "<div class=\"alert alert-info\"><h4>Note</h4><p>How to set tuning options\n\n  In general, the default values provided here work well.\n  If you have enough time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,\n  which makes the tuning run longer.\n  If your device runs very slow or your conv2d operators have many GFLOPs, considering to\n  set timeout larger.</p></div>\n\n\n"
+      ]
+    },
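+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "To make the note above concrete, the sketch below shows one way these knobs\ncan be wired into autotvm. It is only an illustration with assumed placeholder\nvalues; the trial counts, timeouts and device key are not taken from this\ntutorial's actual tuning option dictionary.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Illustrative sketch only: the values below are assumed placeholders.\nfrom tvm import autotvm\n\nexample_measure_option = autotvm.measure_option(\n    # timeout (in seconds) for compiling one candidate kernel\n    builder=autotvm.LocalBuilder(timeout=10),\n    # run candidates remotely through the tracker; raise timeout for slow\n    # devices or for conv2d operators with many GFLOPs\n    runner=autotvm.RPCRunner(\n        'rk3399', host='0.0.0.0', port=9190,\n        number=10, repeat=1, timeout=5, min_repeat_ms=150),\n)\n\n# Larger values make the search longer but usually find better kernels.\nexample_tuning_knobs = {\n    'n_trial': 1000,\n    'early_stopping': 450,\n    'measure_option': example_measure_option,\n}"
+      ]
+    },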
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Begin Tuning\n------------\nNow we can extract tuning tasks from the network and begin tuning.\nHere, we provide a simple utility function to tune a list of tasks.\nThis function is just an initial implementation which tunes them in sequential order.\nWe will introduce a more sophisticated tuning scheduler in the future.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# You can skip the implementation of this function for this tutorial.\ndef tune_tasks(tasks,\n               measure_option,\n               tuner='xgb',\n               n_trial=1000,\n               early_stopping=None,\n               log_filename='tuning.log',\n               use_transfer_learning=True):\n    # create tmp log file\n    tmp_log_file = log_filename + \".tmp\"\n    if os.path.exists(tmp_log_file):\n        os.remove(tmp_log_file)\n\n    for i, tsk in enumerate(r [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Finally, we launch tuning jobs and evaluate the end-to-end performance.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "def tune_and_evaluate(tuning_opt):\n    # extract workloads from relay program\n    print(\"Extract tasks...\")\n    mod, params, input_shape, _ = get_network(network, batch_size=1)\n    tasks = autotvm.task.extract_from_program(mod[\"main\"],\n                                              target=target,\n                                              target_host=target_host,\n                                              params=params,\n                                           [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Sample Output\n-------------\nThe tuning needs to compile many programs and extract feature from them.\nSo a high performance CPU is recommended.\nOne sample output is listed below. It takes about 3 hours on a 32T AMD Ryzen Threadripper.\n\n.. code-block:: bash\n\n   Extract tasks...\n   Tuning...\n   [Task  1/17]  Current/Best:   25.30/  39.12 GFLOPS | Progress: (992/1000) | 751.22 s Done.\n   [Task  2/17]  Current/Best:   40.70/  45.50 GFLOPS | Progress: (736/1000) | 545.46 s  [...]
+      ]
+    },
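+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Once tuning has finished, the best records in the log file can be reused to\ncompile the network again without re-running the search. The following is a\nminimal sketch, not part of the original tutorial: it assumes the records were\nwritten to a file named 'tuning.log' (substitute your own log file name) and\nthat network, target, target_host and params are defined as above.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Minimal sketch: reuse previously tuned records instead of tuning again.\n# NOTE: 'tuning.log' is an assumed file name; point it at your own log file.\nmod, params, input_shape, _ = get_network(network, batch_size=1)\nwith autotvm.apply_history_best('tuning.log'):\n    with relay.build_config(opt_level=3):\n        graph, lib, params = relay.build_module.build(\n            mod, target=target, params=params, target_host=target_host)"
+      ]
+    },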
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "<div class=\"alert alert-info\"><h4>Note</h4><p>**Experiencing Difficulties?**\n\n  The auto tuning module is error-prone. If you always see \" 0.00/ 0.00 GFLOPS\",\n  then there must be something wrong.\n\n  First, make sure you set the correct configuration of your device.\n  Then, you can print debug information by adding these lines in the beginning\n  of the script. It will print every measurement result, where you can find useful\n  error messages.\n\n  .. code-block:: pyt [...]
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/2c8ef0390ad4c53ca85671fa36c33b26/tune_conv2d_cuda.py b/docs/_downloads/2c8ef0390ad4c53ca85671fa36c33b26/tune_conv2d_cuda.py
new file mode 100644
index 0000000..3cdbb84
--- /dev/null
+++ b/docs/_downloads/2c8ef0390ad4c53ca85671fa36c33b26/tune_conv2d_cuda.py
@@ -0,0 +1,237 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Tuning High Performance Convolution on NVIDIA GPUs
+=========================================================================
+**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_
+
+This is an advanced tutorial for writing a high performance tunable template for
+NVIDIA GPUs. By running the auto-tuner on this template, we can outperform the
+vendor-provided library cuDNN in many cases.
+"""
+
+######################################################################
+# Install dependencies
+# --------------------
+# To use the autotvm package in tvm, we need to install some extra dependencies.
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user psutil xgboost tornado
+#
+# To make TVM run faster during tuning, it is recommended to use cython
+# as the FFI of tvm. In the root directory of tvm, execute
+#
+# .. code-block:: bash
+#
+#   pip3 install --user cython
+#   sudo make cython3
+#
+# Now return to python code. Import packages.
+
+import logging
+import sys
+import numpy as np
+
+import tvm
+from tvm import te
+import topi
+from topi.testing import conv2d_nchw_python
+
+from tvm import autotvm
+
+######################################################################
+# Step 1:  Define the search space
+# --------------------------------
+# There are plenty of useful schedule primitives in tvm. You can also find
+# some tutorials that describe them in more detail, such as
+# (1). :ref:`opt-conv-gpu`
+# (2). `Optimizing DepthwiseConv on NVIDIA GPU <https://tvm.apache.org/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example>`_
+#
+# However, their implementations are manually tuned for some special input
+# shapes. In this section, we build a large enough space to cover
+# the techniques used in these tutorials. Then we rely on the efficient auto-tuner
+# to search through this space and pick some good configurations.
+#
+# If you are familiar with writing CUDA schedules, you will find that the following
+# template is very general. In fact, this template can be easily modified
+# to tune other operators such as depthwise convolution and GEMM.
+# In order to fully understand this template, you should be familiar with
+# the schedule primitives and auto tuning API. You can refer to the above
+# tutorials and :doc:`autotvm tutorial <tune_simple_template>`
+#
+# It is worth noting that the search space for a conv2d operator
+# can be very large (at the level of 10^9 for some input shapes)
+#
+
+@autotvm.template("tutorial/conv2d_no_batching")
+def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
+    assert N == 1, "Only consider batch_size = 1 in this template"
+
+    data = te.placeholder((N, CI, H, W), name='data')
+    kernel = te.placeholder((CO, CI, KH, KW), name='kernel')
+    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype='float32')
+    s = te.create_schedule([conv.op])
+
+    ##### space definition begin #####
+    n, f, y, x = s[conv].op.axis
+    rc, ry, rx = s[conv].op.reduce_axis
+
+    cfg = autotvm.get_config()
+    cfg.define_split("tile_f", f, num_outputs=4)
+    cfg.define_split("tile_y", y, num_outputs=4)
+    cfg.define_split("tile_x", x, num_outputs=4)
+    cfg.define_split("tile_rc", rc, num_outputs=3)
+    cfg.define_split("tile_ry", ry, num_outputs=3)
+    cfg.define_split("tile_rx", rx, num_outputs=3)
+    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
+    cfg.define_knob("unroll_explicit", [0, 1])
+    ##### space definition end #####
+
+    # inline padding
+    pad_data = s[conv].op.input_tensors[0]
+    s[pad_data].compute_inline()
+    data, raw_data = pad_data, data
+
+    output = conv
+    OL = s.cache_write(conv, 'local')
+
+    # create cache stage
+    AA = s.cache_read(data, 'shared', [OL])
+    WW = s.cache_read(kernel, 'shared', [OL])
+    AL = s.cache_read(AA, 'local', [OL])
+    WL = s.cache_read(WW, 'local', [OL])
+
+    # tile and bind spatial axes
+    n, f, y, x = s[output].op.axis
+    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
+    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
+    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
+    kernel_scope = n  # this is the scope to attach global config inside this kernel
+
+    s[output].bind(bf, te.thread_axis("blockIdx.z"))
+    s[output].bind(by, te.thread_axis("blockIdx.y"))
+    s[output].bind(bx, te.thread_axis("blockIdx.x"))
+    s[output].bind(vf, te.thread_axis("vthread"))
+    s[output].bind(vy, te.thread_axis("vthread"))
+    s[output].bind(vx, te.thread_axis("vthread"))
+    s[output].bind(tf, te.thread_axis("threadIdx.z"))
+    s[output].bind(ty, te.thread_axis("threadIdx.y"))
+    s[output].bind(tx, te.thread_axis("threadIdx.x"))
+    s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
+    s[OL].compute_at(s[output], tx)
+
+    # tile reduction axes
+    n, f, y, x = s[OL].op.axis
+    rc, ry, rx = s[OL].op.reduce_axis
+    rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc)
+    ryo, rym, ryi = cfg['tile_ry'].apply(s, OL, ry)
+    rxo, rxm, rxi = cfg['tile_rx'].apply(s, OL, rx)
+    s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, y, x)
+
+    s[AA].compute_at(s[OL], rxo)
+    s[WW].compute_at(s[OL], rxo)
+    s[AL].compute_at(s[OL], rxm)
+    s[WL].compute_at(s[OL], rxm)
+
+    # cooperative fetching
+    for load in [AA, WW]:
+        n, f, y, x = s[load].op.axis
+        fused = s[load].fuse(n, f, y, x)
+        tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
+        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
+        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
+        s[load].bind(tz, te.thread_axis("threadIdx.z"))
+        s[load].bind(ty, te.thread_axis("threadIdx.y"))
+        s[load].bind(tx, te.thread_axis("threadIdx.x"))
+
+    # tune unroll
+    s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
+    s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
+
+    return s, [raw_data, kernel, conv]
+
+######################################################################
+# Step 2:  Search through the space
+# ---------------------------------
+# We pick the last layer of ResNet as our test case.
+# Since our space is very large, :code:`XGBoostTuner` is most suitable
+# for our case. Here we only do 20 trials for demonstration.
+# In practice, running 1000 trials can usually find some good kernels
+# for this template.
+
+# logging config (for printing tuning log to screen)
+logging.getLogger('autotvm').setLevel(logging.DEBUG)
+logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))
+
+# the last layer in resnet
+N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)
+task = autotvm.task.create("tutorial/conv2d_no_batching",
+                           args=(N, H, W, CO, CI, KH, KW, strides, padding),
+                           target='cuda')
+print(task.config_space)
+
+# Use the local GPU and measure each config several times to reduce variance.
+# The timeout of compiling a program is 10 seconds, the timeout for running is 4 seconds
+measure_option = autotvm.measure_option(
+    builder=autotvm.LocalBuilder(),
+    runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4)
+)
+
+# Begin tuning, log records to file `conv2d.log`
+# During tuning we will also try many invalid configs, so you are expected to
+# see many error reports. As long as you can see non-zero GFLOPS, it is okay.
+tuner = autotvm.tuner.XGBTuner(task)
+tuner.tune(n_trial=20,
+           measure_option=measure_option,
+           callbacks=[autotvm.callback.log_to_file('conv2d.log')])
+
+#########################################################################
+# Finally we can inspect the best config from log file, check correctness,
+# and measure running time.
+
+# inspect the best config
+dispatch_context = autotvm.apply_history_best("conv2d.log")
+best_config = dispatch_context.query(task.target, task.workload)
+print("\nBest config:")
+print(best_config)
+
+# apply history best from log file
+with autotvm.apply_history_best('conv2d.log'):
+    with tvm.target.create("cuda"):
+        s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
+        func = tvm.build(s, arg_bufs)
+
+# check correctness
+a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
+w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
+c_np = conv2d_nchw_python(a_np, w_np, strides, padding)
+
+ctx = tvm.gpu()
+a_tvm = tvm.nd.array(a_np, ctx=ctx)
+w_tvm = tvm.nd.array(w_np, ctx=ctx)
+c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
+func(a_tvm, w_tvm, c_tvm)
+
+tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
+
+# Evaluate running time. Here we choose a large repeat number (400) to reduce the noise
+# and the overhead of kernel launch. You can also use nvprof to validate the result.
+evaluator = func.time_evaluator(func.entry_name, ctx, number=400)
+print('Time cost of this operator: %f' % evaluator(a_tvm, w_tvm, c_tvm).mean)
+
diff --git a/docs/_downloads/2daaacf3c023a8ad30b14e52b9aaa635/matrix_multiply_opt.ipynb b/docs/_downloads/2daaacf3c023a8ad30b14e52b9aaa635/matrix_multiply_opt.ipynb
new file mode 100644
index 0000000..96f65be
--- /dev/null
+++ b/docs/_downloads/2daaacf3c023a8ad30b14e52b9aaa635/matrix_multiply_opt.ipynb
@@ -0,0 +1,176 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n\nMatrix Multiply Blocking\n========================\n**Author**: `Thierry Moreau <https://homes.cs.washington.edu/~moreau/>`_\n\nThis tutorial provides an overview on how to use TVM to map matrix\nmultiplication efficiently on the VTA design.\nWe recommend covering the `basic-mat-mult` tutorial first.\n\nIn this tutorial, we will demonstrate TVM schedule optimizations to break large\nneural network operators down onto smaller blocks to achieve computation within\nlimited hard [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "RPC Setup\n---------\nWe start by programming the Pynq's FPGA and building its RPC runtime.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from __future__ import absolute_import, print_function\n\nimport os\nimport tvm\nfrom tvm import te\nimport vta\nimport numpy as np\nfrom tvm import rpc\nfrom tvm.contrib import util\nfrom vta.testing import simulator\n\n# Load VTA parameters from the vta/vta-hw/config/vta_config.json file\nenv = vta.get_env()\n\n# We read the Pynq RPC host IP address and port number from the OS environment\nhost = os.environ.get(\"VTA_RPC_HOST\", \"192.168.2.99\")\nport = int(os.environ.get(\"V [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Computation Declaration\n-----------------------\nAs a first step, we need to describe our matrix multiplication computation.\nWe define the matrix multiplication as the computation one would find in a\nfully connected layer, defined by its batch size, input channels, and output\nchannels.\nThese have to be integer multiples of the VTA tensor shape:\n:code:`BATCH`, :code:`BLOCK_IN`, and :code:`BLOCK_OUT` respectively.\n\nWe've added extra operators to the matrix multiplication t [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Fully connected layer dimensions: 1024 x 1024\nbatch_size = 1\nin_channels = 1024\nout_channels = 1024\nassert batch_size % env.BATCH == 0\nassert in_channels % env.BLOCK_IN == 0\nassert out_channels % env.BLOCK_OUT == 0\n\n# Let's derive the tiled input tensor shapes\ndata_shape = (batch_size // env.BATCH,\n              in_channels // env.BLOCK_IN,\n              env.BATCH,\n              env.BLOCK_IN)\nweight_shape = (out_channels // env.BLOCK_OUT,\n                in_chann [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Scheduling the Computation\n--------------------------\nWe'll look at a set of schedule transformations necessary to map the\nmatrix multiplications onto VTA in an efficient fashion.\nThose include:\n\n- Computation blocking\n- Lowering to VTA hardware intrinsics\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Create TVM schedule\ns = te.create_schedule(res.op)\n# Let's look at the default TVM schedule\nprint(tvm.lower(s, [data, weight, res], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Blocking the Computation\n~~~~~~~~~~~~~~~~~~~~~~~~\nThe matrix multiplication is by default too large for activations or weights\nto fit on VTA's on-chip buffers all at once.\nWe block the (1, 1024) by (1024, 1024) matrix multiplication into\nsmaller (1, 256) by (256, 256) matrix multiplications so the intermediate\ntensors can fit on the accelerator's on-chip SRAM.\nThis approach is similar to blocking techniques applied to CPUs and GPUs in\norder to increase cache hit rate.\n\ [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Let's define tiling sizes (expressed in multiples of VTA tensor shape size)\nb_block = 1 // env.BATCH\ni_block = 256 // env.BLOCK_IN\no_block = 256 // env.BLOCK_OUT\n\n# Tile the output tensor along the batch and output channel dimensions\n# (since by default we are doing single batch inference, the split along\n#  the batch dimension has no effect)\nb, oc, b_tns, oc_tns = s[res].op.axis\nb_out, b_inn = s[res].split(b, b_block)\noc_out, oc_inn = s[res].split(oc, o_block)\ns[re [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Lowering Copies to DMA Transfers\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\nNext we set the buffer scopes to the corresponding on-chip VTA SRAM buffers.\nWe move the load loops into the matrix multiply computation loop to stage\nmemory loads such that they fit in the on-chip SRAM buffers.\nFinally we annotate the load/store loop outer axes with the DMA copy pragma\nto perform bulk memory transfers on VTA.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Set scope of SRAM buffers\ns[data_buf].set_scope(env.inp_scope)\ns[weight_buf].set_scope(env.wgt_scope)\ns[res_gemm].set_scope(env.acc_scope)\ns[res_shr].set_scope(env.acc_scope)\ns[res_min].set_scope(env.acc_scope)\ns[res_max].set_scope(env.acc_scope)\n\n# Block data and weight cache reads\ns[data_buf].compute_at(s[res_gemm], ic_out)\ns[weight_buf].compute_at(s[res_gemm], ic_out)\n\n# Use DMA copy pragma on DRAM->SRAM operations\ns[data_buf].pragma(s[data_buf].op.axis[0], env [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Lowering Computation to VTA Compute Intrinsics\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\nThe last phase is to lower the computation loops down to VTA hardware\nintrinsics by mapping the matrix multiplication to tensor intrinsics,\nand mapping the shift, and clipping computation to the vector ALU.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Apply tensorization over the batch tensor tile axis\ns[res_gemm].tensorize(b_tns, env.gemm)\n\n# Add an ALU pragma over the shift and clipping operations\ns[res_shr].pragma(s[res_shr].op.axis[0], env.alu)\ns[res_min].pragma(s[res_min].op.axis[0], env.alu)\ns[res_max].pragma(s[res_max].op.axis[0], env.alu)\n\n# Let's look at the final lowered TVM schedule after lowering memory\n# loads/stores down to DMA copy intrinsics, and the computation down to\n# VTA compute intrinsics.\np [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "TVM Compilation and Verification\n--------------------------------\nAfter specifying the schedule, we can compile it into a TVM function.\nWe save the module so we can send it over RPC.\nWe run the function and verify it against a numpy implementation to\nensure correctness.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Compile the TVM module\nmy_gemm = vta.build(s, [data, weight, res], \"ext_dev\", env.target_host, name=\"my_gemm\")\ntemp = util.tempdir()\nmy_gemm.save(temp.relpath(\"gemm.o\"))\nremote.upload(temp.relpath(\"gemm.o\"))\nf = remote.load_module(\"gemm.o\")\n\n# Get the remote device context\nctx = remote.ext_dev(0)\n\n# Initialize the data and weight arrays randomly in the int range of (-128, 128]\ndata_np = np.random.randint(\n    -128, 128, size=(batch_size, in_channels)).ast [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Summary\n-------\nThis tutorial demonstrates how TVM scheduling primitives can achieve\ncomputation blocking for a matrix multiplication example.\nThis allows us to map arbitrarily large computation onto limited\nhardware accelerator resources.\n\n\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/2e974b05b6d59fcf944f96d27106b994/from_keras.ipynb b/docs/_downloads/2e974b05b6d59fcf944f96d27106b994/from_keras.ipynb
new file mode 100644
index 0000000..54e5370
--- /dev/null
+++ b/docs/_downloads/2e974b05b6d59fcf944f96d27106b994/from_keras.ipynb
@@ -0,0 +1,144 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\nCompile Keras Models\n=====================\n**Author**: `Yuwei Hu <https://Huyuwei.github.io/>`_\n\nThis article is an introductory tutorial to deploy keras models with Relay.\n\nFor us to begin with, keras should be installed.\nTensorflow is also required since it's used as the default backend of keras.\n\nA quick solution is to install via pip\n\n.. code-block:: bash\n\n    pip install -U keras --user\n    pip install -U tensorflow --user\n\nor please refer to official site [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import tvm\nfrom tvm import te\nimport tvm.relay as relay\nfrom tvm.contrib.download import download_testdata\nimport keras\nimport numpy as np"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Load pretrained keras model\n----------------------------\nWe load a pretrained resnet-50 classification model provided by keras.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "weights_url = ''.join(['https://github.com/fchollet/deep-learning-models/releases/',\n                       'download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels.h5'])\nweights_file = 'resnet50_weights.h5'\nweights_path = download_testdata(weights_url, weights_file, module='keras')\nkeras_resnet50 = keras.applications.resnet50.ResNet50(include_top=True, weights=None,\n                                                      input_shape=(224, 224, 3), classes=1000)\nkeras_resn [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Load a test image\n------------------\nA single cat dominates the examples!\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from PIL import Image\nfrom matplotlib import pyplot as plt\nfrom keras.applications.resnet50 import preprocess_input\nimg_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'\nimg_path = download_testdata(img_url, 'cat.png', module='data')\nimg = Image.open(img_path).resize((224, 224))\nplt.imshow(img)\nplt.show()\n# input preprocess\ndata = np.array(img)[np.newaxis, :].astype('float32')\ndata = preprocess_input(data).transpose([0, 3, 1, 2])\nprint('input_ [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Compile the model with Relay\n----------------------------\nconvert the keras model(NHWC layout) to Relay format(NCHW layout).\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "shape_dict = {'input_1': data.shape}\nmod, params = relay.frontend.from_keras(keras_resnet50, shape_dict)\n# compile the model\ntarget = 'cuda'\nctx = tvm.gpu(0)\nwith relay.build_config(opt_level=3):\n    executor = relay.build_module.create_executor('graph', mod, ctx, target)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Execute on TVM\n---------------\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "dtype = 'float32'\ntvm_out = executor.evaluate()(tvm.nd.array(data.astype(dtype)), **params)\ntop1_tvm = np.argmax(tvm_out.asnumpy()[0])"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Look up synset name\n-------------------\nLook up prediction top 1 index in 1000 class synset.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',\n                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',\n                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',\n                      'imagenet1000_clsid_to_human.txt'])\nsynset_name = 'imagenet1000_clsid_to_human.txt'\nsynset_path = download_testdata(synset_url, synset_name, module='data')\nwith open(synset_path) as f:\n    synset = eval(f.read())\nprint('Relay top-1 id: {}, class name:  [...]
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/2f6dcf56b15f857f94b6d320c1ace6e5/from_coreml.ipynb b/docs/_downloads/2f6dcf56b15f857f94b6d320c1ace6e5/from_coreml.ipynb
new file mode 100644
index 0000000..6f98860
--- /dev/null
+++ b/docs/_downloads/2f6dcf56b15f857f94b6d320c1ace6e5/from_coreml.ipynb
@@ -0,0 +1,144 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\nCompile CoreML Models\n=====================\n**Author**: `Joshua Z. Zhang <https://zhreshold.github.io/>`_,             `Kazutaka Morita <https://github.com/kazum>`_,             `Zhao Wu <https://github.com/FrozenGene>`_\n\nThis article is an introductory tutorial to deploy CoreML models with Relay.\n\nFor us to begin with, coremltools module is required to be installed.\n\nA quick solution is to install via pip\n\n.. code-block:: bash\n\n    pip install -U coremltools --use [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import tvm\nfrom tvm import te\nimport tvm.relay as relay\nfrom tvm.contrib.download import download_testdata\nimport coremltools as cm\nimport numpy as np\nfrom PIL import Image"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Load pretrained CoreML model\n----------------------------\nWe will download and load a pretrained mobilenet classification network\nprovided by apple in this example\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "model_url = 'https://docs-assets.developer.apple.com/coreml/models/MobileNet.mlmodel'\nmodel_file = 'mobilenet.mlmodel'\nmodel_path = download_testdata(model_url, model_file, module='coreml')\n# Now you have mobilenet.mlmodel on disk\nmlmodel = cm.models.MLModel(model_path)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Load a test image\n------------------\nA single cat dominates the examples!\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'\nimg_path = download_testdata(img_url, 'cat.png', module='data')\nimg = Image.open(img_path).resize((224, 224))\n# Mobilenet.mlmodel's input is BGR format\nimg_bgr = np.array(img)[:,:,::-1]\nx = np.transpose(img_bgr, (2, 0, 1))[np.newaxis, :]"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Compile the model on Relay\n---------------------------\nWe should be familiar with the process right now.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "target = 'llvm'\nshape_dict = {'image': x.shape}\n\n# Parse CoreML model and convert into Relay computation graph\nmod, params = relay.frontend.from_coreml(mlmodel, shape_dict)\n\nwith relay.build_config(opt_level=3):\n    graph, lib, params = relay.build(mod,\n                                     target,\n                                     params=params)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Execute on TVM\n-------------------\nThe process is no different from other example\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from tvm.contrib import graph_runtime\nctx = tvm.cpu(0)\ndtype = 'float32'\nm = graph_runtime.create(graph, lib, ctx)\n# set inputs\nm.set_input('image', tvm.nd.array(x.astype(dtype)))\nm.set_input(**params)\n# execute\nm.run()\n# get outputs\ntvm_output = m.get_output(0)\ntop1 = np.argmax(tvm_output.asnumpy()[0])"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Look up synset name\n-------------------\nLook up prediction top 1 index in 1000 class synset.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',\n                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',\n                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',\n                      'imagenet1000_clsid_to_human.txt'])\nsynset_name = 'imagenet1000_clsid_to_human.txt'\nsynset_path = download_testdata(synset_url, synset_name, module='data')\nwith open(synset_path) as f:\n    synset = eval(f.read())\n# You should see the following result:  [...]
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/30015213c2882505d466865fafaed52d/from_caffe2.ipynb b/docs/_downloads/30015213c2882505d466865fafaed52d/from_caffe2.ipynb
new file mode 100644
index 0000000..70a7e7f
--- /dev/null
+++ b/docs/_downloads/30015213c2882505d466865fafaed52d/from_caffe2.ipynb
@@ -0,0 +1,133 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\nCompile Caffe2 Models\n=====================\n**Author**: `Hiroyuki Makino <https://makihiro.github.io/>`_\n\nThis article is an introductory tutorial to deploy Caffe2 models with Relay.\n\nFor us to begin with, Caffe2 should be installed.\n\nA quick solution is to install via conda\n\n.. code-block:: bash\n\n    # for cpu\n    conda install pytorch-nightly-cpu -c pytorch\n    # for gpu with CUDA 8\n    conda install pytorch-nightly cuda80 -c pytorch\n\nor please refer to offi [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Load pretrained Caffe2 model\n----------------------------\nWe load a pretrained resnet50 classification model provided by Caffe2.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from caffe2.python.models.download import ModelDownloader\nmf = ModelDownloader()\n\nclass Model:\n    def __init__(self, model_name):\n        self.init_net, self.predict_net, self.value_info = mf.get_c2_model(model_name)\n\nresnet50 = Model('resnet50')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Load a test image\n------------------\nA single cat dominates the examples!\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from tvm.contrib.download import download_testdata\nfrom PIL import Image\nfrom matplotlib import pyplot as plt\nimport numpy as np\nimg_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'\nimg_path = download_testdata(img_url, 'cat.png', module='data')\nimg = Image.open(img_path).resize((224, 224))\nplt.imshow(img)\nplt.show()\n# input preprocess\ndef transform_image(image):\n    image = np.array(image) - np.array([123., 117., 104.])\n    image /= np.arra [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Compile the model on Relay\n--------------------------\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Caffe2 input tensor name, shape and type\ninput_name = resnet50.predict_net.op[0].input[0]\nshape_dict = {input_name: data.shape}\ndtype_dict = {input_name: data.dtype}\n\n# parse Caffe2 model and convert into Relay computation graph\nfrom tvm import relay\nmod, params = relay.frontend.from_caffe2(resnet50.init_net, resnet50.predict_net, shape_dict, dtype_dict)\n\n# compile the model\n# target x86 CPU\ntarget = 'llvm'\nwith relay.build_config(opt_level=3):\n    graph, lib, par [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Execute on TVM\n---------------\nThe process is no different from other examples.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import tvm\nfrom tvm import te\nfrom tvm.contrib import graph_runtime\n# context x86 CPU, use tvm.gpu(0) if you run on GPU\nctx = tvm.cpu(0)\n# create a runtime executor module\nm = graph_runtime.create(graph, lib, ctx)\n# set inputs\nm.set_input(input_name, tvm.nd.array(data.astype('float32')))\n# set related params\nm.set_input(**params)\n# execute\nm.run()\n# get outputs\ntvm_out = m.get_output(0)\ntop1_tvm = np.argmax(tvm_out.asnumpy()[0])"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Look up synset name\n-------------------\nLook up prediction top 1 index in 1000 class synset.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from caffe2.python import workspace\nsynset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',\n                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',\n                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',\n                      'imagenet1000_clsid_to_human.txt'])\nsynset_name = 'imagenet1000_clsid_to_human.txt'\nsynset_path = download_testdata(synset_url, synset_name, module='data')\nwith open(synset_path) as f:\n    synset = eval(f.read())\npr [...]
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/38606228ff7130fbd6473b7c0625ddcd/deploy_model_on_android.ipynb b/docs/_downloads/38606228ff7130fbd6473b7c0625ddcd/deploy_model_on_android.ipynb
new file mode 100644
index 0000000..6a993e9
--- /dev/null
+++ b/docs/_downloads/38606228ff7130fbd6473b7c0625ddcd/deploy_model_on_android.ipynb
@@ -0,0 +1,190 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n\nDeploy the Pretrained Model on Android\n=======================================\n**Author**: `Tomohiro Kato <https://tkat0.github.io/>`_\n\nThis is an example of using Relay to compile a keras model and deploy it on Android device.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import os\nimport numpy as np\nfrom PIL import Image\nimport keras\nfrom keras.applications.mobilenet_v2 import MobileNetV2\nimport tvm\nfrom tvm import te\nimport tvm.relay as relay\nfrom tvm import rpc\nfrom tvm.contrib import util, ndk, graph_runtime as runtime\nfrom tvm.contrib.download import download_testdata"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Setup Environment\n-----------------\nSince there are many required packages for Android, it is recommended to use the official Docker Image.\n\nFirst, to build and run Docker Image, we can run the following command.\n\n.. code-block:: bash\n\n  git clone --recursive https://github.com/apache/incubator-tvm tvm\n  cd tvm\n  docker build -t tvm.demo_android -f docker/Dockerfile.demo_android ./docker\n  docker run --pid=host -h tvm -v $PWD:/workspace \\\n         -w /workspace -p 9 [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Start RPC Tracker\n-----------------\nTVM uses RPC session to communicate with Android device.\n\nTo start an RPC tracker, run this command in the container. The tracker is\nrequired during the whole tuning process, so we need to open a new terminal for\nthis command:\n\n.. code-block:: bash\n\n  python3 -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190\n\nThe expected output is\n\n.. code-block:: bash\n\n  INFO:RPCTracker:bind to 0.0.0.0:9190\n\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Register Android device to RPC Tracker\n--------------------------------------\nNow we can register our Android device to the tracker.\n\nFollow this `readme page <https://github.com/apache/incubator-tvm/tree/master/apps/android_rpc>`_ to\ninstall TVM RPC APK on the android device.\n\nHere is an example of config.mk. I enabled OpenCL and Vulkan.\n\n\n.. code-block:: bash\n\n  APP_ABI = arm64-v8a\n\n  APP_PLATFORM = android-24\n\n  # whether enable OpenCL during compile\n  USE_OP [...]
+      ]
+    },
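+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Before moving on, it can be useful to check from the host that the phone is\nvisible to the tracker and that the GPU backends you enabled are actually\nexposed by the remote runtime. The sketch below is not part of the original\ntutorial; it assumes the tracker listens on 0.0.0.0:9190 and uses the device\nkey 'android' registered above.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Optional sanity check (assumed tracker address and device key).\nfrom tvm import rpc\n\ntracker = rpc.connect_tracker('0.0.0.0', 9190)\nprint(tracker.text_summary())  # the 'android' key should be listed here\n\nremote = tracker.request('android', priority=0, session_timeout=60)\n# Check which device types exist on the phone before choosing test_target.\nprint('OpenCL device present:', remote.cl(0).exist)\nprint('Vulkan device present:', remote.vulkan(0).exist)"
+      ]
+    },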
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Load pretrained keras model\n---------------------------\nWe load a pretrained MobileNetV2(alpha=0.5) classification model provided by keras.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "keras.backend.clear_session()  # Destroys the current TF graph and creates a new one.\nweights_url = ''.join(['https://github.com/JonathanCMitchell/',\n                       'mobilenet_v2_keras/releases/download/v1.1/',\n                       'mobilenet_v2_weights_tf_dim_ordering_tf_kernels_0.5_224.h5'])\nweights_file = 'mobilenet_v2_weights.h5'\nweights_path = download_testdata(weights_url, weights_file, module='keras')\nkeras_mobilenet_v2 = MobileNetV2(alpha=0.5, include_top [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "In order to test our model, here we download an image of cat and\ntransform its format.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'\nimg_name = 'cat.png'\nimg_path = download_testdata(img_url, img_name, module='data')\nimage = Image.open(img_path).resize((224, 224))\ndtype = 'float32'\n\ndef transform_image(image):\n    image = np.array(image) - np.array([123., 117., 104.])\n    image /= np.array([58.395, 57.12, 57.375])\n    image = image.transpose((2, 0, 1))\n    image = image[np.newaxis, :]\n    return image\n\nx = transform_im [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "synset is used to transform the label from number of ImageNet class to\nthe word human can understand.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',\n                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',\n                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',\n                      'imagenet1000_clsid_to_human.txt'])\nsynset_name = 'imagenet1000_clsid_to_human.txt'\nsynset_path = download_testdata(synset_url, synset_name, module='data')\nwith open(synset_path) as f:\n    synset = eval(f.read())"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Compile the model with relay\n----------------------------\nIf we run the example on our x86 server for demonstration, we can simply\nset it as :code:`llvm`. If running it on the Android device, we need to\nspecify its instruction set. Set :code:`local_demo` to False if you want\nto run this tutorial with a real device.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "local_demo = True\n\n# by default on CPU target will execute.\n# select 'cpu', 'opencl' and 'vulkan'\ntest_target = 'cpu'\n\n# Change target configuration.\n# Run `adb shell cat /proc/cpuinfo` to find the arch.\narch = 'arm64'\ntarget = 'llvm -target=%s-linux-android' % arch\ntarget_host = None\n\nif local_demo:\n    target_host = None\n    target = 'llvm'\nelif test_target == 'opencl':\n    target_host = target\n    target = 'opencl'\nelif test_target == 'vulkan':\n    target_h [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Deploy the Model Remotely by RPC\n--------------------------------\nWith RPC, you can deploy the model remotely from your host machine\nto the remote android device.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "tracker_host = os.environ.get('TVM_TRACKER_HOST', '0.0.0.0')\ntracker_port = int(os.environ.get('TVM_TRACKER_PORT', 9190))\nkey = 'android'\n\nif local_demo:\n    remote = rpc.LocalSession()\nelse:\n    tracker = rpc.connect_tracker(tracker_host, tracker_port)\n    # When running a heavy model, we should increase the `session_timeout`\n    remote = tracker.request(key, priority=0,\n                             session_timeout=60)\n\nif local_demo:\n    ctx = remote.cpu(0)\nelif  [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Execute on TVM\n--------------\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# set parameter (upload params to the remote device. This may take a while)\nmodule.set_input(**params)\n# set input data\nmodule.set_input(input_name, tvm.nd.array(x.astype(dtype)))\n# run\nmodule.run()\n# get output\nout = module.get_output(0)\n\n# get top1 result\ntop1 = np.argmax(out.asnumpy())\nprint('TVM prediction top-1: {}'.format(synset[top1]))\n\nprint('Evaluate inference time cost...')\nftimer = module.module.time_evaluator('run', ctx, number=1, repeat=10)\nprof_res = [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Sample Output\n-------------\nThe following is the result of 'cpu', 'opencl' and 'vulkan' using Adreno 530 on Snapdragon 820\n\nAlthough we can run on a GPU, it is slower than CPU.\nTo speed up, we need to write and optimize the schedule according to the GPU architecture.\n\n.. code-block:: bash\n\n   # cpu\n   TVM prediction top-1: tiger cat\n   Evaluate inference time cost...\n   Mean inference time (std dev): 37.92 ms (19.67 ms)\n\n   # opencl\n   TVM prediction top-1: tiger  [...]
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/3961fdfa7abff1b6dc996faa43b4c40f/deploy_model_on_android.py b/docs/_downloads/3961fdfa7abff1b6dc996faa43b4c40f/deploy_model_on_android.py
new file mode 100644
index 0000000..17ec9cb
--- /dev/null
+++ b/docs/_downloads/3961fdfa7abff1b6dc996faa43b4c40f/deploy_model_on_android.py
@@ -0,0 +1,360 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+.. _tutorial-deploy-model-on-android:
+
+Deploy the Pretrained Model on Android
+=======================================
+**Author**: `Tomohiro Kato <https://tkat0.github.io/>`_
+
+This is an example of using Relay to compile a keras model and deploy it on an Android device.
+"""
+
+import os
+import numpy as np
+from PIL import Image
+import keras
+from keras.applications.mobilenet_v2 import MobileNetV2
+import tvm
+from tvm import te
+import tvm.relay as relay
+from tvm import rpc
+from tvm.contrib import util, ndk, graph_runtime as runtime
+from tvm.contrib.download import download_testdata
+
+
+######################################################################
+# Setup Environment
+# -----------------
+# Since there are many required packages for Android, it is recommended to use the official Docker Image.
+#
+# First, to build and run the Docker image, we can run the following commands.
+#
+# .. code-block:: bash
+#
+#   git clone --recursive https://github.com/apache/incubator-tvm tvm
+#   cd tvm
+#   docker build -t tvm.demo_android -f docker/Dockerfile.demo_android ./docker
+#   docker run --pid=host -h tvm -v $PWD:/workspace \
+#          -w /workspace -p 9190:9190 --name tvm -it tvm.demo_android bash
+#
+# You are now inside the container, and the cloned TVM directory is mounted at /workspace.
+# Port 9190, which is used by the RPC tracker described later, is also exposed by the `docker run` command above.
+#
+# .. note::
+#
+#   Please execute the following steps in the container.
+#   We can execute :code:`docker exec -it tvm bash` to open a new terminal in the container.
+#
+# Next we build TVM.
+#
+# .. code-block:: bash
+#
+#   mkdir build
+#   cd build
+#   cmake -DUSE_LLVM=llvm-config-8 \
+#         -DUSE_RPC=ON \
+#         -DUSE_SORT=ON \
+#         -DUSE_VULKAN=ON \
+#         -DUSE_GRAPH_RUNTIME=ON \
+#         ..
+#   make -j10
+#
+# After building TVM successfully, please set PYTHONPATH.
+#
+# .. code-block:: bash
+#
+#   echo 'export PYTHONPATH=/workspace/python:/workspace/topi/python:/workspace/vta/python:${PYTHONPATH}' >> ~/.bashrc
+#   source ~/.bashrc
+
+#################################################################
+# Start RPC Tracker
+# -----------------
+# TVM uses an RPC session to communicate with the Android device.
+#
+# To start an RPC tracker, run this command in the container. The tracker is
+# required during the whole tuning process, so we need to open a new terminal for
+# this command:
+#
+# .. code-block:: bash
+#
+#   python3 -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
+#
+# The expected output is
+#
+# .. code-block:: bash
+#
+#   INFO:RPCTracker:bind to 0.0.0.0:9190
+
+#################################################################
+# Register Android device to RPC Tracker
+# --------------------------------------
+# Now we can register our Android device to the tracker.
+#
+# Follow this `readme page <https://github.com/apache/incubator-tvm/tree/master/apps/android_rpc>`_ to
+# install TVM RPC APK on the android device.
+#
+# Here is an example of config.mk. I enabled OpenCL and Vulkan.
+#
+#
+# .. code-block:: bash
+#
+#   APP_ABI = arm64-v8a
+#
+#   APP_PLATFORM = android-24
+#
+#   # whether enable OpenCL during compile
+#   USE_OPENCL = 1
+#
+#   # whether to enable Vulkan during compile
+#   USE_VULKAN = 1
+#
+#   ifeq ($(USE_VULKAN), 1)
+#     # Statically linking vulkan requires API Level 24 or higher
+#     APP_PLATFORM = android-24
+#   endif
+#
+#   # the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc
+#   ADD_C_INCLUDES += /work/adrenosdk-linux-5_0/Development/Inc
+#   # downloaded from https://github.com/KhronosGroup/OpenCL-Headers
+#   ADD_C_INCLUDES += /usr/local/OpenCL-Headers/
+#
+#   # the additional link libs you want to add, e.g., ANDROID_LIB_PATH/libOpenCL.so
+#   ADD_LDLIBS = /workspace/pull-from-android-device/libOpenCL.so
+#
+# .. note::
+#
+#   At this time, don't forget to `create a standalone toolchain <https://github.com/apache/incubator-tvm/tree/master/apps/android_rpc#architecture-and-android-standalone-toolchain>`_ .
+#
+#   for example
+#
+#   .. code-block:: bash
+#
+#     /opt/android-sdk-linux/ndk-bundle/build/tools/make-standalone-toolchain.sh \
+#        --platform=android-24 --use-llvm --arch=arm64 --install-dir=/opt/android-toolchain-arm64
+#     export TVM_NDK_CC=/opt/android-toolchain-arm64/bin/aarch64-linux-android-g++
+#
+# Next, start the Android application and enter the IP address and port of the RPC tracker.
+# Your device is then registered to the tracker.
+#
+# After registering the device, we can confirm the registration by querying the RPC tracker
+#
+# .. code-block:: bash
+#
+#   python3 -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190
+#
+# For example, if we have one Android device,
+# the output can be:
+#
+# .. code-block:: bash
+#
+#    Queue Status
+#    ----------------------------------
+#    key          total  free  pending
+#    ----------------------------------
+#    android      1      1     0
+#    ----------------------------------
+#
+# To confirm that you can communicate with the Android device, we can run the following test script.
+# If you use OpenCL or Vulkan, please set :code:`test_opencl` and :code:`test_vulkan` in the script.
+#
+# .. code-block:: bash
+#
+#   export TVM_TRACKER_HOST=0.0.0.0
+#   export TVM_TRACKER_PORT=9190
+#
+# .. code-block:: bash
+#
+#   cd /workspace/apps/android_rpc
+#   python3 tests/android_rpc_test.py
+#
+
+######################################################################
+# Load pretrained keras model
+# ---------------------------
+# We load a pretrained MobileNetV2(alpha=0.5) classification model provided by keras.
+keras.backend.clear_session()  # Destroys the current TF graph and creates a new one.
+weights_url = ''.join(['https://github.com/JonathanCMitchell/',
+                       'mobilenet_v2_keras/releases/download/v1.1/',
+                       'mobilenet_v2_weights_tf_dim_ordering_tf_kernels_0.5_224.h5'])
+weights_file = 'mobilenet_v2_weights.h5'
+weights_path = download_testdata(weights_url, weights_file, module='keras')
+keras_mobilenet_v2 = MobileNetV2(alpha=0.5, include_top=True, weights=None,
+                                input_shape=(224, 224, 3), classes=1000)
+keras_mobilenet_v2.load_weights(weights_path)
+
+######################################################################
+# In order to test our model, here we download an image of a cat and
+# transform its format.
+img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
+img_name = 'cat.png'
+img_path = download_testdata(img_url, img_name, module='data')
+image = Image.open(img_path).resize((224, 224))
+dtype = 'float32'
+
+def transform_image(image):
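+    # subtract the per-channel mean, divide by the per-channel std,
+    # then convert HWC to CHW and add a batch dimension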
+    image = np.array(image) - np.array([123., 117., 104.])
+    image /= np.array([58.395, 57.12, 57.375])
+    image = image.transpose((2, 0, 1))
+    image = image[np.newaxis, :]
+    return image
+
+x = transform_image(image)
+
+######################################################################
+# The synset is used to transform the label from the ImageNet class index to
+# a human-readable word.
+synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
+                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',
+                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',
+                      'imagenet1000_clsid_to_human.txt'])
+synset_name = 'imagenet1000_clsid_to_human.txt'
+synset_path = download_testdata(synset_url, synset_name, module='data')
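+# the downloaded file contains a Python dict literal mapping class index to label, so eval() parses it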
+with open(synset_path) as f:
+    synset = eval(f.read())
+
+
+######################################################################
+# Compile the model with relay
+# ----------------------------
+# If we run the example on our x86 server for demonstration, we can simply
+# set the target to :code:`llvm`. If running it on the Android device, we need to
+# specify its instruction set. Set :code:`local_demo` to False if you want
+# to run this tutorial with a real device.
+
+local_demo = True
+
+# by default, the model will execute on the CPU target.
+# choose one of 'cpu', 'opencl' and 'vulkan'
+test_target = 'cpu'
+
+# Change target configuration.
+# Run `adb shell cat /proc/cpuinfo` to find the arch.
+arch = 'arm64'
+target = 'llvm -target=%s-linux-android' % arch
+target_host = None
+
+if local_demo:
+    target_host = None
+    target = 'llvm'
+elif test_target == 'opencl':
+    target_host = target
+    target = 'opencl'
+elif test_target == 'vulkan':
+    target_host = target
+    target = 'vulkan'
+
+input_name = 'input_1'
+shape_dict = {input_name: x.shape}
+mod, params = relay.frontend.from_keras(keras_mobilenet_v2, shape_dict)
+
+with relay.build_config(opt_level=3):
+    graph, lib, params = relay.build(mod, target=target,
+                                     target_host=target_host, params=params)
+
+# After `relay.build`, you will get three return values: the graph,
+# the library and the new parameters, since some optimizations performed during
+# the build change the parameters while keeping the result of the model the same.
+
+# Save the library to a local temporary directory.
+tmp = util.tempdir()
+lib_fname = tmp.relpath('net.so')
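+# for a real device (local_demo=False), the library must be cross-compiled for Android
+# with the NDK toolchain pointed to by the TVM_NDK_CC environment variable set earlier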
+fcompile = ndk.create_shared if not local_demo else None
+lib.export_library(lib_fname, fcompile)
+
+######################################################################
+# Deploy the Model Remotely by RPC
+# --------------------------------
+# With RPC, you can deploy the model from your host machine
+# to the remote Android device.
+
+tracker_host = os.environ.get('TVM_TRACKER_HOST', '0.0.0.0')
+tracker_port = int(os.environ.get('TVM_TRACKER_PORT', 9190))
+key = 'android'
+
+if local_demo:
+    remote = rpc.LocalSession()
+else:
+    tracker = rpc.connect_tracker(tracker_host, tracker_port)
+    # When running a heavy model, we should increase the `session_timeout`
+    remote = tracker.request(key, priority=0,
+                             session_timeout=60)
+
+if local_demo:
+    ctx = remote.cpu(0)
+elif test_target == 'opencl':
+    ctx = remote.cl(0)
+elif test_target == 'vulkan':
+    ctx = remote.vulkan(0)
+else:
+    ctx = remote.cpu(0)
+
+# upload the library to remote device and load it
+remote.upload(lib_fname)
+rlib = remote.load_module('net.so')
+
+# create the remote runtime module
+module = runtime.create(graph, rlib, ctx)
+
+######################################################################
+# Execute on TVM
+# --------------
+
+# set parameters (this uploads the params to the remote device and may take a while)
+module.set_input(**params)
+# set input data
+module.set_input(input_name, tvm.nd.array(x.astype(dtype)))
+# run
+module.run()
+# get output
+out = module.get_output(0)
+
+# get top1 result
+top1 = np.argmax(out.asnumpy())
+print('TVM prediction top-1: {}'.format(synset[top1]))
+
+print('Evaluate inference time cost...')
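+# time the whole graph: one run per measurement, repeated 10 times, reported as mean/std in ms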
+ftimer = module.module.time_evaluator('run', ctx, number=1, repeat=10)
+prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
+print('Mean inference time (std dev): %.2f ms (%.2f ms)' % (np.mean(prof_res),
+                                                            np.std(prof_res)))
+
+######################################################################
+# Sample Output
+# -------------
+# The following is the result of 'cpu', 'opencl' and 'vulkan' using an Adreno 530 on a Snapdragon 820.
+#
+# Although we can run on a GPU, it is slower than the CPU.
+# To speed up, we need to write and optimize the schedule according to the GPU architecture.
+#
+# .. code-block:: bash
+#
+#    # cpu
+#    TVM prediction top-1: tiger cat
+#    Evaluate inference time cost...
+#    Mean inference time (std dev): 37.92 ms (19.67 ms)
+#
+#    # opencl
+#    TVM prediction top-1: tiger cat
+#    Evaluate inference time cost...
+#    Mean inference time (std dev): 419.83 ms (7.49 ms)
+#
+#    # vulkan
+#    TVM prediction top-1: tiger cat
+#    Evaluate inference time cost...
+#    Mean inference time (std dev): 465.80 ms (4.52 ms)
diff --git a/docs/_downloads/39e437b36375e97c049f64073eade7a6/relay_quick_start.py b/docs/_downloads/39e437b36375e97c049f64073eade7a6/relay_quick_start.py
new file mode 100644
index 0000000..b2174a0
--- /dev/null
+++ b/docs/_downloads/39e437b36375e97c049f64073eade7a6/relay_quick_start.py
@@ -0,0 +1,160 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+.. _tutorial-relay-quick-start:
+
+Quick Start Tutorial for Compiling Deep Learning Models
+=======================================================
+**Author**: `Yao Wang <https://github.com/kevinthesun>`_, `Truman Tian <https://github.com/SiNZeRo>`_
+
+This example shows how to build a neural network with the Relay python frontend and
+generate a runtime library for an Nvidia GPU with TVM.
+Notice that you need to build TVM with cuda and llvm enabled.
+"""
+
+######################################################################
+# Overview for Supported Hardware Backend of TVM
+# ----------------------------------------------
+# The image below shows the hardware backends currently supported by TVM:
+#
+# .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/tvm_support_list.png
+#      :align: center
+#
+# In this tutorial, we'll choose cuda and llvm as target backends.
+# To begin with, let's import Relay and TVM.
+
+import numpy as np
+
+from tvm import relay
+from tvm.relay import testing
+import tvm
+from tvm import te
+from tvm.contrib import graph_runtime
+
+######################################################################
+# Define Neural Network in Relay
+# ------------------------------
+# First, let's define a neural network with the Relay python frontend.
+# For simplicity, we'll use the pre-defined resnet-18 network in Relay.
+# Parameters are initialized with the Xavier initializer.
+# Relay also supports other model formats such as MXNet, CoreML, ONNX and
+# Tensorflow.
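+#
+# For instance, an ONNX model could be imported in a similar way (a minimal,
+# hypothetical sketch; it assumes a local ``model.onnx`` file and an input
+# named ``"input"``, neither of which is part of this tutorial):
+#
+# .. code-block:: python
+#
+#    import onnx
+#    from tvm import relay
+#
+#    onnx_model = onnx.load("model.onnx")
+#    shape_dict = {"input": (1, 3, 224, 224)}
+#    mod, params = relay.frontend.from_onnx(onnx_model, shape=shape_dict)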
+#
+# In this tutorial, we assume we will do inference on our device
+# and the batch size is set to be 1. Input images are RGB color
+# images of size 224 * 224. We can call the :any:`tvm.relay.TupleWrapper.astext()`
+# to show the network structure.
+
+batch_size = 1
+num_class = 1000
+image_shape = (3, 224, 224)
+data_shape = (batch_size,) + image_shape
+out_shape = (batch_size, num_class)
+
+mod, params = relay.testing.resnet.get_workload(
+    num_layers=18, batch_size=batch_size, image_shape=image_shape)
+
+# set show_meta_data=True if you want to show meta data
+print(mod.astext(show_meta_data=False))
+
+######################################################################
+# Compilation
+# -----------
+# The next step is to compile the model using the Relay/TVM pipeline.
+# Users can specify the optimization level of the compilation.
+# Currently this value can be 0 to 3. The optimization passes include
+# operator fusion, pre-computation, layout transformation and so on.
+#
+# :py:func:`relay.build` returns three components: the execution graph in
+# json format, the TVM module library of compiled functions specifically
+# for this graph on the target hardware, and the parameter blobs of
+# the model. During the compilation, Relay does the graph-level
+# optimization while TVM does the tensor-level optimization, resulting
+# in an optimized runtime module for model serving.
+#
+# We'll first compile for an Nvidia GPU. Behind the scenes, :py:func:`relay.build`
+# first does a number of graph-level optimizations, e.g. pruning, fusing, etc.,
+# then registers the operators (i.e. the nodes of the optimized graphs) to
+# TVM implementations to generate a `tvm.module`.
+# To generate the module library, TVM will first lower the high-level IR
+# into the intrinsic IR of the specified target backend, which is CUDA
+# in this example. Then the machine code will be generated as the module library.
+
+opt_level = 3
+target = tvm.target.cuda()
+with relay.build_config(opt_level=opt_level):
+    graph, lib, params = relay.build(mod, target, params=params)
+
+#####################################################################
+# Run the generated library
+# ------------------------
+# Now we can create graph runtime and run the module on Nvidia GPU.
+
+# create random input
+ctx = tvm.gpu()
+data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
+# create module
+module = graph_runtime.create(graph, lib, ctx)
+# set input and parameters
+module.set_input("data", data)
+module.set_input(**params)
+# run
+module.run()
+# get output
+out = module.get_output(0, tvm.nd.empty(out_shape)).asnumpy()
+
+# Print first 10 elements of output
+print(out.flatten()[0:10])
+
+######################################################################
+# Save and Load Compiled Module
+# -----------------------------
+# We can also save the graph, lib and parameters into files and load them
+# back in a deployment environment.
+
+####################################################
+
+# save the graph, lib and params into separate files
+from tvm.contrib import util
+
+temp = util.tempdir()
+path_lib = temp.relpath("deploy_lib.tar")
+lib.export_library(path_lib)
+with open(temp.relpath("deploy_graph.json"), "w") as fo:
+    fo.write(graph)
+with open(temp.relpath("deploy_param.params"), "wb") as fo:
+    fo.write(relay.save_param_dict(params))
+print(temp.listdir())
+
+####################################################
+
+# load the module back.
+loaded_json = open(temp.relpath("deploy_graph.json")).read()
+loaded_lib = tvm.runtime.load_module(path_lib)
+loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read())
+input_data = tvm.nd.array(np.random.uniform(size=data_shape).astype("float32"))
+
+module = graph_runtime.create(loaded_json, loaded_lib, ctx)
+module.load_params(loaded_params)
+module.run(data=input_data)
+out_deploy = module.get_output(0).asnumpy()
+
+# Print first 10 elements of output
+print(out_deploy.flatten()[0:10])
+
+# check whether the output from the deployed module is consistent with the original one
+tvm.testing.assert_allclose(out_deploy, out, atol=1e-3)
diff --git a/docs/_downloads/440add54bfa6dfb4fa9ed5037187aa4c/opt_gemm.ipynb b/docs/_downloads/440add54bfa6dfb4fa9ed5037187aa4c/opt_gemm.ipynb
new file mode 100644
index 0000000..78de83b
--- /dev/null
+++ b/docs/_downloads/440add54bfa6dfb4fa9ed5037187aa4c/opt_gemm.ipynb
@@ -0,0 +1,309 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n\nHow to optimize GEMM on CPU\n===========================\n**Author**: `Jian Weng <https://github.com/were>`_,             `Ruofei Yu <https://github.com/yuruofeifei>`_\n\n(TL;DR) TVM provides abstract interfaces which allows users to depict an algorithm and the\nalgorithm's implementing organization (the so-called schedule) separately. Typically, writing\nalgorithm in high-performance schedule breaks the algorithm's readability and modularity. Also,\ntrying various seemingly [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Preparation and Baseline\n------------------------\nIn this tutorial, we will demo how to use TVM to optimize matrix multiplication.\nBefore actually demonstrating, we first define these variables.\nThen we write a baseline implementation, the simplest way to write a matrix multiplication in TVM.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import tvm\nfrom tvm import te\nimport numpy\nimport timeit\n\n# The size of the matrix\n# (M, K) x (K, N)\n# You are free to try out different shapes, sometimes TVM optimization outperforms numpy with MKL.\nM = 1024\nK = 1024\nN = 1024\n\n# The default tensor type in tvm\ndtype = \"float32\"\n\n# using Intel AVX2(Advanced Vector Extensions) ISA for SIMD\n# To get the best performance, please change the following line\n# to llvm -mcpu=core-avx2, or specific type of CPU you use\n [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "In TVM, we can always inspect lower level IR to debug or optimize our schedule.\nHere is the generated IR using our baseline schedule.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "print(tvm.lower(s, [A, B, C], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Blocking\n--------\nA important trick to enhance the cache hit rate is blocking --- data chunk will be computed\nblock by block. The memory access inside the block is a small neighbourhood which is with high\nmemory locality. In this tutorial, I picked up 32 as the blocking factor. So the block will\nfill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB (L1 data cache)\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "bn = 32\ns = te.create_schedule(C.op)\n\n# Blocking by loop tiling\nxo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)\nk, = s[C].op.reduce_axis\nko, ki = s[C].split(k, factor=4)\n\n# Hoist reduction domain outside the blocking loop\ns[C].reorder(xo, yo, ko, ki, xi, yi)\n\nfunc = tvm.build(s, [A, B, C], target=target, name='mmult')\nassert func\n\nc = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)\nfunc(a, b, c)\ntvm.testing.assert_allclose(c.asnumpy(), answer [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Here is the generated IR after blocking.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "print(tvm.lower(s, [A, B, C], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Vectorization\n-------------\nAnother important trick is vectorization. When the memory access pattern is uniform,\nthe compiler can detect this pattern and pass the continuous memory to vector processor. In TVM,\nwe can use `vectorize` interface to hint the compiler this pattern, so that we can accelerate it vastly.\n\nIn this tutorial, we chose to vectorize the inner loop row data since it is cache friendly.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "s = te.create_schedule(C.op)\nxo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)\nk, = s[C].op.reduce_axis\nko, ki = s[C].split(k, factor=4)\n\ns[C].reorder(xo, yo, ko, ki, xi, yi)\n\n# Vectorization\ns[C].vectorize(yi)\n\nfunc = tvm.build(s, [A, B, C], target=target, name='mmult')\nassert func\n\nc = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)\nfunc(a, b, c)\ntvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)\n\nevaluator = func.time_evaluator(fun [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Here is the generated IR after vectorization.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "print(tvm.lower(s, [A, B, C], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Loop Permutation\n----------------\nIf we look at the above IR, we can see the inner loop row data is vectorized and\nB is transformed into PackedB. The traversal of PackedB is sequential now.\nSo we will look at the access pattern of A. In current schedule, A is accessed column by column\nwhich is not cache friendly. If we change the nested loop order of ki and inner axes xi,\nthe access pattern for A matrix is more cache friendly.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "s = te.create_schedule(C.op)\nxo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)\nk, = s[C].op.reduce_axis\nko, ki = s[C].split(k, factor=4)\n\n# re-ordering\ns[C].reorder(xo, yo, ko, xi, ki, yi)\ns[C].vectorize(yi)\n\nfunc = tvm.build(s, [A, B, C], target=target, name='mmult')\nassert func\n\nc = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)\nfunc(a, b, c)\ntvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)\n\nevaluator = func.time_evaluator(func.en [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Here is the generated IR after loop permutation.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "print(tvm.lower(s, [A, B, C], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Array Packing\n-------------\nAnother important trick is array packing. This trick is to reorder the storage dimension of the\narray to convert the continuous access pattern on certain dimension to a sequential pattern after\nflattening.\n\n![](https://github.com/dmlc/web-data/raw/master/tvm/tutorial/array-packing.png)\n\n     :align: center\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Just as it is shown in the figure above, after blocking the computations, we can observe the array\naccess pattern of B (after flattening), which is regular but discontinuous. We expect that after\nsome transformation we can get continuous access pattern. We can reorder a [16][16] array to\na [16/4][16][4] array, so that the access pattern of B will be sequential when grabing\nthe corresponding value from the packed array.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# We have to re-write the algorithm slightly.\npackedB = te.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name='packedB')\nC = te.compute((M, N),\n                lambda x, y: te.sum(A[x, k] * packedB[y // bn, k, tvm.tir.indexmod(y, bn)], axis=k),\n                name = 'C')\n\ns = te.create_schedule(C.op)\n\nxo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)\nk, = s[C].op.reduce_axis\nko, ki = s[C].split(k, factor=4)\n\ns[C].reorder(xo, yo, ko, xi, ki, [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Here is the generated IR after array packing.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "print(tvm.lower(s, [A, B, C], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Write cache for blocks\n----------------------\nAfter blocking, the program will write result to C block by block, the access pattern\nis not sequential. So we can use a sequential cache array to hold the block results and\nwrite to C when all the block results are ready.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "s = te.create_schedule(C.op)\n\n# Allocate write cache\nCC = s.cache_write(C, 'global')\n\nxo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)\n\n# Write cache is computed at yo\ns[CC].compute_at(s[C], yo)\n\n# New inner axes\nxc, yc = s[CC].op.axis\n\nk, = s[CC].op.reduce_axis\nko, ki = s[CC].split(k, factor=4)\ns[CC].reorder(ko, xc, ki, yc)\ns[CC].unroll(ki)\ns[CC].vectorize(yc)\n\nx, y, z = s[packedB].op.axis\ns[packedB].vectorize(z)\ns[packedB].parallel(x)\n\nfunc [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Here is the generated IR after blocking.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "print(tvm.lower(s, [A, B, C], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Parallel\n--------\nFuthermore, we can also utilize multi-core processors to do the thread-level parallelization.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "s = te.create_schedule(C.op)\n\nCC = s.cache_write(C, 'global')\n\nxo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)\n\ns[CC].compute_at(s[C], yo)\n\nxc, yc = s[CC].op.axis\n\nk, = s[CC].op.reduce_axis\nko, ki = s[CC].split(k, factor=4)\ns[CC].reorder(ko, xc, ki, yc)\ns[CC].unroll(ki)\ns[CC].vectorize(yc)\n\n# parallel\ns[C].parallel(xo)\n\nx, y, z = s[packedB].op.axis\ns[packedB].vectorize(z)\ns[packedB].parallel(x)\n\nfunc = tvm.build(s, [A, B, C], target=target,  [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Here is the generated IR after parallelization.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "print(tvm.lower(s, [A, B, C], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Summary\n-------\nAfter applying the above simple optimizations with only 18 lines of code,\nour generated code can achieve 60% of the `numpy` performance with MKL.\nNote that the outputs on the web page reflect the running times on a non-exclusive\nDocker container, thereby they are *unreliable*. It is highly encouraged to run the\ntutorial by yourself to observe the performance gain acheived by TVM.\n\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/48779ddff800bd9d4b8b3bd7ef8f054c/using_external_lib.py b/docs/_downloads/48779ddff800bd9d4b8b3bd7ef8f054c/using_external_lib.py
new file mode 100644
index 0000000..7063c0e
--- /dev/null
+++ b/docs/_downloads/48779ddff800bd9d4b8b3bd7ef8f054c/using_external_lib.py
@@ -0,0 +1,561 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Using External Libraries in Relay
+=================================
+**Author**: `Masahiro Masuda <https://github.com/masahi>`_, `Truman Tian <https://github.com/SiNZeRo>`_
+
+This is a short tutorial on how to use external libraries such as cuDNN or cuBLAS with Relay.
+
+Relay uses TVM internally to generate target-specific code. For example, with the cuda backend TVM generates cuda kernels for all layers in the user-provided network.
+But sometimes it is also helpful to incorporate external libraries developed by various vendors into Relay.
+Luckily, TVM has a mechanism to transparently call into these libraries.
+For Relay users, all we need to do is set the target string appropriately.
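+
+For example, offloading supported operators to cuDNN only requires changing the
+target string (a minimal sketch of the idea; the complete workflow appears later
+in this tutorial):
+
+.. code-block:: python
+
+   # "cuda" uses TVM-generated kernels, while "cuda -libs=cudnn" lets Relay call
+   # into cuDNN for the operators it supports, such as convolution.
+   target = "cuda -libs=cudnn"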
+
+Before we can use external libraries from Relay, TVM needs to be built with the libraries you want to use.
+For example, to use cuDNN, the USE_CUDNN option in `cmake/config.cmake` needs to be enabled, and the cuDNN include and library directories need to be specified if necessary.
+
+To begin with, we import Relay and TVM.
+"""
+import tvm
+from tvm import te
+import numpy as np
+from tvm.contrib import graph_runtime as runtime
+from tvm import relay
+from tvm.relay import testing
+
+######################################################################
+# Create a simple network
+# -----------------------
+# Let's create a very simple network for demonstration.
+# It consists of convolution, batch normalization, and ReLU activation.
+
+out_channels = 16
+batch_size = 1
+
+data = relay.var("data", relay.TensorType((batch_size, 3, 224, 224), "float32"))
+weight = relay.var("weight")
+bn_gamma = relay.var("bn_gamma")
+bn_beta = relay.var("bn_beta")
+bn_mmean = relay.var("bn_mean")
+bn_mvar = relay.var("bn_var")
+
+simple_net = relay.nn.conv2d(data=data, weight=weight, kernel_size=(3,3), channels=out_channels, padding=(1, 1))
+simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, bn_mvar)[0]
+simple_net = relay.nn.relu(simple_net)
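+# wrap the expression into a Relay Function, using its free variables (data, weight, bn params) as parameters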
+simple_net = relay.Function(relay.analysis.free_vars(simple_net), simple_net)
+
+data_shape = (batch_size, 3, 224, 224)
+net, params = testing.create_workload(simple_net)
+
+######################################################################
+# Build and run with cuda backend
+# -------------------------------
+# We build and run this network with the cuda backend, as usual.
+# By setting the logging level to DEBUG, the result of Relay graph compilation will be dumped as pseudo code.
+import logging
+logging.basicConfig(level=logging.DEBUG) # to dump TVM IR after fusion
+
+target = "cuda"
+graph, lib, params = relay.build_module.build(
+    net, target, params=params)
+
+ctx = tvm.context(target, 0)
+data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
+module = runtime.create(graph, lib, ctx)
+module.set_input(**params)
+module.set_input("data", data)
+module.run()
+out_shape = (batch_size, out_channels, 224, 224)
+out = module.get_output(0, tvm.nd.empty(out_shape))
+out_cuda = out.asnumpy()
+######################################################################
+# The generated pseudo code should look something like below.
+# Note how bias add, batch normalization, and ReLU activation are fused into the convolution kernel.
+# TVM generates a single, fused kernel from this representation.
+#
+# .. code-block:: text
+#
+#       produce tensor {
+#         // attr [iter_var(blockIdx.z, , blockIdx.z)] thread_extent = 1
+#         // attr [compute] storage_scope = "local"
+#         allocate compute[float32 * 32]
+#         // attr [pad_temp.shared] storage_scope = "shared"
+#         allocate pad_temp.shared[float32 * 180]
+#         // attr [placeholder.shared] storage_scope = "shared"
+#         allocate placeholder.shared[float32 * 144]
+#         // attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = 28
+#         // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 14
+#         // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 4
+#         // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 1
+#         // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 16
+#         produce compute {
+#           compute[0] = 0.000000f
+#           compute[1] = 0.000000f
+#           compute[2] = 0.000000f
+#           compute[3] = 0.000000f
+#           compute[4] = 0.000000f
+#           compute[5] = 0.000000f
+#           compute[6] = 0.000000f
+#           compute[7] = 0.000000f
+#           compute[8] = 0.000000f
+#           compute[9] = 0.000000f
+#           compute[10] = 0.000000f
+#           compute[11] = 0.000000f
+#           compute[12] = 0.000000f
+#           compute[13] = 0.000000f
+#           compute[14] = 0.000000f
+#           compute[15] = 0.000000f
+#           compute[16] = 0.000000f
+#           compute[17] = 0.000000f
+#           compute[18] = 0.000000f
+#           compute[19] = 0.000000f
+#           compute[20] = 0.000000f
+#           compute[21] = 0.000000f
+#           compute[22] = 0.000000f
+#           compute[23] = 0.000000f
+#           compute[24] = 0.000000f
+#           compute[25] = 0.000000f
+#           compute[26] = 0.000000f
+#           compute[27] = 0.000000f
+#           compute[28] = 0.000000f
+#           compute[29] = 0.000000f
+#           compute[30] = 0.000000f
+#           compute[31] = 0.000000f
+#           for (rc.outer, 0, 3) {
+#             produce pad_temp.shared {
+#               // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 4
+#               // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 1
+#               // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 16
+#               if (likely(((threadIdx.z*15) < (60 - threadIdx.x)))) {
+#                 if (likely((threadIdx.x < 15))) {
+#                   pad_temp.shared[(((((threadIdx.z*15) + threadIdx.x)/60)*180) + ((((((threadIdx.z*15) + threadIdx.x)/6) % 10)*18) + ((((threadIdx.z*3) + threadIdx.x)*3) % 18)))] = tvm_if_then_else((((((1 - ((((threadIdx.z*15) + threadIdx.x)/6) % 10)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < (225 - ((((threadIdx.z*15) + threadIdx.x)/6) % 10)))) && ((1 - ((((threadIdx.z*3) + threadIdx.x)*3) % 18)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (225 - ((((threadIdx.z*3) + threadIdx.x)*3) % 18 [...]
+#                   pad_temp.shared[(((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/180)*180) + ((((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/18) % 10)*18) + (((((threadIdx.z*3) + threadIdx.x)*3) + 1) % 18)))] = tvm_if_then_else((((((1 - ((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/18) % 10)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < (225 - ((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/18) % 10)))) && ((1 - (((((threadIdx.z*3) + threadIdx.x)*3) + 1) % 18)) <= (blockIdx.x*16))) && ((blockIdx. [...]
+#                   pad_temp.shared[(((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/180)*180) + ((((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/18) % 10)*18) + (((((threadIdx.z*3) + threadIdx.x)*3) + 2) % 18)))] = tvm_if_then_else((((((1 - ((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/18) % 10)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < (225 - ((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/18) % 10)))) && ((1 - (((((threadIdx.z*3) + threadIdx.x)*3) + 2) % 18)) <= (blockIdx.x*16))) && ((blockIdx. [...]
+#                 }
+#               }
+#             }
+#             produce placeholder.shared {
+#               // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 4
+#               // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 1
+#               // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 16
+#               if (likely(((threadIdx.z*4) < (16 - (threadIdx.x/3))))) {
+#                 if (likely(((threadIdx.z*12) < (48 - threadIdx.x)))) {
+#                   if (likely((threadIdx.x < 12))) {
+#                     placeholder.shared[(((((threadIdx.z*4) + (threadIdx.x/3))*3) + (threadIdx.x % 3))*3)] = placeholder[(((((rc.outer + (threadIdx.z*12)) + ((threadIdx.x/3)*3))*3) + (threadIdx.x % 3))*3)]
+#                     placeholder.shared[((((((threadIdx.z*4) + (threadIdx.x/3))*3) + (threadIdx.x % 3))*3) + 1)] = placeholder[((((((rc.outer + (threadIdx.z*12)) + ((threadIdx.x/3)*3))*3) + (threadIdx.x % 3))*3) + 1)]
+#                     placeholder.shared[((((((threadIdx.z*4) + (threadIdx.x/3))*3) + (threadIdx.x % 3))*3) + 2)] = placeholder[((((((rc.outer + (threadIdx.z*12)) + ((threadIdx.x/3)*3))*3) + (threadIdx.x % 3))*3) + 2)]
+#                   }
+#                 }
+#               }
+#             }
+#             compute[0] = (compute[0] + (pad_temp.shared[threadIdx.x]*placeholder.shared[(threadIdx.z*36)]))
+#             compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[(threadIdx.z*36)]))
+#             compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[(threadIdx.z*36)]))
+#             compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[(threadIdx.z*36)]))
+#             compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[(threadIdx.z*36)]))
+#             compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[(threadIdx.z*36)]))
+#             compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[(threadIdx.z*36)]))
+#             compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[(threadIdx.z*36)]))
+#             compute[8] = (compute[8] + (pad_temp.shared[threadIdx.x]*placeholder.shared[((threadIdx.z*36) + 9)]))
+#             compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 9)]))
+#             compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 9)]))
+#             compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 9)]))
+#             compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 9)]))
+#             compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 9)]))
+#             compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 9)]))
+#             compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 9)]))
+#             compute[16] = (compute[16] + (pad_temp.shared[threadIdx.x]*placeholder.shared[((threadIdx.z*36) + 18)]))
+#             compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 18)]))
+#             compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 18)]))
+#             compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 18)]))
+#             compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 18)]))
+#             compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 18)]))
+#             compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 18)]))
+#             compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 18)]))
+#             compute[24] = (compute[24] + (pad_temp.shared[threadIdx.x]*placeholder.shared[((threadIdx.z*36) + 27)]))
+#             compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 27)]))
+#             compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 27)]))
+#             compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 27)]))
+#             compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 27)]))
+#             compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 27)]))
+#             compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 27)]))
+#             compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 27)]))
+#             compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 1)]*placeholder.shared[((threadIdx.z*36) + 1)]))
+#             compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 1)]))
+#             compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 1)]))
+#             compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 1)]))
+#             compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 1)]))
+#             compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 1)]))
+#             compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 1)]))
+#             compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 1)]))
+#             compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 1)]*placeholder.shared[((threadIdx.z*36) + 10)]))
+#             compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 10)]))
+#             compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 10)]))
+#             compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 10)]))
+#             compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 10)]))
+#             compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 10)]))
+#             compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 10)]))
+#             compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 10)]))
+#             compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 1)]*placeholder.shared[((threadIdx.z*36) + 19)]))
+#             compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 19)]))
+#             compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 19)]))
+#             compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 19)]))
+#             compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 19)]))
+#             compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 19)]))
+#             compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 19)]))
+#             compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 19)]))
+#             compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 1)]*placeholder.shared[((threadIdx.z*36) + 28)]))
+#             compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 28)]))
+#             compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 28)]))
+#             compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 28)]))
+#             compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 28)]))
+#             compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 28)]))
+#             compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 28)]))
+#             compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 28)]))
+#             compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 2)]*placeholder.shared[((threadIdx.z*36) + 2)]))
+#             compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 2)]))
+#             compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 2)]))
+#             compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 2)]))
+#             compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 2)]))
+#             compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 2)]))
+#             compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 2)]))
+#             compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 2)]))
+#             compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 2)]*placeholder.shared[((threadIdx.z*36) + 11)]))
+#             compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 11)]))
+#             compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 11)]))
+#             compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 11)]))
+#             compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 11)]))
+#             compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 11)]))
+#             compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 11)]))
+#             compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 11)]))
+#             compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 2)]*placeholder.shared[((threadIdx.z*36) + 20)]))
+#             compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 20)]))
+#             compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 20)]))
+#             compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 20)]))
+#             compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 20)]))
+#             compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 20)]))
+#             compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 20)]))
+#             compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 20)]))
+#             compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 2)]*placeholder.shared[((threadIdx.z*36) + 29)]))
+#             compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 29)]))
+#             compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 29)]))
+#             compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 29)]))
+#             compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 29)]))
+#             compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 29)]))
+#             compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 29)]))
+#             compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 29)]))
+#             compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 3)]))
+#             compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 3)]))
+#             compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 3)]))
+#             compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 3)]))
+#             compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 3)]))
+#             compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 3)]))
+#             compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 3)]))
+#             compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 3)]))
+#             compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 12)]))
+#             compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 12)]))
+#             compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 12)]))
+#             compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 12)]))
+#             compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 12)]))
+#             compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 12)]))
+#             compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 12)]))
+#             compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 12)]))
+#             compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 21)]))
+#             compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 21)]))
+#             compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 21)]))
+#             compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 21)]))
+#             compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 21)]))
+#             compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 21)]))
+#             compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 21)]))
+#             compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 21)]))
+#             compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 30)]))
+#             compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 30)]))
+#             compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 30)]))
+#             compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 30)]))
+#             compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 30)]))
+#             compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 30)]))
+#             compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 30)]))
+#             compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 30)]))
+#             compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 4)]))
+#             compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 4)]))
+#             compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 4)]))
+#             compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 4)]))
+#             compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 4)]))
+#             compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 4)]))
+#             compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 4)]))
+#             compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 4)]))
+#             compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 13)]))
+#             compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 13)]))
+#             compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 13)]))
+#             compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 13)]))
+#             compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 13)]))
+#             compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 13)]))
+#             compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 13)]))
+#             compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 13)]))
+#             compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 22)]))
+#             compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 22)]))
+#             compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 22)]))
+#             compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 22)]))
+#             compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 22)]))
+#             compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 22)]))
+#             compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 22)]))
+#             compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 22)]))
+#             compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 31)]))
+#             compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 31)]))
+#             compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 31)]))
+#             compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 31)]))
+#             compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 31)]))
+#             compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 31)]))
+#             compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 31)]))
+#             compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 31)]))
+#             compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 5)]))
+#             compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 5)]))
+#             compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 5)]))
+#             compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 5)]))
+#             compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 5)]))
+#             compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 5)]))
+#             compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 5)]))
+#             compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 5)]))
+#             compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 14)]))
+#             compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 14)]))
+#             compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 14)]))
+#             compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 14)]))
+#             compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 14)]))
+#             compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 14)]))
+#             compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 14)]))
+#             compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 14)]))
+#             compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 23)]))
+#             compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 23)]))
+#             compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 23)]))
+#             compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 23)]))
+#             compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 23)]))
+#             compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 23)]))
+#             compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 23)]))
+#             compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 23)]))
+#             compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 32)]))
+#             compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 32)]))
+#             compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 32)]))
+#             compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 32)]))
+#             compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 32)]))
+#             compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 32)]))
+#             compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 32)]))
+#             compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 32)]))
+#             compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 6)]))
+#             compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 6)]))
+#             compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 6)]))
+#             compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 6)]))
+#             compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 6)]))
+#             compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 6)]))
+#             compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 6)]))
+#             compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 162)]*placeholder.shared[((threadIdx.z*36) + 6)]))
+#             compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 15)]))
+#             compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 15)]))
+#             compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 15)]))
+#             compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 15)]))
+#             compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 15)]))
+#             compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 15)]))
+#             compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 15)]))
+#             compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 162)]*placeholder.shared[((threadIdx.z*36) + 15)]))
+#             compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 24)]))
+#             compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 24)]))
+#             compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 24)]))
+#             compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 24)]))
+#             compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 24)]))
+#             compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 24)]))
+#             compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 24)]))
+#             compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 162)]*placeholder.shared[((threadIdx.z*36) + 24)]))
+#             compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 33)]))
+#             compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 33)]))
+#             compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 33)]))
+#             compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 33)]))
+#             compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 33)]))
+#             compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 33)]))
+#             compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 33)]))
+#             compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 162)]*placeholder.shared[((threadIdx.z*36) + 33)]))
+#             compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 7)]))
+#             compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 7)]))
+#             compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 7)]))
+#             compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 7)]))
+#             compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 7)]))
+#             compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 7)]))
+#             compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 7)]))
+#             compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 163)]*placeholder.shared[((threadIdx.z*36) + 7)]))
+#             compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 16)]))
+#             compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 16)]))
+#             compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 16)]))
+#             compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 16)]))
+#             compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 16)]))
+#             compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 16)]))
+#             compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 16)]))
+#             compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 163)]*placeholder.shared[((threadIdx.z*36) + 16)]))
+#             compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 25)]))
+#             compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 25)]))
+#             compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 25)]))
+#             compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 25)]))
+#             compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 25)]))
+#             compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 25)]))
+#             compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 25)]))
+#             compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 163)]*placeholder.shared[((threadIdx.z*36) + 25)]))
+#             compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 34)]))
+#             compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 34)]))
+#             compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 34)]))
+#             compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 34)]))
+#             compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 34)]))
+#             compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 34)]))
+#             compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 34)]))
+#             compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 163)]*placeholder.shared[((threadIdx.z*36) + 34)]))
+#             compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 8)]))
+#             compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 8)]))
+#             compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 8)]))
+#             compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 8)]))
+#             compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 8)]))
+#             compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 8)]))
+#             compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 8)]))
+#             compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 164)]*placeholder.shared[((threadIdx.z*36) + 8)]))
+#             compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 17)]))
+#             compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 17)]))
+#             compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 17)]))
+#             compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 17)]))
+#             compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 17)]))
+#             compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 17)]))
+#             compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 17)]))
+#             compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 164)]*placeholder.shared[((threadIdx.z*36) + 17)]))
+#             compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 26)]))
+#             compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 26)]))
+#             compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 26)]))
+#             compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 26)]))
+#             compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 26)]))
+#             compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 26)]))
+#             compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 26)]))
+#             compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 164)]*placeholder.shared[((threadIdx.z*36) + 26)]))
+#             compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 35)]))
+#             compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 35)]))
+#             compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 35)]))
+#             compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 35)]))
+#             compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 35)]))
+#             compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 35)]))
+#             compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 35)]))
+#             compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 164)]*placeholder.shared[((threadIdx.z*36) + 35)]))
+#           }
+#         }
+#         tensor[(((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x)] = max(((compute[0]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 224)] = max(((compute[1]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 448)] = max(((compute[2]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 672)] = max(((compute[3]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 896)] = max(((compute[4]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 1120)] = max(((compute[5]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 1344)] = max(((compute[6]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 1568)] = max(((compute[7]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 50176)] = max(((compute[8]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 50400)] = max(((compute[9]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 50624)] = max(((compute[10]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 50848)] = max(((compute[11]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 51072)] = max(((compute[12]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 51296)] = max(((compute[13]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 51520)] = max(((compute[14]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 51744)] = max(((compute[15]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 100352)] = max(((compute[16]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 100576)] = max(((compute[17]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 100800)] = max(((compute[18]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 101024)] = max(((compute[19]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 101248)] = max(((compute[20]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 101472)] = max(((compute[21]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 101696)] = max(((compute[22]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 101920)] = max(((compute[23]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 150528)] = max(((compute[24]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 150752)] = max(((compute[25]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 150976)] = max(((compute[26]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 151200)] = max(((compute[27]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 151424)] = max(((compute[28]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 151648)] = max(((compute[29]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 151872)] = max(((compute[30]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f)
+#         tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 152096)] = max(((compute[31]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f)
+#       }
+
+######################################################################
+# Use cuDNN for a convolutional layer
+# -----------------------------------
+# We can use cuDNN to replace our convolution kernels with cuDNN's implementations.
+# To do that, all we need to do is append the option " -libs=cudnn" to the target string.
+net, params = testing.create_workload(simple_net)
+target = "cuda -libs=cudnn" # use cudnn for convolution
+graph, lib, params = relay.build_module.build(
+        net, target, params=params)
+
+ctx = tvm.context(target, 0)
+data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
+module = runtime.create(graph, lib, ctx)
+module.set_input(**params)
+module.set_input("data", data)
+module.run()
+out_shape = (batch_size, out_channels, 224, 224)
+out = module.get_output(0, tvm.nd.empty(out_shape))
+out_cudnn = out.asnumpy()
+
+######################################################################
+# Note that if you use cuDNN, Relay cannot fuse convolution with the layers following it.
+# This is because layer fusion happens at the level of TVM's internal representation (IR).
+# Relay treats external libraries as black boxes, so there is no way to fuse them with TVM IR.
+#
+# The pseudo code below shows that cuDNN convolution + bias add + batch norm + ReLU is split into two stages of computation: one for the cuDNN call and the other for the rest of the operations.
+#
+# .. code-block:: text
+#
+#      // attr [y] storage_scope = "global"
+#      allocate y[float32 * 802816]
+#      produce y {
+#        // attr [0] extern_scope = 0
+#        tvm_call_packed("tvm.contrib.cudnn.conv2d.forward", 1, 0, 1, 1, 1, 1, 1, 1, 1, tvm_stack_make_array(placeholder, tvm_stack_make_shape(1, 3, 224, 224), 0, 4, 0.000000f, 0), tvm_stack_make_array(placeholder, tvm_stack_make_shape(16, 3, 3, 3), 0, 4, 0.000000f, 0), tvm_stack_make_array(y, tvm_stack_make_shape(1, 16, 224, 224), 0, 4, 0.000000f, 0))
+#      }
+#      produce tensor {
+#        // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 256
+#        // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 512
+#        for (ax0.ax1.fused.ax2.fused.ax3.fused.outer, 0, 7) {
+#          if (likely(((blockIdx.x*512) < ((802816 - (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072)) - threadIdx.x)))) {
+#            tensor[(((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/802816)*802816) + (((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/224) % 224)*224) + ((((blockIdx.x*64) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*32)) % 224))) + ((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/50176) % 16)*50176))] = max(((y[(((((((blockIdx.x*512) + threadIdx.x) + ( [...]
+#          }
+#        }
+#      }
+
+
+######################################################################
+# Verify the result
+# -----------------
+# We can check that the results of the two runs match.
+
+tvm.testing.assert_allclose(out_cuda, out_cudnn, rtol=1e-5)
+
+#####################################################################
+# Conclusion
+# ----------
+# This tutorial covered the usage of cuDNN with Relay.
+# We also have support for cuBLAS. If cuBLAS is enabled, it will be used inside a fully connected layer (relay.dense).
+# To use cuBLAS, set the target string to "cuda -libs=cublas".
+# You can use both cuDNN and cuBLAS with "cuda -libs=cudnn,cublas".
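+#
+# As a minimal sketch (not executed in this tutorial, reusing the imports above),
+# offloading a hypothetical fully connected layer to cuBLAS only requires changing
+# the target string:
+#
+# .. code-block:: python
+#
+#    # hypothetical example: a single dense layer built and offloaded to cuBLAS
+#    dense_data = relay.var("data", shape=(1, 512))
+#    dense_weight = relay.var("weight", shape=(128, 512))
+#    dense_out = relay.nn.dense(dense_data, dense_weight)
+#    dense_func = relay.Function(relay.analysis.free_vars(dense_out), dense_out)
+#    dense_net, dense_params = testing.create_workload(dense_func)
+#    graph, lib, params = relay.build_module.build(
+#            dense_net, "cuda -libs=cublas", params=dense_params)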
+#
+# For ROCm backend, we have support for MIOpen and rocBLAS.
+# They can be enabled with target "rocm -libs=miopen,rocblas".
+#
+# Being able to use external libraries is great, but we need to keep a few caveats in mind.
+#
+# First, the use of external libraries may restrict your usage of TVM and Relay.
+# For example, MIOpen only supports the NCHW layout and the fp32 data type at the moment, so you cannot use other layouts or data types in TVM.
+#
+# Second, and more importantly, external libraries restrict the possibility of operator fusion during graph compilation, as shown above.
+# TVM and Relay aim to achieve the best performance on a variety of hardware, with joint operator-level and graph-level optimization.
+# To achieve this goal, we should continue developing better optimizations for TVM and Relay, while using external libraries as a way to fall back to existing implementations when necessary.
diff --git a/docs/_downloads/48bd751ebaae08fce134e559f86a25cc/tune_relay_vta.ipynb b/docs/_downloads/48bd751ebaae08fce134e559f86a25cc/tune_relay_vta.ipynb
new file mode 100644
index 0000000..4d9ef9d
--- /dev/null
+++ b/docs/_downloads/48bd751ebaae08fce134e559f86a25cc/tune_relay_vta.ipynb
@@ -0,0 +1,186 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\nAuto-tuning a convolutional network on VTA\n==========================================\n**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_, `Thierry Moreau <https://homes.cs.washington.edu/~moreau/>`_\n\nAuto-tuning for a specific accelerator design is critical for getting the best\nperformance for any given operator. This is a tutorial showcases how to tune a\nwhole convolutional network on VTA.\n\nThe operator implementation for VTA in TVM is written in template fo [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Install dependencies\n--------------------\nTo use the autotvm package in tvm, we need to install some extra dependencies.\n(change \"3\" to \"2\" if you use python2):\n\n.. code-block:: bash\n\n  pip3 install --user psutil xgboost tornado mxnet requests \"Pillow<7\"\n\nTo make TVM run faster during tuning, it is recommended to use cython\nas FFI of TVM. In the root directory of TVM, execute\n(change \"3\" to \"2\" if you use python2):\n\n.. code-block:: bash\n\n  pip3 install - [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import os\nfrom mxnet.gluon.model_zoo import vision\nimport numpy as np\nfrom PIL import Image\n\nimport topi\nimport tvm\nfrom tvm import te\nfrom tvm import rpc, autotvm, relay\nfrom tvm.contrib import graph_runtime, util, download\nfrom tvm.autotvm.measure.measure_methods import request_remote\nfrom tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner\n\nimport vta\nfrom vta.testing import simulator\nfrom vta.top import graph_pack"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Compile network\n---------------\nPerform vta-specific compilation with Relay from a Gluon model\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "def compile_network(env, target, model, start_pack, stop_pack):\n\n    # Populate the shape and data type dictionary\n    dtype_dict = {\"data\": 'float32'}\n    shape_dict = {\"data\": (env.BATCH, 3, 224, 224)}\n\n    # Get off the shelf gluon model, and convert to relay\n    gluon_model = vision.get_model(model, pretrained=True)\n    mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)\n\n    # Update shape and type dictionary\n    shape_dict.update({k: v.shape for [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Start RPC Tracker\n-----------------\nTVM uses an RPC session to communicate with Pynq boards.\nDuring tuning, the tuner will send the generated code to the board and\nmeasure the speed of code on the board.\n\nTo scale up tuning, TVM uses an RPC Tracker to manage multiple devices.\nThe RPC Tracker is a centralized master node. We can register all devices to\nthe tracker. For example, if we have 10 Pynq boards, we can register all of them\nto the tracker, and run 10 measurements [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Register devices to RPC Tracker\n-----------------------------------\nNow we can register our devices to the tracker. The first step is to\nbuild the TVM runtime for the Pynq devices.\n\nFollow `vta-index`\nto build the TVM runtime on the device. Then register the device to the tracker with:\n\n.. code-block:: bash\n\n  python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=pynq\n\n(replace :code:`[HOST_IP]` with the IP address of your host machine)\n\nAfter registering de [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Set Tuning Options\n------------------\nBefore tuning, we should apply some configurations.\nHere we use an Pynq-Z1 board as an example.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Tracker host and port can be set by your environment\ntracker_host = os.environ.get(\"TVM_TRACKER_HOST\", '0.0.0.0')\ntracker_port = int(os.environ.get(\"TVM_TRACKER_PORT\", 9190))\n\n# Load VTA parameters from the vta/vta-hw/config/vta_config.json file\nenv = vta.get_env()\n\n# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.\n# Set ``device=arm_cpu`` to run inference on the CPU\n# or ``device=vta`` to run inference on the FPGA.\nd [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "<div class=\"alert alert-info\"><h4>Note</h4><p>How to set tuning options\n\n  In general, the default values provided here work well.\n  If you have enough time budget, you can set :code:`n_trial`, :code:`early_stopping`\n  to larger values, makes the tuning run for longer.\n  If your device is under-powered or your conv2d operators are large, consider\n  setting a longer timeout.</p></div>\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Begin Tuning\n------------\nNow we can extract tuning tasks from the network and begin tuning.\nHere, we provide a simple utility function to tune a list of tasks.\nThis function is just an initial implementation which tunes them in sequential order.\nWe will introduce a more sophisticated tuning scheduler in the future.\n\nGiven that the tuning will be done on Pynq FPGA boards, make sure that\nthe ```TARGET`` entry in the ``vta_config.json`` file is set to ``pynq``.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# You can skip the implementation of this function for this tutorial.\ndef tune_tasks(tasks,\n               measure_option,\n               tuner='xgb',\n               n_trial=1000,\n               early_stopping=None,\n               log_filename='tuning.log',\n               use_transfer_learning=True):\n\n    # create tmp log file\n    tmp_log_file = log_filename + \".tmp\"\n    if os.path.exists(tmp_log_file):\n        os.remove(tmp_log_file)\n\n    for i, tsk in enumerate [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Register VTA-specific tuning tasks\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "def register_vta_tuning_tasks():\n    from tvm.autotvm.task import TaskExtractEnv\n\n    @tvm.te.tag_scope(tag=topi.tag.ELEMWISE)\n    def my_clip(x, a_min, a_max):\n        \"\"\"Unlike topi's current clip, put min and max into two stages.\"\"\"\n        const_min = tvm.tir.const(a_min, x.dtype)\n        const_max = tvm.tir.const(a_max, x.dtype)\n        x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name=\"clipA\")\n        x = te.compute(x.shape, lambda *i:  [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Finally, we launch tuning jobs and evaluate the end-to-end performance.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "def tune_and_evaluate(tuning_opt):\n\n    if env.TARGET != \"sim\":\n        # Get remote from fleet node\n        remote = autotvm.measure.request_remote(env.TARGET,\n                                                tracker_host,\n                                                tracker_port,\n                                                timeout=10000)\n        # Reconfigure the JIT runtime and FPGA.\n        vta.reconfig_runtime(remote)\n        vta.program_fpga(remote, bitst [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Sample Output\n-------------\nThe tuning needs to compile many programs and extract feature from them.\nSo a high performance CPU is recommended.\nOne sample output is listed below.\nIt takes about 2 hours on a 16T CPU, and 6 Pynq boards.\n\n.. code-block:: bash\n\n   Extract tasks...\n   [Warning] Invalid shape during AutoTVM task creation\n   Extracted 10 conv2d tasks:\n       Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 16, 14, 14, 1, 16), 'int8'), ('TENSOR', (32, 16,  [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "<div class=\"alert alert-info\"><h4>Note</h4><p>**Experiencing Difficulties?**\n\n  The auto tuning module is error-prone. If you always see \" 0.00/ 0.00 GFLOPS\",\n  then there must be something wrong.\n\n  First, make sure you set the correct configuration of your device.\n  Then, you can print debug information by adding these lines in the beginning\n  of the script. It will print every measurement result, where you can find useful\n  error messages.\n\n  .. code-block:: pyt [...]
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/4cb9cb94f36033c7820ba70d890df4a1/cross_compilation_and_rpc.py b/docs/_downloads/4cb9cb94f36033c7820ba70d890df4a1/cross_compilation_and_rpc.py
new file mode 100644
index 0000000..553d77d
--- /dev/null
+++ b/docs/_downloads/4cb9cb94f36033c7820ba70d890df4a1/cross_compilation_and_rpc.py
@@ -0,0 +1,263 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+.. _tutorial-cross-compilation-and-rpc:
+
+Cross Compilation and RPC
+=========================
+**Author**: `Ziheng Jiang <https://github.com/ZihengJiang/>`_, `Lianmin Zheng <https://github.com/merrymercy/>`_
+
+This tutorial introduces cross compilation and remote device
+execution with RPC in TVM.
+
+With cross compilation and RPC, you can **compile a program on your
+local machine, then run it on the remote device**. It is useful when
+the remote device's resources are limited, as with Raspberry Pi and mobile
+platforms. In this tutorial, we will use the Raspberry Pi for a CPU example
+and the Firefly-RK3399 for an OpenCL example.
+"""
+
+######################################################################
+# Build TVM Runtime on Device
+# ---------------------------
+#
+# The first step is to build the TVM runtime on the remote device.
+#
+# .. note::
+#
+#   All instructions in both this section and the next section should be
+#   executed on the target device, e.g. Raspberry Pi.  We assume the target
+#   is running Linux.
+#
+# Since we do compilation on the local machine, the remote device is only used
+# for running the generated code. We only need to build the TVM runtime on
+# the remote device.
+#
+# .. code-block:: bash
+#
+#   git clone --recursive https://github.com/apache/incubator-tvm tvm
+#   cd tvm
+#   make runtime -j2
+#
+# After building the runtime successfully, we need to set environment variables
+# in :code:`~/.bashrc` file. We can edit :code:`~/.bashrc`
+# using :code:`vi ~/.bashrc` and add the line below (Assuming your TVM
+# directory is in :code:`~/tvm`):
+#
+# .. code-block:: bash
+#
+#   export PYTHONPATH=$PYTHONPATH:~/tvm/python
+#
+# To update the environment variables, execute :code:`source ~/.bashrc`.
+
+######################################################################
+# Set Up RPC Server on Device
+# ---------------------------
+# To start an RPC server, run the following command on your remote device
+# (which is a Raspberry Pi in this example).
+#
+#   .. code-block:: bash
+#
+#     python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9090
+#
+# If you see the line below, it means the RPC server started
+# successfully on your device.
+#
+#    .. code-block:: bash
+#
+#      INFO:root:RPCServer: bind to 0.0.0.0:9090
+#
+
+######################################################################
+# Declare and Cross Compile Kernel on Local Machine
+# -------------------------------------------------
+#
+# .. note::
+#
+#   Now we go back to the local machine, which has a full TVM installed
+#   (with LLVM).
+#
+# Here we will declare a simple kernel on the local machine:
+
+import numpy as np
+
+import tvm
+from tvm import te
+from tvm import rpc
+from tvm.contrib import util
+
+n = tvm.runtime.convert(1024)
+A = te.placeholder((n,), name='A')
+B = te.compute((n,), lambda i: A[i] + 1.0, name='B')
+s = te.create_schedule(B.op)
+
+######################################################################
+# Then we cross compile the kernel.
+# The target should be 'llvm -target=armv7l-linux-gnueabihf' for
+# Raspberry Pi 3B, but we use 'llvm' here to make this tutorial runnable
+# on our webpage building server. See the detailed note in the following block.
+
+local_demo = True
+
+if local_demo:
+    target = 'llvm'
+else:
+    target = 'llvm -target=armv7l-linux-gnueabihf'
+
+func = tvm.build(s, [A, B], target=target, name='add_one')
+# save the lib at a local temp folder
+temp = util.tempdir()
+path = temp.relpath('lib.tar')
+func.export_library(path)
+
+######################################################################
+# .. note::
+#
+#   To run this tutorial with a real remote device, change :code:`local_demo`
+#   to False and replace :code:`target` in :code:`build` with the appropriate
+#   target triple for your device. The target triple may differ from
+#   device to device. For example, it is
+#   :code:`'llvm -target=armv7l-linux-gnueabihf'` for Raspberry Pi 3B and
+#   :code:`'llvm -target=aarch64-linux-gnu'` for RK3399.
+#
+#   Usually, you can query the target triple by running :code:`gcc -v` on your
+#   device and looking for the line starting with :code:`Target:`
+#   (though it may still be a loose configuration).
+#
+#   Besides :code:`-target`, you can also set other compilation options
+#   like:
+#
+#   * -mcpu=<cpuname>
+#       Specify a specific chip in the current architecture to generate code for. By default this is inferred from the target triple and autodetected to the current architecture.
+#   * -mattr=a1,+a2,-a3,...
+#       Override or control specific attributes of the target, such as whether SIMD operations are enabled or not. The default set of attributes is set by the current CPU.
+#       To get the list of available attributes, you can do:
+#
+#       .. code-block:: bash
+#
+#         llc -mtriple=<your device target triple> -mattr=help
+#
+#   These options are consistent with `llc <http://llvm.org/docs/CommandGuide/llc.html>`_.
+#   It is recommended to set the target triple and feature set to match the
+#   specific features available on your device, so we can take full advantage
+#   of the board.
+#   You can find more details about cross compilation attributes from
+#   `LLVM guide of cross compilation <https://clang.llvm.org/docs/CrossCompilation.html>`_.
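+#
+#   For example, a full target string for a Raspberry Pi 3B might look like the
+#   sketch below (the CPU and attribute values here are illustrative; check
+#   :code:`gcc -v` and :code:`llc -mattr=help` on your own board):
+#
+#   .. code-block:: python
+#
+#     target = 'llvm -target=armv7l-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon'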
+
+######################################################################
+# Run CPU Kernel Remotely by RPC
+# ------------------------------
+# We show how to run the generated CPU kernel on the remote device.
+# First we obtain an RPC session from the remote device.
+
+if local_demo:
+    remote = rpc.LocalSession()
+else:
+    # The following is my environment; change this to the IP address of your target device
+    host = '10.77.1.162'
+    port = 9090
+    remote = rpc.connect(host, port)
+
+######################################################################
+# Upload the library to the remote device, then invoke a device-local
+# compiler to relink it. Now `func` is a remote module object.
+
+remote.upload(path)
+func = remote.load_module('lib.tar')
+
+# create arrays on the remote device
+ctx = remote.cpu()
+a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
+b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
+# the function will run on the remote device
+func(a, b)
+np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
+
+######################################################################
+# When you want to evaluate the performance of the kernel on the remote
+# device, it is important to avoid the network overhead.
+# :code:`time_evaluator` returns a remote function that runs the
+# function :code:`number` times, measures the cost per run on the remote
+# device, and returns the measured cost. Network overhead is excluded.
+
+time_f = func.time_evaluator(func.entry_name, ctx, number=10)
+cost = time_f(a, b).mean
+print('%g secs/op' % cost)
+
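+######################################################################
+# A minimal sketch of how one could also collect several repeated
+# measurements for more stable numbers, assuming the same session and arrays
+# as above (:code:`repeat` is an optional argument of :code:`time_evaluator`):
+#
+# .. code-block:: python
+#
+#    time_f = func.time_evaluator(func.entry_name, ctx, number=10, repeat=3)
+#    prof_res = time_f(a, b)
+#    # prof_res.results holds the mean of each repeat, prof_res.mean the overall mean
+#    print('%g secs/op over %d repeats' % (prof_res.mean, len(prof_res.results)))
+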
+#########################################################################
+# Run OpenCL Kernel Remotely by RPC
+# ---------------------------------
+# For remote OpenCL devices, the workflow is almost the same as above.
+# You can define the kernel, upload files, and run via RPC.
+#
+# .. note::
+#
+#    Raspberry Pi does not support OpenCL, so the following code is tested on a
+#    Firefly-RK3399. You may follow this `tutorial <https://gist.github.com/mli/585aed2cec0b5178b1a510f9f236afa2>`_
+#    to set up the OS and OpenCL driver for the RK3399.
+#
+#    We also need to build the TVM runtime with OpenCL enabled on the RK3399 board. In the TVM
+#    root directory, execute
+#
+# .. code-block:: bash
+#
+#    cp cmake/config.cmake .
+#    sed -i "s/USE_OPENCL OFF/USE_OPENCL ON/" config.cmake
+#    make runtime -j4
+#
+# The following function shows how we run an OpenCL kernel remotely.
+
+def run_opencl():
+    # NOTE: These are the settings for my RK3399 board. You need to modify
+    # them according to your environment.
+    target_host = "llvm -target=aarch64-linux-gnu"
+    opencl_device_host = '10.77.1.145'
+    opencl_device_port = 9090
+
+    # create schedule for the above "add one" compute declaration
+    s = te.create_schedule(B.op)
+    xo, xi = s[B].split(B.op.axis[0], factor=32)
+    s[B].bind(xo, te.thread_axis("blockIdx.x"))
+    s[B].bind(xi, te.thread_axis("threadIdx.x"))
+    func = tvm.build(s, [A, B], "opencl", target_host=target_host)
+
+    remote = rpc.connect(opencl_device_host, opencl_device_port)
+
+    # export and upload
+    path = temp.relpath('lib_cl.tar')
+    func.export_library(path)
+    remote.upload(path)
+    func = remote.load_module('lib_cl.tar')
+
+    # run
+    ctx = remote.cl()
+    a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
+    func(a, b)
+    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
+    print("OpenCL test passed!")
+
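+# The OpenCL example above is only defined, not executed, because the webpage
+# building server has no OpenCL device. Uncomment the following call when you
+# have a board with a working OpenCL driver and a running RPC server:
+#
+# run_opencl()
+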
+######################################################################
+# Summary
+# -------
+# This tutorial provides a walk-through of the cross compilation and RPC
+# features in TVM.
+#
+# - Set up an RPC server on the remote device.
+# - Set up the target device configuration to cross compile the kernels on the
+#   local machine.
+# - Upload and run the kernels remotely via the RPC API.
diff --git a/docs/_downloads/4e9540fc014621d8d3bd14869c1ab227/scan.ipynb b/docs/_downloads/4e9540fc014621d8d3bd14869c1ab227/scan.ipynb
new file mode 100644
index 0000000..f417386
--- /dev/null
+++ b/docs/_downloads/4e9540fc014621d8d3bd14869c1ab227/scan.ipynb
@@ -0,0 +1,169 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\nScan and Recurrent Kernel\n=========================\n**Author**: `Tianqi Chen <https://tqchen.github.io>`_\n\nThis is an introduction material on how to do recurrent computing in TVM.\nRecurrent computing is a typical pattern in neural networks.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from __future__ import absolute_import, print_function\n\nimport tvm\nfrom tvm import te\nimport numpy as np"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "TVM supports a scan operator to describe symbolic loop.\nThe following scan op computes cumsum over columns of X.\n\nThe scan is carried over the highest dimension of the tensor.\n:code:`s_state` is a placeholder that describes the transition state of the scan.\n:code:`s_init` describes how we can initialize the first k timesteps.\nHere since s_init's first dimension is 1, it describes how we initialize\nThe state at first timestep.\n\n:code:`s_update` describes how to update th [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "m = te.var(\"m\")\nn = te.var(\"n\")\nX = te.placeholder((m, n), name=\"X\")\ns_state = te.placeholder((m, n))\ns_init = te.compute((1, n), lambda _, i: X[0, i])\ns_update = te.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i])\ns_scan = tvm.te.scan(s_init, s_update, s_state, inputs=[X])"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Schedule the Scan Cell\n----------------------\nWe can schedule the body of the scan by scheduling the update and\ninit part seperately. Note that it is invalid to schedule the\nfirst iteration dimension of the update part.\nTo split on the time iteration, user can schedule on scan_op.scan_axis instead.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "s = te.create_schedule(s_scan.op)\nnum_thread = 256\nblock_x = te.thread_axis(\"blockIdx.x\")\nthread_x = te.thread_axis(\"threadIdx.x\")\nxo, xi = s[s_init].split(s_init.op.axis[1], factor=num_thread)\ns[s_init].bind(xo, block_x)\ns[s_init].bind(xi, thread_x)\nxo, xi = s[s_update].split(s_update.op.axis[1], factor=num_thread)\ns[s_update].bind(xo, block_x)\ns[s_update].bind(xi, thread_x)\nprint(tvm.lower(s, [X, s_scan], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Build and Verify\n----------------\nWe can build the scan kernel like other TVM kernels, here we use\nnumpy to verify the correctness of the result.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "fscan = tvm.build(s, [X, s_scan], \"cuda\", name=\"myscan\")\nctx = tvm.gpu(0)\nn = 1024\nm = 10\na_np = np.random.uniform(size=(m, n)).astype(s_scan.dtype)\na = tvm.nd.array(a_np, ctx)\nb = tvm.nd.array(np.zeros((m, n), dtype=s_scan.dtype), ctx)\nfscan(a, b)\ntvm.testing.assert_allclose(b.asnumpy(), np.cumsum(a_np, axis=0))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Multi-Stage Scan Cell\n---------------------\nIn the above example we described the scan cell using one Tensor\ncomputation stage in s_update. It is possible to use multiple\nTensor stages in the scan cell.\n\nThe following lines demonstrate a scan with two stage operations\nin the scan cell.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "m = te.var(\"m\")\nn = te.var(\"n\")\nX = te.placeholder((m, n), name=\"X\")\ns_state = te.placeholder((m, n))\ns_init = te.compute((1, n), lambda _, i: X[0, i])\ns_update_s1 = te.compute((m, n), lambda t, i: s_state[t-1, i] * 2, name=\"s1\")\ns_update_s2 = te.compute((m, n), lambda t, i: s_update_s1[t, i] + X[t, i], name=\"s2\")\ns_scan = tvm.te.scan(s_init, s_update_s2, s_state, inputs=[X])"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "These intermediate tensors can also be scheduled normally.\nTo ensure correctness, TVM creates a group constraint to forbid\nthe body of scan to be compute_at locations outside the scan loop.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "s = te.create_schedule(s_scan.op)\nxo, xi = s[s_update_s2].split(s_update_s2.op.axis[1], factor=32)\ns[s_update_s1].compute_at(s[s_update_s2], xo)\nprint(tvm.lower(s, [X, s_scan], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Multiple States\n---------------\nFor complicated applications like RNN, we might need more than one\nrecurrent state. Scan support multiple recurrent states.\nThe following example demonstrates how we can build recurrence with two states.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "m = te.var(\"m\")\nn = te.var(\"n\")\nl = te.var(\"l\")\nX = te.placeholder((m, n), name=\"X\")\ns_state1 = te.placeholder((m, n))\ns_state2 = te.placeholder((m, l))\ns_init1 = te.compute((1, n), lambda _, i: X[0, i])\ns_init2 = te.compute((1, l), lambda _, i: 0.0)\ns_update1 = te.compute((m, n), lambda t, i: s_state1[t-1, i] + X[t, i])\ns_update2 = te.compute((m, l), lambda t, i: s_state2[t-1, i] + s_state1[t-1, 0])\ns_scan1, s_scan2 = tvm.te.scan([s_init1, s_init2],\n          [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Summary\n-------\nThis tutorial provides a walk through of scan primitive.\n\n- Describe scan with init and update.\n- Schedule the scan cells as normal schedule.\n- For complicated workload, use multiple states and steps in scan cell.\n\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/4f4a49a5483a0d0aa4af30f58c3c8664/deploy_quantized.ipynb b/docs/_downloads/4f4a49a5483a0d0aa4af30f58c3c8664/deploy_quantized.ipynb
new file mode 100644
index 0000000..d387ac7
--- /dev/null
+++ b/docs/_downloads/4f4a49a5483a0d0aa4af30f58c3c8664/deploy_quantized.ipynb
@@ -0,0 +1,144 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\nDeploy a Quantized Model on Cuda\n================================\n**Author**: `Wuwei Lin <https://github.com/vinx13>`_\n\nThis article is an introductory tutorial of automatic quantization with TVM.\nAutomatic quantization is one of the quantization modes in TVM. More details on\nthe quantization story in TVM can be found\n`here <https://discuss.tvm.ai/t/quantization-story/3920>`_.\nIn this tutorial, we will import a GluonCV pre-trained model on ImageNet to\nRelay, quantize  [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import tvm\nfrom tvm import te\nfrom tvm import relay\nimport mxnet as mx\nfrom tvm.contrib.download import download_testdata\nfrom mxnet import gluon\nimport logging\nimport os\n\nbatch_size = 1\nmodel_name = \"resnet18_v1\"\ntarget = 'cuda'\nctx = tvm.context(target)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Prepare the Dataset\n-------------------\nWe will demonstrate how to prepare the calibration dataset for quantization.\nWe first download the validation set of ImageNet and pre-process the dataset.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "calibration_rec = download_testdata(\n    'http://data.mxnet.io.s3-website-us-west-1.amazonaws.com/data/val_256_q90.rec',\n    'val_256_q90.rec')\n\ndef get_val_data(num_workers=4):\n    mean_rgb = [123.68, 116.779, 103.939]\n    std_rgb = [58.393, 57.12, 57.375]\n\n    def batch_fn(batch):\n        return batch.data[0].asnumpy(), batch.label[0].asnumpy()\n\n    img_size = 299 if model_name == 'inceptionv3' else 224\n    val_data = mx.io.ImageRecordIter(\n        path_imgrec=cal [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "The calibration dataset should be an iterable object. We define the\ncalibration dataset as a generator object in Python. In this tutorial, we\nonly use a few samples for calibration.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "calibration_samples = 10\n\ndef calibrate_dataset():\n    val_data, batch_fn = get_val_data()\n    val_data.reset()\n    for i, batch in enumerate(val_data):\n        if i * batch_size >= calibration_samples:\n            break\n        data, _ = batch_fn(batch)\n        yield {'data': data}"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Import the model\n----------------\nWe use the Relay MxNet frontend to import a model from the Gluon model zoo.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "def get_model():\n    gluon_model = gluon.model_zoo.vision.get_model(model_name, pretrained=True)\n    img_size = 299 if model_name == 'inceptionv3' else 224\n    data_shape = (batch_size, 3, img_size, img_size)\n    mod, params = relay.frontend.from_mxnet(gluon_model, {\"data\": data_shape})\n    return mod, params"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Quantize the Model\n------------------\nIn quantization, we need to find the scale for each weight and intermediate\nfeature map tensor of each layer.\n\nFor weights, the scales are directly calculated based on the value of the\nweights. Two modes are supported: `power2` and `max`. Both modes find the\nmaximum value within the weight tensor first. In `power2` mode, the maximum\nis rounded down to power of two. If the scales of both weights and\nintermediate feature maps are powe [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "def quantize(mod, params, data_aware):\n    if data_aware:\n        with relay.quantize.qconfig(calibrate_mode='kl_divergence', weight_scale='max'):\n            mod = relay.quantize.quantize(mod, params, dataset=calibrate_dataset())\n    else:\n        with relay.quantize.qconfig(calibrate_mode='global_scale', global_scale=8.0):\n            mod = relay.quantize.quantize(mod, params)\n    return mod"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Run Inference\n-------------\nWe create a Relay VM to build and execute the model.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "def run_inference(mod):\n    executor = relay.create_executor('vm', mod, ctx, target)\n    val_data, batch_fn = get_val_data()\n    for i, batch in enumerate(val_data):\n        data, label = batch_fn(batch)\n        prediction = executor.evaluate()(data)\n        if i > 10:  # only run inference on a few samples in this tutorial\n            break\n\ndef main():\n    mod, params = get_model()\n    mod = quantize(mod, params, data_aware=True)\n    run_inference(mod)\n\nif __name [...]
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/50b174352ccf0a0defcbd8e6b40145e2/from_tensorflow.py b/docs/_downloads/50b174352ccf0a0defcbd8e6b40145e2/from_tensorflow.py
new file mode 100644
index 0000000..0ebd733
--- /dev/null
+++ b/docs/_downloads/50b174352ccf0a0defcbd8e6b40145e2/from_tensorflow.py
@@ -0,0 +1,240 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Compile Tensorflow Models
+=========================
+This article is an introductory tutorial to deploy tensorflow models with TVM.
+
+To begin, the tensorflow python module must be installed.
+
+Please refer to https://www.tensorflow.org/install
+"""
+
+# tvm, relay
+import tvm
+from tvm import te
+from tvm import relay
+
+# os and numpy
+import numpy as np
+import os.path
+
+# Tensorflow imports
+import tensorflow as tf
+try:
+    tf_compat_v1 = tf.compat.v1
+except ImportError:
+    tf_compat_v1 = tf
+
+# Tensorflow utility functions
+import tvm.relay.testing.tf as tf_testing
+
+# Base location for model related files.
+repo_base = 'https://github.com/dmlc/web-data/raw/master/tensorflow/models/InceptionV1/'
+
+# Test image
+img_name = 'elephant-299.jpg'
+image_url = os.path.join(repo_base, img_name)
+
+######################################################################
+# Tutorials
+# ---------
+# Please refer to docs/frontend/tensorflow.md for more details on various models
+# from tensorflow.
+
+model_name = 'classify_image_graph_def-with_shapes.pb'
+model_url = os.path.join(repo_base, model_name)
+
+# Image label map
+map_proto = 'imagenet_2012_challenge_label_map_proto.pbtxt'
+map_proto_url = os.path.join(repo_base, map_proto)
+
+# Human readable text for labels
+label_map = 'imagenet_synset_to_human_label_map.txt'
+label_map_url = os.path.join(repo_base, label_map)
+
+# Target settings
+# Use these commented settings to build for cuda.
+#target = 'cuda'
+#target_host = 'llvm'
+#layout = "NCHW"
+#ctx = tvm.gpu(0)
+target = 'llvm'
+target_host = 'llvm'
+layout = None
+ctx = tvm.cpu(0)
+
+######################################################################
+# Download required files
+# -----------------------
+# Download files listed above.
+from tvm.contrib.download import download_testdata
+
+img_path = download_testdata(image_url, img_name, module='data')
+model_path = download_testdata(model_url, model_name, module=['tf', 'InceptionV1'])
+map_proto_path = download_testdata(map_proto_url, map_proto, module='data')
+label_path = download_testdata(label_map_url, label_map, module='data')
+
+######################################################################
+# Import model
+# ------------
+# Creates tensorflow graph definition from protobuf file.
+
+with tf_compat_v1.gfile.GFile(model_path, 'rb') as f:
+    graph_def = tf_compat_v1.GraphDef()
+    graph_def.ParseFromString(f.read())
+    graph = tf.import_graph_def(graph_def, name='')
+    # Call the utility to import the graph definition into default graph.
+    graph_def = tf_testing.ProcessGraphDefParam(graph_def)
+    # Add shapes to the graph.
+    with tf_compat_v1.Session() as sess:
+        graph_def = tf_testing.AddShapesToGraphDef(sess, 'softmax')
+
+######################################################################
+# Decode image
+# ------------
+# .. note::
+#
+#   The tensorflow frontend import doesn't support preprocessing ops like JpegDecode.
+#   JpegDecode is bypassed (it just returns the source node).
+#   Hence we supply the decoded frame to TVM instead.
+#
+
+from PIL import Image
+image = Image.open(img_path).resize((299, 299))
+
+x = np.array(image)
+
+######################################################################
+# Import the graph to Relay
+# -------------------------
+# Import tensorflow graph definition to relay frontend.
+#
+# Results:
+#   mod: relay module for the given tensorflow protobuf.
+#   params: params converted from tensorflow params (tensor protobuf).
+shape_dict = {'DecodeJpeg/contents': x.shape}
+dtype_dict = {'DecodeJpeg/contents': 'uint8'}
+mod, params = relay.frontend.from_tensorflow(graph_def,
+                                             layout=layout,
+                                             shape=shape_dict)
+
+print("Tensorflow protobuf imported to relay frontend.")
+######################################################################
+# Relay Build
+# -----------
+# Compile the graph to llvm target with given input specification.
+#
+# Results:
+#   graph: Final graph after compilation.
+#   params: final params after compilation.
+#   lib: target library which can be deployed on target with TVM runtime.
+
+with relay.build_config(opt_level=3):
+    graph, lib, params = relay.build(mod,
+                                     target=target,
+                                     target_host=target_host,
+                                     params=params)
+
+######################################################################
+# Execute the portable graph on TVM
+# ---------------------------------
+# Now we can try deploying the compiled model on target.
+
+from tvm.contrib import graph_runtime
+dtype = 'uint8'
+m = graph_runtime.create(graph, lib, ctx)
+# set inputs
+m.set_input('DecodeJpeg/contents', tvm.nd.array(x.astype(dtype)))
+m.set_input(**params)
+# execute
+m.run()
+# get outputs
+tvm_output = m.get_output(0, tvm.nd.empty((1, 1008), 'float32'))
+
+######################################################################
+# Process the output
+# ------------------
+# Process the model output to human readable text for InceptionV1.
+predictions = tvm_output.asnumpy()
+predictions = np.squeeze(predictions)
+
+# Creates node ID --> English string lookup.
+node_lookup = tf_testing.NodeLookup(label_lookup_path=map_proto_path,
+                                    uid_lookup_path=label_path)
+
+# Print top 5 predictions from TVM output.
+top_k = predictions.argsort()[-5:][::-1]
+for node_id in top_k:
+    human_string = node_lookup.id_to_string(node_id)
+    score = predictions[node_id]
+    print('%s (score = %.5f)' % (human_string, score))
+
+######################################################################
+# Inference on tensorflow
+# -----------------------
+# Run the corresponding model on tensorflow
+
+def create_graph():
+    """Creates a graph from saved GraphDef file and returns a saver."""
+    # Creates graph from saved graph_def.pb.
+    with tf_compat_v1.gfile.GFile(model_path, 'rb') as f:
+        graph_def = tf_compat_v1.GraphDef()
+        graph_def.ParseFromString(f.read())
+        graph = tf.import_graph_def(graph_def, name='')
+        # Call the utility to import the graph definition into default graph.
+        graph_def = tf_testing.ProcessGraphDefParam(graph_def)
+
+def run_inference_on_image(image):
+    """Runs inference on an image.
+
+    Parameters
+    ----------
+    image: String
+        Image file name.
+
+    Returns
+    -------
+        Nothing
+    """
+    if not tf_compat_v1.gfile.Exists(image):
+        tf.logging.fatal('File does not exist %s', image)
+    image_data = tf_compat_v1.gfile.GFile(image, 'rb').read()
+
+    # Creates graph from saved GraphDef.
+    create_graph()
+
+    with tf_compat_v1.Session() as sess:
+        softmax_tensor = sess.graph.get_tensor_by_name('softmax:0')
+        predictions = sess.run(softmax_tensor,
+                               {'DecodeJpeg/contents:0': image_data})
+
+        predictions = np.squeeze(predictions)
+
+        # Creates node ID --> English string lookup.
+        node_lookup = tf_testing.NodeLookup(label_lookup_path=map_proto_path,
+                                            uid_lookup_path=label_path)
+
+        # Print top 5 predictions from tensorflow.
+        top_k = predictions.argsort()[-5:][::-1]
+        print ("===== TENSORFLOW RESULTS =======")
+        for node_id in top_k:
+            human_string = node_lookup.id_to_string(node_id)
+            score = predictions[node_id]
+            print('%s (score = %.5f)' % (human_string, score))
+
+run_inference_on_image(img_path)
diff --git a/docs/_downloads/52b04835c256bb81bbf4187f18950a0a/relay_pass_infra.py b/docs/_downloads/52b04835c256bb81bbf4187f18950a0a/relay_pass_infra.py
new file mode 100644
index 0000000..b54ac13
--- /dev/null
+++ b/docs/_downloads/52b04835c256bb81bbf4187f18950a0a/relay_pass_infra.py
@@ -0,0 +1,248 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=line-too-long
+"""
+.. _tutorial-relay-pass-infra:
+
+How to Use Relay Pass Infra
+===========================
+**Author**: `Zhi Chen <https://github.com/zhiics>`_
+
+As the number of optimization passes increases in Relay, it becomes intractable to
+execute them and maintain their dependencies manually. Therefore, we have
+introduced an infrastructure to manage the optimization passes.
+
+The optimizations of a Relay program can be applied at various granularities,
+namely function-level and module-level, using :py:class:`tvm.relay.transform.FunctionPass`
+and :py:class:`tvm.relay.transform.ModulePass`
+respectively. Alternatively, users can rely on :py:class:`tvm.relay.transform.Sequential` to apply a sequence of passes
+on a Relay program, where the dependencies between passes can be resolved by the
+pass infra. For more details about each type of these passes, please refer to
+the :ref:`relay-pass-infra`.
+
+This tutorial demonstrates how developers can use the Relay pass infra to perform
+a certain optimization and create an optimization pipeline.
+"""
+
+import numpy as np
+import tvm
+from tvm import te
+import tvm.relay as relay
+
+###############################################################################
+# Create An Example Relay Program
+# -------------------------------
+# First of all, we create a simple Relay program for the tutorial. This program
+# will be used by various optimizations of the examples in this tutorial.
+
+def example():
+    shape = (1, 64, 54, 54)
+    c_data = np.empty(shape).astype("float32")
+    c = relay.const(c_data)
+    weight = relay.var('weight', shape=(64, 64, 3, 3))
+    x = relay.var("x", relay.TensorType((1, 64, 56, 56), "float32"))
+    conv = relay.nn.conv2d(x, weight)
+    y = relay.add(c, c)
+    y = relay.multiply(y, relay.const(2, "float32"))
+    y = relay.add(conv, y)
+    z = relay.add(y, c)
+    z1 = relay.add(y, c)
+    z2 = relay.add(z, z1)
+    return relay.Function([x], z2)
+
+###############################################################################
+# Let us register layout alteration for a conv2d op so that we can apply the
+# layout alteration pass on the example. How the alter layout pass works is
+# beyond the scope of this tutorial.
+
+@relay.op.register_alter_op_layout("nn.conv2d", level=101)
+def alter_conv2d(attrs, inputs, tinfos, out_type):
+    data, weight = inputs
+    new_attrs = dict(attrs)
+    new_attrs['data_layout'] = 'NCHW16c'
+    return relay.nn.conv2d(data, weight, **new_attrs)
+
+###############################################################################
+# Optimize the Program
+# --------------------
+# Now we would like to optimize the program. Relay features a host of
+# optimizations. We will select some of them to apply on this example program.
+#
+# There are multiple ways to optimize a Relay program. Below we will provide
+# examples for each of them.
+#
+# Manually Apply Optimization Passes
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+# Let's first create a relay Module which contains one or multiple Relay
+# functions for optimization.
+f = example()
+mod = tvm.IRModule.from_expr(f)
+
+# Now we can apply constant folding on the module.
+# fold_const here is a callback that doesn't take any parameters.
+fold_const = relay.transform.FoldConstant()
+# Then, we can invoke the pass on the given module. Note that the constant
+# folding pass works at the function-level. That is, the optimization will be
+# applied to each function in the module. Users don't need to iterate
+# through individual functions manually to apply this pass.
+mod = fold_const(mod)
+# We can see from the updated program that the constants are folded.
+print(mod)
+
+###############################################################################
+# More optimizations can be applied in a similar manner. For instance, we can
+# eliminate the common expressions used by `z` and `z1`.
+mod = relay.transform.EliminateCommonSubexpr()(mod)
+print(mod)
+
+###############################################################################
+# Some optimizations, such as fusion, are parametric as well. For example,
+# opt level 0 will not allow operators to be fused together. Users can pass
+# `fuse_opt_level` to enable this.
+mod = relay.transform.FuseOps(fuse_opt_level=0)(mod)
+
+# We can observe that the optimized module contains functions that only have
+# a single primitive op.
+print(mod)
+
+###############################################################################
+# Use Sequential to Apply a Sequence of Passes
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Applying passes as above is actually tedious and it may require users to have
+# a better understanding of the dependencies between them. For example, fusion
+# currently doesn't work well on let bindings. Therefore, we would not be able
+# to fuse operators that were fusable if :py:func:`relay.transform.ToANormalForm` is applied before
+# fusion, as this pass generates let bindings for each expression to
+# canonicalize a Relay program.
+#
+# Relay, hence, provides :py:class:`tvm.relay.transform.Sequential` to relieve developers from handling
+# these issues explicitly, by letting them specify the required passes and
+# pack them as a whole to execute. For example, the same passes can now be
+# applied using the sequential style as follows. :py:class:`tvm.relay.transform.Sequential` is
+# similar to `torch.nn.sequential <https://pytorch.org/docs/stable/nn.html#torch.nn.Sequential>`_
+# and `mxnet.gluon.block <https://mxnet.incubator.apache.org/api/python/docs/_modules/mxnet/gluon/block.html>`_.
+# For example, `torch.nn.sequential` contains a sequence of PyTorch
+# `Modules` that are chained to build a network. It focuses on the network
+# layers. Instead, the :py:class:`tvm.relay.transform.Sequential` in our pass infra works on optimization
+# passes.
+
+# Now let's execute some passes through :py:class:`tvm.relay.transform.Sequential`
+f = example()
+mod = tvm.IRModule.from_expr(f)
+# Gather the passes of interest.
+seq = relay.transform.Sequential([relay.transform.FoldConstant(),
+                                  relay.transform.EliminateCommonSubexpr(),
+                                  relay.transform.FuseOps(fuse_opt_level=2)])
+mod1 = seq(mod)
+print(mod1)
+
+###############################################################################
+# From the transformed Relay program, we can see that there are still two
+# identical addition operations. This is because `EliminateCommonSubexpr`
+# was not actually performed. The reason is that only the passes with an
+# optimization level less than or equal to 2 are executed by default under
+# :py:class:`tvm.relay.transform.Sequential`. The pass infra,
+# however, provides a configuration interface
+# for users to customize the optimization level that they want to execute.
+
+with relay.build_config(opt_level=3):
+    mod2 = seq(mod)
+print(mod2)
+
+###############################################################################
+# Now we can see that only one of the two identical additions is kept.
+#
+# In addition, users can selectively disable some passes using the
+# `disabled_pass` config, which is similar to the `-fno-xxx` options used in
+# general-purpose compilers, such as Clang and GCC. For example, we can disable
+# EliminateCommonSubexpr as follows. The printed module will again show two
+# identical addition operations.
+
+with relay.build_config(opt_level=3, disabled_pass=["EliminateCommonSubexpr"]):
+    mod3 = seq(mod)
+print(mod3)
+
+###############################################################################
+# The passes applied so far are target independent. The pass infra also
+# provides a means to make a pass target-aware. For example, the layout
+# alteration pass falls into this category.
+
+with relay.build_config(opt_level=3):
+    mod4 = seq(mod)
+print(mod4)
+
+seq1 = relay.transform.Sequential([relay.transform.AlterOpLayout()])
+with relay.build_config(opt_level=3):
+    with tvm.target.create("llvm"):
+        mod5 = seq1(mod)
+print(mod5)
+
+##############################################################################
+# Implement a Pass Using Python Decorator
+# ------------------------------------------
+# The next example illustrates how we can orchestrate a customized optimization
+# pipeline through the pass infra using Python decorators. This functionality
+# greatly eases the implementation of passes. For example, users can simply
+# define a decorated class to do function-level optimizations as the following
+# example shows. `transform_function` wraps a class to replace all constants
+# with a multiple of `c`. Later on, each function in a given module will be
+# visited and each constant in the function will be replaced when we invoke the
+# customized pass.
+
+@relay.transform.function_pass(opt_level=1)
+class CustomPipeline:
+    """Simple test function to replace one argument to another."""
+
+    def __init__(self, multiplier):
+        self.multiplier = multiplier
+
+    # This function can define a pass.
+    def transform_function(self, func, mod, ctx):
+        obj = self
+
+        class ReplaceConstant(tvm.relay.ExprMutator):
+            def visit_const(self, c):
+                return relay.multiply(obj.multiplier, c)
+        return ReplaceConstant().visit(func)
+
+f = example()
+mod = tvm.IRModule.from_expr(f)
+custom_pass = CustomPipeline(multiplier=relay.const(3, "float"))
+assert custom_pass.info.name == "CustomPipeline"
+mod3 = custom_pass(mod)
+print(mod3)
+
+##############################################################################
+# Debug a Pass
+# ------------
+# Relay provides users a plug-and-play style debugging pass that prints the IR
+# after a certain pass is done. For example, we can print out the IR on the
+# completion of constant folding and fusion by adding the debugging pass after
+# them.
+
+f = example()
+mod = tvm.IRModule.from_expr(f)
+seq = relay.transform.Sequential([relay.transform.FoldConstant(),
+                                  relay.transform.PrintIR(False),
+                                  relay.transform.EliminateCommonSubexpr(),
+                                  relay.transform.FuseOps(),
+                                  relay.transform.PrintIR(False)])
+with relay.build_config(opt_level=3):
+    mod = seq(mod)
+
+print("done")
diff --git a/docs/_downloads/578004d7db54caef0007609ae5540c72/intro_topi.ipynb b/docs/_downloads/578004d7db54caef0007609ae5540c72/intro_topi.ipynb
new file mode 100644
index 0000000..d1b3085
--- /dev/null
+++ b/docs/_downloads/578004d7db54caef0007609ae5540c72/intro_topi.ipynb
@@ -0,0 +1,230 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\nIntroduction to TOPI\n====================\n**Author**: `Ehsan M. Kermani <https://github.com/ehsanmok>`_\n\nThis is an introductory tutorial to TVM Operator Inventory (TOPI).\nTOPI provides numpy-style generic operations and schedules with higher abstractions than TVM.\nIn this tutorial, we will see how TOPI can save us from writing boilerplates code in TVM.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from __future__ import absolute_import, print_function\n\nimport tvm\nfrom tvm import te\nimport topi\nimport numpy as np"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Basic example\n-------------\nLet's revisit the sum of rows operation (equivalent to :code:`B = numpy.sum(A, axis=1)`') \\\nTo compute the sum of rows of a two dimensional TVM tensor A, we should\nspecify the symbolic operation as well as schedule as follows\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "n = te.var(\"n\")\nm = te.var(\"m\")\nA = te.placeholder((n, m), name='A')\nk = te.reduce_axis((0, m), \"k\")\nB = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name=\"B\")\ns = te.create_schedule(B.op)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "and to examine the IR code in human readable format, we can do\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "print(tvm.lower(s, [A], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "However, for such a common operation we had to define the reduce axis ourselves as well as explicit computation with\n:code:`te.compute`. Imagine for more complicated operations how much details we need to provide.\nFortunately, we can replace those two lines with simple :code:`topi.sum` much like :code:`numpy.sum`\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "C = topi.sum(A, axis=1)\nts = te.create_schedule(C.op)\nprint(tvm.lower(ts, [A], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Numpy-style operator overloading\n--------------------------------\nWe can add two tensors using :code:`topi.broadcast_add` that have correct (broadcastable with specific) shapes.\nEven shorter, TOPI provides operator overloading for such common operations. For example,\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "x, y = 100, 10\na = te.placeholder((x, y, y), name=\"a\")\nb = te.placeholder((y, y), name=\"b\")\nc = a + b  # same as topi.broadcast_add\nd = a * b  # same as topi.broadcast_mul"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Overloaded with the same syntax, TOPI handles broadcasting a primitive (`int`, `float`) to a tensor :code:`d - 3.14`.\n\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Generic schedules and fusing operations\n---------------------------------------\nUp to now, we have seen an example of how TOPI can save us from writing explicit computations in lower level API.\nBut it doesn't stop here. Still we did the scheduling as before. TOPI also provides higher level\nscheduling recipes depending on a given context. For example, for CUDA,\nwe can schedule the following series of operations ending with :code:`topi.sum` using only\n:code:`topi.generic.sch [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "e = topi.elemwise_sum([c, d])\nf = e / 2.0\ng = topi.sum(f)\nwith tvm.target.cuda():\n    sg = topi.cuda.schedule_reduce(g)\n    print(tvm.lower(sg, [a, b], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "As you can see, scheduled stages of computation have been accumulated and we can examine them by\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "print(sg.stages)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "We can test the correctness by comparing with :code:`numpy` result as follows\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "func = tvm.build(sg, [a, b, g], 'cuda')\nctx = tvm.gpu(0)\na_np = np.random.uniform(size=(x, y, y)).astype(a.dtype)\nb_np = np.random.uniform(size=(y, y)).astype(b.dtype)\ng_np = np.sum(np.add(a_np + b_np, a_np * b_np) / 2.0)\na_nd = tvm.nd.array(a_np, ctx)\nb_nd = tvm.nd.array(b_np, ctx)\ng_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), ctx)\nfunc(a_nd, b_nd, g_nd)\ntvm.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-5)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "TOPI also provides common neural nets operations such as _softmax_ with optimized schedule\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "tarray = te.placeholder((512, 512), name=\"tarray\")\nsoftmax_topi = topi.nn.softmax(tarray)\nwith tvm.target.create(\"cuda\"):\n    sst = topi.cuda.schedule_softmax(softmax_topi)\n    print(tvm.lower(sst, [tarray], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Fusing convolutions\n-------------------\nWe can fuse :code:`topi.nn.conv2d` and :code:`topi.nn.relu` together.\n\n<div class=\"alert alert-info\"><h4>Note</h4><p>TOPI functions are all generic functions. They have different implementations\n   for different backends to optimize for performance.\n   For each backend, it is necessary to call them under a target scope for both\n   compute declaration and schedule. TVM will choose the right function to call with\n   the target info [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "data = te.placeholder((1, 3, 224, 224))\nkernel = te.placeholder((10, 3, 5, 5))\n\nwith tvm.target.create(\"cuda\"):\n    conv = topi.cuda.conv2d_nchw(data, kernel, 1, 2, 1)\n    out = topi.nn.relu(conv)\n    sconv = topi.cuda.schedule_conv2d_nchw([out])\n    print(tvm.lower(sconv, [data, kernel], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Summary\n-------\nIn this tutorial, we have seen\n\n- How to use TOPI API for common operations with numpy-style operators.\n- How TOPI facilitates generic schedules and operator fusion for a context, to generate optimized kernel codes.\n\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/5b32f1dc3e9e2fc5ac5be0918758b967/deploy_quantized.py b/docs/_downloads/5b32f1dc3e9e2fc5ac5be0918758b967/deploy_quantized.py
new file mode 100644
index 0000000..2586318
--- /dev/null
+++ b/docs/_downloads/5b32f1dc3e9e2fc5ac5be0918758b967/deploy_quantized.py
@@ -0,0 +1,159 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Deploy a Quantized Model on Cuda
+================================
+**Author**: `Wuwei Lin <https://github.com/vinx13>`_
+
+This article is an introductory tutorial of automatic quantization with TVM.
+Automatic quantization is one of the quantization modes in TVM. More details on
+the quantization story in TVM can be found
+`here <https://discuss.tvm.ai/t/quantization-story/3920>`_.
+In this tutorial, we will import a GluonCV pre-trained model on ImageNet to
+Relay, quantize the Relay model and then perform the inference.
+"""
+
+import tvm
+from tvm import te
+from tvm import relay
+import mxnet as mx
+from tvm.contrib.download import download_testdata
+from mxnet import gluon
+import logging
+import os
+
+batch_size = 1
+model_name = "resnet18_v1"
+target = 'cuda'
+ctx = tvm.context(target)
+
+###############################################################################
+# Prepare the Dataset
+# -------------------
+# We will demonstrate how to prepare the calibration dataset for quantization.
+# We first download the validation set of ImageNet and pre-process the dataset.
+calibration_rec = download_testdata(
+    'http://data.mxnet.io.s3-website-us-west-1.amazonaws.com/data/val_256_q90.rec',
+    'val_256_q90.rec')
+
+def get_val_data(num_workers=4):
+    mean_rgb = [123.68, 116.779, 103.939]
+    std_rgb = [58.393, 57.12, 57.375]
+
+    def batch_fn(batch):
+        return batch.data[0].asnumpy(), batch.label[0].asnumpy()
+
+    img_size = 299 if model_name == 'inceptionv3' else 224
+    val_data = mx.io.ImageRecordIter(
+        path_imgrec=calibration_rec,
+        preprocess_threads=num_workers,
+        shuffle=False,
+        batch_size=batch_size,
+        resize=256,
+        data_shape=(3, img_size, img_size),
+        mean_r=mean_rgb[0],
+        mean_g=mean_rgb[1],
+        mean_b=mean_rgb[2],
+        std_r=std_rgb[0],
+        std_g=std_rgb[1],
+        std_b=std_rgb[2],
+    )
+    return val_data, batch_fn
+
+
+###############################################################################
+# The calibration dataset should be an iterable object. We define the
+# calibration dataset as a generator object in Python. In this tutorial, we
+# only use a few samples for calibration.
+
+calibration_samples = 10
+
+def calibrate_dataset():
+    val_data, batch_fn = get_val_data()
+    val_data.reset()
+    for i, batch in enumerate(val_data):
+        if i * batch_size >= calibration_samples:
+            break
+        data, _ = batch_fn(batch)
+        yield {'data': data}
+
+
+###############################################################################
+# Import the model
+# ----------------
+# We use the Relay MxNet frontend to import a model from the Gluon model zoo.
+def get_model():
+    gluon_model = gluon.model_zoo.vision.get_model(model_name, pretrained=True)
+    img_size = 299 if model_name == 'inceptionv3' else 224
+    data_shape = (batch_size, 3, img_size, img_size)
+    mod, params = relay.frontend.from_mxnet(gluon_model, {"data": data_shape})
+    return mod, params
+
+
+###############################################################################
+# Quantize the Model
+# ------------------
+# In quantization, we need to find the scale for each weight and intermediate
+# feature map tensor of each layer.
+#
+# For weights, the scales are directly calculated based on the values of the
+# weights. Two modes are supported: `power2` and `max`. Both modes find the
+# maximum value within the weight tensor first. In `power2` mode, the maximum
+# is rounded down to a power of two. If the scales of both weights and
+# intermediate feature maps are powers of two, we can leverage bit shifting for
+# multiplications. This makes it computationally more efficient. In `max` mode,
+# the maximum is used as the scale. Without rounding, `max` mode might have
+# better accuracy in some cases. When the scales are not powers of two, fixed
+# point multiplications will be used.
+#
+# For intermediate feature maps, we can find the scales with data-aware
+# quantization. Data-aware quantization takes a calibration dataset as the
+# input argument. Scales are calculated by minimizing the KL divergence between
+# the distributions of activations before and after quantization.
+# Alternatively, we can also use pre-defined global scales. This saves the
+# calibration time, but the accuracy might be impacted.
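+#
+# As an illustrative calculation (not produced by the code below): a weight
+# tensor whose maximum absolute value is 6.2 would get scale 4 (2**2, rounded
+# down to a power of two) in `power2` mode, but scale 6.2 in `max` mode.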
+
+def quantize(mod, params, data_aware):
+    if data_aware:
+        with relay.quantize.qconfig(calibrate_mode='kl_divergence', weight_scale='max'):
+            mod = relay.quantize.quantize(mod, params, dataset=calibrate_dataset())
+    else:
+        with relay.quantize.qconfig(calibrate_mode='global_scale', global_scale=8.0):
+            mod = relay.quantize.quantize(mod, params)
+    return mod
+
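+# As a usage note (an illustrative alternative, not executed in this tutorial),
+# a quick experiment without a calibration dataset would instead call:
+#
+#   mod = quantize(mod, params, data_aware=False)
+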
+
+###############################################################################
+# Run Inference
+# -------------
+# We create a Relay VM to build and execute the model.
+def run_inference(mod):
+    executor = relay.create_executor('vm', mod, ctx, target)
+    val_data, batch_fn = get_val_data()
+    for i, batch in enumerate(val_data):
+        data, label = batch_fn(batch)
+        prediction = executor.evaluate()(data)
+        if i > 10:  # only run inference on a few samples in this tutorial
+            break
+
+def main():
+    mod, params = get_model()
+    mod = quantize(mod, params, data_aware=True)
+    run_inference(mod)
+
+if __name__ == '__main__':
+    main()
diff --git a/docs/_downloads/5bd1bb9c6505ea40407fa19f01579414/reduction.py b/docs/_downloads/5bd1bb9c6505ea40407fa19f01579414/reduction.py
new file mode 100644
index 0000000..cdfc94e
--- /dev/null
+++ b/docs/_downloads/5bd1bb9c6505ea40407fa19f01579414/reduction.py
@@ -0,0 +1,196 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Reduction
+=========
+**Author**: `Tianqi Chen <https://tqchen.github.io>`_
+
+This is introductory material on how to do reduction in TVM.
+Associative reduction operators like sum/max/min are typical
+building blocks of linear algebra operations.
+
+In this tutorial, we will demonstrate how to do reduction in TVM.
+"""
+from __future__ import absolute_import, print_function
+
+import tvm
+from tvm import te
+import numpy as np
+
+######################################################################
+# Describe Sum of Rows
+# --------------------
+# Assume we want to compute sum of rows as our example.
+# In numpy semantics this can be written as :code:`B = numpy.sum(A, axis=1)`
+#
+# The following lines describe the row sum operation.
+# To create a reduction formula, we declare a reduction axis using
+# :any:`te.reduce_axis`. :any:`te.reduce_axis` takes in the range of the reduction.
+# :any:`te.sum` takes in the expression to be reduced as well as the reduction
+# axis, and computes the sum of values over all k in the declared range.
+#
+# The equivalent C code is as follows:
+#
+# .. code-block:: c
+#
+#   for (int i = 0; i < n; ++i) {
+#     B[i] = 0;
+#     for (int k = 0; k < m; ++k) {
+#       B[i] = B[i] + A[i][k];
+#     }
+#   }
+#
+n = te.var("n")
+m = te.var("m")
+A = te.placeholder((n, m), name='A')
+k = te.reduce_axis((0, m), "k")
+B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
+
+######################################################################
+# Schedule the Reduction
+# ----------------------
+# There are several ways to schedule a reduction.
+# Before doing anything, let us print out the IR code of default schedule.
+#
+s = te.create_schedule(B.op)
+print(tvm.lower(s, [A, B], simple_mode=True))
+
+######################################################################
+# You can find that the IR code is quite similar to the C code.
+# The reduction axis is similar to a normal axis; it can be split.
+#
+# In the following code we split both the row axis of B and the reduction
+# axis by different factors. The result is a nested reduction.
+#
+ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
+xo, xi = s[B].split(B.op.axis[0], factor=32)
+print(tvm.lower(s, [A, B], simple_mode=True))
+
+######################################################################
+# If we are building a GPU kernel, we can bind the rows of B to GPU threads.
+s[B].bind(xo, te.thread_axis("blockIdx.x"))
+s[B].bind(xi, te.thread_axis("threadIdx.x"))
+print(tvm.lower(s, [A, B], simple_mode=True))
+
+######################################################################
+# Reduction Factoring and Parallelization
+# ---------------------------------------
+# One problem with building a reduction is that we cannot simply
+# parallelize over the reduction axis. We need to divide the computation
+# of the reduction and store the local reduction results in a temporary array
+# before doing a reduction over the temporary array.
+#
+# The rfactor primitive performs such a rewrite of the computation.
+# In the following schedule, the result of B is written to a temporary
+# result B.rf. The factored dimension becomes the first dimension of B.rf.
+#
+s = te.create_schedule(B.op)
+ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
+BF = s.rfactor(B, ki)
+print(tvm.lower(s, [A, B], simple_mode=True))
+
+######################################################################
+# The scheduled operator of B also gets rewritten to be a sum over
+# the first axis of the reduced result B.rf.
+#
+print(s[B].op.body)
+
+######################################################################
+# Cross Thread Reduction
+# ----------------------
+# We can now parallelize over the factored axis.
+# Here the reduction axis of B is marked to be a thread.
+# TVM allows a reduction axis to be marked as a thread if it is the only
+# axis in the reduction and cross-thread reduction is possible on the device.
+#
+# This is indeed the case after the factoring.
+# We can directly compute BF at the reduction axis as well.
+# The final generated kernel will divide the rows by blockIdx.x and threadIdx.y,
+# the columns by threadIdx.x, and finally do a cross-thread reduction over threadIdx.x.
+#
+xo, xi = s[B].split(s[B].op.axis[0], factor=32)
+s[B].bind(xo, te.thread_axis("blockIdx.x"))
+s[B].bind(xi, te.thread_axis("threadIdx.y"))
+tx = te.thread_axis("threadIdx.x")
+s[B].bind(s[B].op.reduce_axis[0], tx)
+s[BF].compute_at(s[B], s[B].op.reduce_axis[0])
+s[B].set_store_predicate(tx.var.equal(0))
+fcuda = tvm.build(s, [A, B], "cuda")
+print(fcuda.imported_modules[0].get_source())
+
+######################################################################
+# Verify the correctness of the resulting kernel by comparing it to numpy.
+#
+nn = 128
+ctx = tvm.gpu(0)
+a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), ctx)
+b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), ctx)
+fcuda(a, b)
+tvm.testing.assert_allclose(
+    b.asnumpy(), np.sum(a.asnumpy(), axis=1), rtol=1e-4)
+
+######################################################################
+# Describe Convolution via 2D Reduction
+# -------------------------------------
+# In TVM, we can describe convolution via 2D reduction in a simple way.
+# Here is an example for 2D convolution with filter size = [3, 3] and strides = [1, 1].
+#
+n = te.var('n')
+Input = te.placeholder((n, n), name='Input')
+Filter = te.placeholder((3, 3), name='Filter')
+di = te.reduce_axis((0, 3), name='di')
+dj = te.reduce_axis((0, 3), name='dj')
+Output = te.compute(
+    (n - 2, n - 2),
+    lambda i, j: te.sum(Input[i + di, j + dj] * Filter[di, dj], axis=[di, dj]),
+    name='Output')
+s = te.create_schedule(Output.op)
+print(tvm.lower(s, [Input, Filter, Output], simple_mode=True))
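+
+######################################################################
+# As a quick sanity check (a minimal sketch, assuming an LLVM-enabled
+# build), the operator can be compiled for the CPU and compared against
+# a direct numpy sliding-window computation.
+#
+fconv = tvm.build(s, [Input, Filter, Output], "llvm", name="conv2d_3x3")
+cpu_ctx = tvm.cpu(0)
+size = 16
+input_np = np.random.uniform(size=(size, size)).astype(Input.dtype)
+filter_np = np.random.uniform(size=(3, 3)).astype(Filter.dtype)
+# reference result computed with plain numpy
+ref_np = np.zeros((size - 2, size - 2), dtype=Output.dtype)
+for i in range(size - 2):
+    for j in range(size - 2):
+        ref_np[i, j] = (input_np[i:i + 3, j:j + 3] * filter_np).sum()
+out_nd = tvm.nd.array(np.zeros((size - 2, size - 2), dtype=Output.dtype), cpu_ctx)
+fconv(tvm.nd.array(input_np, cpu_ctx), tvm.nd.array(filter_np, cpu_ctx), out_nd)
+tvm.testing.assert_allclose(out_nd.asnumpy(), ref_np, rtol=1e-4)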
+
+######################################################################
+# .. _general-reduction:
+#
+# Define General Commutative Reduction Operation
+# ----------------------------------------------
+# Besides the built-in reduction operations like :any:`te.sum`,
+# :any:`tvm.te.min` and :any:`tvm.te.max`, you can also define your own
+# commutative reduction operation with :any:`te.comm_reducer`.
+#
+
+n = te.var('n')
+m = te.var('m')
+product = te.comm_reducer(lambda x, y: x*y,
+    lambda t: tvm.tir.const(1, dtype=t), name="product")
+A = te.placeholder((n, m), name='A')
+k = te.reduce_axis((0, m), name='k')
+B = te.compute((n,), lambda i: product(A[i, k], axis=k), name='B')
+
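+######################################################################
+# As a small sanity check (a minimal sketch, assuming an LLVM-enabled
+# build), the product reducer can be compiled for the CPU and compared
+# against :code:`numpy.prod`.
+#
+s_prod = te.create_schedule(B.op)
+fprod = tvm.build(s_prod, [A, B], "llvm", name="row_product")
+cpu_ctx = tvm.cpu(0)
+a_np = np.random.uniform(1.0, 2.0, size=(4, 8)).astype(A.dtype)
+b_nd = tvm.nd.array(np.zeros(4, dtype=B.dtype), cpu_ctx)
+fprod(tvm.nd.array(a_np, cpu_ctx), b_nd)
+tvm.testing.assert_allclose(b_nd.asnumpy(), a_np.prod(axis=1), rtol=1e-4)
+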
+######################################################################
+# .. note::
+#
+#   Sometimes we would like to perform reduction that involves multiple
+#   values like :code:`argmax`, which can be done by tuple inputs.
+#   See :ref:`reduction-with-tuple-inputs` for more detail.
+
+######################################################################
+# Summary
+# -------
+# This tutorial provides a walk-through of reduction schedules.
+#
+# - Describe reduction with reduce_axis.
+# - Use rfactor to factor out an axis if we need parallelism.
+# - Define a new reduction operation with :any:`te.comm_reducer`.
diff --git a/docs/_downloads/5df1a8bfe653027789c10728e74a65c0/intrin_math.ipynb b/docs/_downloads/5df1a8bfe653027789c10728e74a65c0/intrin_math.ipynb
new file mode 100644
index 0000000..c8643d9
--- /dev/null
+++ b/docs/_downloads/5df1a8bfe653027789c10728e74a65c0/intrin_math.ipynb
@@ -0,0 +1,169 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\nIntrinsics and Math Functions\n=============================\n**Author**: `Tianqi Chen <https://tqchen.github.io>`_\n\nWhile TVM supports basic arithmetic operations. In many cases\nusually we will need more complicated builtin functions.\nFor example :code:`exp` to take the exponential of the function.\n\nThese functions are target system dependent and may have different\nnames of different target platforms. In this tutorial, we will learn\nhow we can invoke these target spec [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from __future__ import absolute_import, print_function\n\nimport tvm\nfrom tvm import te\nimport numpy as np"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Direct Declare Extern Math Call\n-------------------------------\nThe most straight-forward way to call target specific function is via\nextern function call construct in tvm.\nIn the following example, we use :any:`tvm.tir.call_pure_extern` to call\n:code:`__expf` function, which is only available under CUDA.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "n = te.var(\"n\")\nA = te.placeholder((n,), name='A')\nB = te.compute(A.shape,\n                lambda i: tvm.tir.call_pure_extern(\"float32\", \"__expf\", A[i]),\n                name=\"B\")\ns = te.create_schedule(B.op)\nnum_thread = 64\nbx, tx = s[B].split(B.op.axis[0], factor=num_thread)\ns[B].bind(bx, te.thread_axis(\"blockIdx.x\"))\ns[B].bind(tx, te.thread_axis(\"threadIdx.x\"))\nf = tvm.build(s, [A, B], \"cuda\", name=\"myexp\")\nprint(f.imported_modules[0].get_source())"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Unified Intrinsic Call\n----------------------\nThe above code verifies that direct external call can be used to\ncall into device specific functions.\nHowever, the above way only works for CUDA target with float type.\nIdeally, we want to write same code for any device and any data type.\n\nTVM intrinsic provides the user a mechanism to achieve this, and this\nis the recommended way to solve the problem.\nThe following code use te.exp instead, which create an intrinsic call\n:p [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "n = te.var(\"n\")\nA = te.placeholder((n,), name='A')\nB = te.compute(A.shape, lambda i: te.exp(A[i]), name=\"B\")\ns = te.create_schedule(B.op)\nnum_thread = 64\nbx, tx = s[B].split(B.op.axis[0], factor=num_thread)\ns[B].bind(bx, te.thread_axis(\"blockIdx.x\"))\ns[B].bind(tx, te.thread_axis(\"threadIdx.x\"))\nfcuda = tvm.build(s, [A, B], \"cuda\", name=\"myexp\")\nprint(fcuda.imported_modules[0].get_source())"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "We can find that the code works for both CUDA and opencl.\nThe same te.exp can also be used for float64 data types.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "fopencl = tvm.build(s, [A, B], \"opencl\", name=\"myexp\")\nprint(fopencl.imported_modules[0].get_source())"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Intrinsic Lowering Rule\n-----------------------\nWhen :py:func:`tvm.te.exp` is called, TVM creates an intrinsic Call Expr.\nTVM uses transformation rules to transform the intrinsic\ncall to device specific extern calls.\n\nTVM also allows user to customize the rules during runtime.\nThe following example customizes CUDA lowering rule for :code:`exp`.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "def my_cuda_math_rule(op):\n    \"\"\"Customized CUDA intrinsic lowering rule\"\"\"\n    assert isinstance(op, tvm.tir.Call)\n    if op.dtype == \"float32\":\n        # call float function\n        return tvm.tir.call_pure_extern(\"float32\", \"%sf\" % op.name, op.args[0])\n    elif op.dtype == \"float64\":\n        # call double function\n        return tvm.tir.call_pure_extern(\"float32\", op.name, op.args[0])\n    else:\n        # cannot do translation, return self.\n         [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Register the rule to TVM with override option to override existing rule.\nNotice the difference between the printed code from previous one:\nour new rule uses math function :code:`expf` instead of\nfast math version :code:`__expf`.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "fcuda = tvm.build(s, [A, B], \"cuda\", name=\"myexp\")\nprint(fcuda.imported_modules[0].get_source())"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Add Your Own Intrinsic\n----------------------\nIf there is an intrinsic that is not provided by TVM.\nUser can easily add new intrinsic by using the intrinsic rule system.\nThe following example add an intrinsic :code:`mylog` to the system.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "def mylog(x):\n    \"\"\"customized log intrinsic function\"\"\"\n    return tvm.tir.call_pure_intrin(x.dtype, \"mylog\", x)\n\n\ndef my_cuda_mylog_rule(op):\n    \"\"\"CUDA lowering rule for log\"\"\"\n    if op.dtype == \"float32\":\n        return tvm.tir.call_pure_extern(\"float32\", \"logf\", op.args[0])\n    elif op.dtype == \"float64\":\n        return tvm.tir.call_pure_extern(\"float64\", \"log\", op.args[0])\n    else:\n        return op\n\n\ntvm.target.register_intrin_ [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Summary\n-------\n- TVM can call extern target dependent math function.\n- Use intrinsic to defined a unified interface for the functions.\n- For more intrinsics available in tvm, take a look at :any:`tvm.tir`\n- You can customize the intrinsic behavior by defining your own rules.\n\n\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/612f9e42b0247df5c8ab277534e2af65/tune_relay_vta.py b/docs/_downloads/612f9e42b0247df5c8ab277534e2af65/tune_relay_vta.py
new file mode 100644
index 0000000..b0870b1
--- /dev/null
+++ b/docs/_downloads/612f9e42b0247df5c8ab277534e2af65/tune_relay_vta.py
@@ -0,0 +1,494 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Auto-tuning a convolutional network on VTA
+==========================================
+**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_, `Thierry Moreau <https://homes.cs.washington.edu/~moreau/>`_
+
+Auto-tuning for a specific accelerator design is critical for getting the best
+performance for any given operator. This tutorial showcases how to tune a
+whole convolutional network on VTA.
+
+The operator implementation for VTA in TVM is written in template form.
+The template has many tunable knobs (tile factor, virtual threads, etc).
+We will tune all convolution operators in the neural network. After tuning,
+we produce a log file which stores the best schedule parameters for all tuned
+operators. When the TVM compiler compiles these operators, it will query this
+log file to get the best knob parameters.
+
+"""
+
+######################################################################
+# Install dependencies
+# --------------------
+# To use the autotvm package in tvm, we need to install some extra dependencies.
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user psutil xgboost tornado mxnet requests "Pillow<7"
+#
+# To make TVM run faster during tuning, it is recommended to use cython
+# as the FFI of TVM. In the root directory of TVM, execute
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user cython
+#   sudo make cython3
+#
+# Now return to python code. Import packages.
+
+import os
+from mxnet.gluon.model_zoo import vision
+import numpy as np
+from PIL import Image
+
+import topi
+import tvm
+from tvm import te
+from tvm import rpc, autotvm, relay
+from tvm.contrib import graph_runtime, util, download
+from tvm.autotvm.measure.measure_methods import request_remote
+from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
+
+import vta
+from vta.testing import simulator
+from vta.top import graph_pack
+
+#################################################################
+# Compile network
+# ---------------
+# Perform vta-specific compilation with Relay from a Gluon model
+
+
+def compile_network(env, target, model, start_pack, stop_pack):
+
+    # Populate the shape and data type dictionary
+    dtype_dict = {"data": 'float32'}
+    shape_dict = {"data": (env.BATCH, 3, 224, 224)}
+
+    # Get off the shelf gluon model, and convert to relay
+    gluon_model = vision.get_model(model, pretrained=True)
+    mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)
+
+    # Update shape and type dictionary
+    shape_dict.update({k: v.shape for k, v in params.items()})
+    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})
+
+    # Perform quantization in Relay
+    # Note: We set opt_level to 3 in order to fold batch norm
+    with relay.build_config(opt_level=3):
+        with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
+            mod = relay.quantize.quantize(mod, params=params)
+
+    # Perform graph packing and constant folding for VTA target
+    if target.device_name == "vta":
+        assert env.BLOCK_IN == env.BLOCK_OUT
+        relay_prog = graph_pack(mod["main"],
+                                env.BATCH,
+                                env.BLOCK_OUT,
+                                env.WGT_WIDTH,
+                                start_name=start_pack,
+                                stop_name=stop_pack)
+
+    return relay_prog, params
+
+
+#################################################################
+# Start RPC Tracker
+# -----------------
+# TVM uses an RPC session to communicate with Pynq boards.
+# During tuning, the tuner will send the generated code to the board and
+# measure the speed of code on the board.
+#
+# To scale up tuning, TVM uses an RPC Tracker to manage multiple devices.
+# The RPC Tracker is a centralized master node. We can register all devices to
+# the tracker. For example, if we have 10 Pynq boards, we can register all of them
+# to the tracker, and run 10 measurements in parallel, accelerating the tuning process.
+#
+# To start an RPC tracker, run this command on the host machine. The tracker is
+# required during the whole tuning process, so we need to open a new terminal for
+# this command:
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
+#
+# The expected output is:
+#
+# .. code-block:: bash
+#
+#   INFO:RPCTracker:bind to 0.0.0.0:9190
+
+#################################################################
+# Register devices to RPC Tracker
+# -----------------------------------
+# Now we can register our devices to the tracker. The first step is to
+# build the TVM runtime for the Pynq devices.
+#
+# Follow :ref:`vta-index`
+# to build the TVM runtime on the device. Then register the device to the tracker with:
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=pynq
+#
+# (replace :code:`[HOST_IP]` with the IP address of your host machine)
+#
+# After registering devices, we can confirm it by querying the rpc_tracker:
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190
+#
+# For example, if we have 6 Pynq boards and 11 Raspberry Pi 3Bs,
+# the output can be:
+#
+# .. code-block:: bash
+#
+#    Queue Status
+#    ----------------------------------
+#    key          total  free  pending
+#    ----------------------------------
+#    pynq         6      6     0
+#    rpi3b        11     11    0
+#    ----------------------------------
+#
+# You can register multiple devices to the tracker to accelerate tuning.
+
+###########################################
+# Set Tuning Options
+# ------------------
+# Before tuning, we should apply some configurations.
+# Here we use a Pynq-Z1 board as an example.
+
+# Tracker host and port can be set by your environment
+tracker_host = os.environ.get("TVM_TRACKER_HOST", '0.0.0.0')
+tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))
+
+# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
+env = vta.get_env()
+
+# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.
+# Set ``device=arm_cpu`` to run inference on the CPU
+# or ``device=vta`` to run inference on the FPGA.
+device = "vta"
+target = env.target if device == "vta" else env.target_vta_cpu
+
+# Name of Gluon model to compile
+# The ``start_pack`` and ``stop_pack`` labels indicate where
+# to start and end the graph packing relay pass: in other words
+# where to start and finish offloading to VTA.
+network = "resnet18_v1"
+start_pack = "nn.max_pool2d"
+stop_pack = "nn.global_avg_pool2d"
+
+# Tuning option
+log_file = "%s.%s.log" % (device, network)
+tuning_option = {
+    'log_filename': log_file,
+
+    'tuner': 'random',
+    'n_trial': 1000,
+    'early_stopping': None,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(),
+        runner=autotvm.RPCRunner(env.TARGET,
+                                 host=tracker_host,
+                                 port=tracker_port,
+                                 number=5,
+                                 timeout=60,
+                                 check_correctness=True),
+    ),
+}
+
+####################################################################
+#
+# .. note:: How to set tuning options
+#
+#   In general, the default values provided here work well.
+#   If you have enough time budget, you can set :code:`n_trial` and :code:`early_stopping`
+#   to larger values, which makes the tuning run longer.
+#   If your device is under-powered or your conv2d operators are large, consider
+#   setting a longer timeout.
+#
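+#   For example, a longer (purely illustrative) tuning run might use:
+#
+#   .. code-block:: python
+#
+#      tuning_option['n_trial'] = 2000
+#      tuning_option['early_stopping'] = 800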
+
+###################################################################
+# Begin Tuning
+# ------------
+# Now we can extract tuning tasks from the network and begin tuning.
+# Here, we provide a simple utility function to tune a list of tasks.
+# This function is just an initial implementation which tunes them in sequential order.
+# We will introduce a more sophisticated tuning scheduler in the future.
+#
+# Given that the tuning will be done on Pynq FPGA boards, make sure that
+# the ``TARGET`` entry in the ``vta_config.json`` file is set to ``pynq``.
+
+
+# You can skip the implementation of this function for this tutorial.
+def tune_tasks(tasks,
+               measure_option,
+               tuner='xgb',
+               n_trial=1000,
+               early_stopping=None,
+               log_filename='tuning.log',
+               use_transfer_learning=True):
+
+    # create tmp log file
+    tmp_log_file = log_filename + ".tmp"
+    if os.path.exists(tmp_log_file):
+        os.remove(tmp_log_file)
+
+    for i, tsk in enumerate(reversed(tasks)):
+        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
+
+        # create tuner
+        if tuner == 'xgb' or tuner == 'xgb-rank':
+            tuner_obj = XGBTuner(tsk, loss_type='rank')
+        elif tuner == 'xgb_knob':
+            tuner_obj = XGBTuner(tsk, loss_type='rank', feature_type='knob')
+        elif tuner == 'ga':
+            tuner_obj = GATuner(tsk, pop_size=50)
+        elif tuner == 'random':
+            tuner_obj = RandomTuner(tsk)
+        elif tuner == 'gridsearch':
+            tuner_obj = GridSearchTuner(tsk)
+        else:
+            raise ValueError("Invalid tuner: " + tuner)
+
+        if use_transfer_learning:
+            if os.path.isfile(tmp_log_file):
+                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))
+
+        # do tuning
+        tsk_trial = min(n_trial, len(tsk.config_space))
+        tuner_obj.tune(n_trial=tsk_trial,
+                       early_stopping=early_stopping,
+                       measure_option=measure_option,
+                       callbacks=[
+                           autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
+                           autotvm.callback.log_to_file(tmp_log_file)
+                       ])
+
+    # pick best records to a cache file
+    autotvm.record.pick_best(tmp_log_file, log_filename)
+    os.remove(tmp_log_file)
+
+
+########################################################################
+# Register VTA-specific tuning tasks
+
+
+def register_vta_tuning_tasks():
+    from tvm.autotvm.task import TaskExtractEnv
+
+    @tvm.te.tag_scope(tag=topi.tag.ELEMWISE)
+    def my_clip(x, a_min, a_max):
+        """Unlike topi's current clip, put min and max into two stages."""
+        const_min = tvm.tir.const(a_min, x.dtype)
+        const_max = tvm.tir.const(a_max, x.dtype)
+        x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA")
+        x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB")
+        return x
+
+    # init autotvm env to register VTA operator
+    TaskExtractEnv()
+
+    @autotvm.template("conv2d_packed.vta")
+    def _topi_nn_conv2d(*args, **kwargs):
+        assert not kwargs, "Do not support kwargs in template function call"
+        A, W = args[:2]
+
+        with tvm.target.vta():
+            res = vta.top.conv2d_packed(*args, **kwargs)
+            res = topi.right_shift(res, 8)
+            res = my_clip(res, 0, 127)
+            res = topi.cast(res, "int8")
+
+        if tvm.target.Target.current().device_name == 'vta':
+            s = vta.top.schedule_conv2d_packed([res])
+        else:
+            s = te.create_schedule([res.op])
+        return s, [A, W, res]
+
+
+########################################################################
+# Finally, we launch tuning jobs and evaluate the end-to-end performance.
+
+
+def tune_and_evaluate(tuning_opt):
+
+    if env.TARGET != "sim":
+        # Get remote from fleet node
+        remote = autotvm.measure.request_remote(env.TARGET,
+                                                tracker_host,
+                                                tracker_port,
+                                                timeout=10000)
+        # Reconfigure the JIT runtime and FPGA.
+        vta.reconfig_runtime(remote)
+        vta.program_fpga(remote, bitstream=None)
+    else:
+        # In simulation mode, host the RPC server locally.
+        remote = rpc.LocalSession()
+
+    # Register VTA tuning tasks
+    register_vta_tuning_tasks()
+
+    # Perform task extraction on Relay program
+    print("Extract tasks...")
+    relay_prog, params = compile_network(env, target, network, start_pack, stop_pack)
+    mod = tvm.IRModule.from_expr(relay_prog)
+    tasks = autotvm.task.extract_from_program(mod,
+                                              params=params,
+                                              ops=(relay.op.get("nn.conv2d"),),
+                                              target=target,
+                                              target_host=env.target_host)
+
+    # filter out non-packed conv2d task
+    tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks))
+
+    # We should have extracted 10 convolution tasks
+    assert len(tasks) == 10
+    print("Extracted {} conv2d tasks:".format(len(tasks)))
+    for tsk in tasks:
+        inp = tsk.args[0][1]
+        wgt = tsk.args[1][1]
+        batch = inp[0] * inp[4]
+        in_filter = inp[1] * inp[5]
+        out_filter = wgt[0] * wgt[4]
+        height, width = inp[2], inp[3]
+        hkernel, wkernel = wgt[2], wgt[3]
+        hstride, wstride = tsk.args[2][0], tsk.args[2][1]
+        hpad, wpad = tsk.args[3][0], tsk.args[3][1]
+        print("({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})".format(
+            batch, height, width, in_filter, out_filter, hkernel, wkernel,
+            hpad, wpad, hstride, wstride))
+
+    # We do not run the tuning on our webpage server since it takes too long.
+    # Comment out the following line to run it yourself.
+    return
+
+    # run tuning tasks
+    print("Tuning...")
+    tune_tasks(tasks, **tuning_opt)
+
+    # compile kernels with history best records
+    with autotvm.tophub.context(target, extra_files=[log_file]):
+        # Compile network
+        print("Compile...")
+        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
+            if target.device_name != "vta":
+                graph, lib, params = relay.build(relay_prog,
+                                                 target=target,
+                                                 params=params,
+                                                 target_host=env.target_host)
+            else:
+                with vta.build_config():
+                    graph, lib, params = relay.build(
+                        relay_prog,
+                        target=target,
+                        params=params,
+                        target_host=env.target_host)
+
+        # Export library
+        print("Upload...")
+        temp = util.tempdir()
+        lib.save(temp.relpath("graphlib.o"))
+        remote.upload(temp.relpath("graphlib.o"))
+        lib = remote.load_module("graphlib.o")
+
+        # Generate the graph runtime
+        ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
+        m = graph_runtime.create(graph, lib, ctx)
+
+        # upload parameters to device
+        image = tvm.nd.array(
+            (np.random.uniform(size=(1, 3, 224, 224))).astype('float32'))
+        m.set_input(**params)
+        m.set_input('data', image)
+
+        # evaluate
+        print("Evaluate inference time cost...")
+        timer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
+        tcost = timer()
+        prof_res = np.array(tcost.results) * 1000  # convert to millisecond
+        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
+              (np.mean(prof_res), np.std(prof_res)))
+
+
+# Run the tuning and evaluate the results
+tune_and_evaluate(tuning_option)
+
+######################################################################
+# Sample Output
+# -------------
+# The tuning needs to compile many programs and extract features from them,
+# so a high-performance CPU is recommended.
+# One sample output is listed below.
+# It takes about 2 hours on a 16T CPU and 6 Pynq boards.
+#
+# .. code-block:: bash
+#
+#    Extract tasks...
+#    [Warning] Invalid shape during AutoTVM task creation
+#    Extracted 10 conv2d tasks:
+#        Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 16, 14, 14, 1, 16), 'int8'), ('TENSOR', (32, 16, 1, 1, 16, 16), 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 16, 14, 14, 1, 16, 'int8'), (32, 16, 1, 1, 16, 16, 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32'))
+#        Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 8, 28, 28, 1, 16), 'int8'), ('TENSOR', (16, 8, 1, 1, 16, 16), 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 8, 28, 28, 1, 16, 'int8'), (16, 8, 1, 1, 16, 16, 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32'))
+#        Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 4, 56, 56, 1, 16), 'int8'), ('TENSOR', (8, 4, 1, 1, 16, 16), 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 4, 56, 56, 1, 16, 'int8'), (8, 4, 1, 1, 16, 16, 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32'))
+#        Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 4, 56, 56, 1, 16), 'int8'), ('TENSOR', (4, 4, 3, 3, 16, 16), 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 4, 56, 56, 1, 16, 'int8'), (4, 4, 3, 3, 16, 16, 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'))
+#        Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 8, 28, 28, 1, 16), 'int8'), ('TENSOR', (8, 8, 3, 3, 16, 16), 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 8, 28, 28, 1, 16, 'int8'), (8, 8, 3, 3, 16, 16, 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'))
+#        Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 4, 56, 56, 1, 16), 'int8'), ('TENSOR', (8, 4, 3, 3, 16, 16), 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 4, 56, 56, 1, 16, 'int8'), (8, 4, 3, 3, 16, 16, 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32'))
+#        Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 16, 14, 14, 1, 16), 'int8'), ('TENSOR', (16, 16, 3, 3, 16, 16), 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 16, 14, 14, 1, 16, 'int8'), (16, 16, 3, 3, 16, 16, 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'))
+#        Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 8, 28, 28, 1, 16), 'int8'), ('TENSOR', (16, 8, 3, 3, 16, 16), 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 8, 28, 28, 1, 16, 'int8'), (16, 8, 3, 3, 16, 16, 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32'))
+#        Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 32, 7, 7, 1, 16), 'int8'), ('TENSOR', (32, 32, 3, 3, 16, 16), 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 32, 7, 7, 1, 16, 'int8'), (32, 32, 3, 3, 16, 16, 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'))
+#        Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 16, 14, 14, 1, 16), 'int8'), ('TENSOR', (32, 16, 3, 3, 16, 16), 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 16, 14, 14, 1, 16, 'int8'), (32, 16, 3, 3, 16, 16, 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32'))
+#    Tuning...
+#    [Task  1/10]  Current/Best:    0.72/  23.24 GFLOPS | Progress: (480/1000) | 640.31 s Done.
+#    [Task  2/10]  Current/Best:    0.00/  27.69 GFLOPS | Progress: (576/1000) | 810.09 s Done.
+#    [Task  3/10]  Current/Best:    0.00/  22.97 GFLOPS | Progress: (1000/1000) | 1125.37 s Done.
+#    [Task  4/10]  Current/Best:    0.00/  31.26 GFLOPS | Progress: (1000/1000) | 1025.52 s Done.
+#    [Task  5/10]  Current/Best:    0.00/  15.15 GFLOPS | Progress: (1000/1000) | 1236.58 s Done.
+#    [Task  6/10]  Current/Best:    0.00/  22.74 GFLOPS | Progress: (1000/1000) | 906.60 s Done.
+#    [Task  7/10]  Current/Best:    0.00/  15.27 GFLOPS | Progress: (1000/1000) | 1056.25 s Done.
+#    [Task  8/10]  Current/Best:    0.00/   2.18 GFLOPS | Progress: (1000/1000) | 2275.29 s Done.
+#    [Task  9/10]  Current/Best:    2.23/   3.99 GFLOPS | Progress: (1000/1000) | 2527.25 s Done.
+#    [Task 10/10]  Current/Best:    1.56/   6.32 GFLOPS | Progress: (480/1000) | 1304.84 s Done.
+#    Compile...
+#    Upload...
+#    Evaluate inference time cost...
+#    Mean inference time (std dev): 621.79 ms (0.14 ms)
+
+######################################################################
+#
+# .. note:: **Experiencing Difficulties?**
+#
+#   The auto-tuning module is error-prone. If you always see " 0.00/ 0.00 GFLOPS",
+#   then there must be something wrong.
+#
+#   First, make sure you set the correct configuration for your device.
+#   Then, you can print debug information by adding these lines at the beginning
+#   of the script. They will print every measurement result, where you can find useful
+#   error messages.
+#
+#   .. code-block:: python
+#
+#      import logging
+#      logging.getLogger('autotvm').setLevel(logging.DEBUG)
+#
+#   Finally, always feel free to ask our community for help on https://discuss.tvm.ai
diff --git a/docs/_downloads/64a7765a4ac55f228cf82b8462944a61/schedule_primitives.ipynb b/docs/_downloads/64a7765a4ac55f228cf82b8462944a61/schedule_primitives.ipynb
new file mode 100644
index 0000000..771c9ba
--- /dev/null
+++ b/docs/_downloads/64a7765a4ac55f228cf82b8462944a61/schedule_primitives.ipynb
@@ -0,0 +1,284 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\nSchedule Primitives in TVM\n==========================\n**Author**: `Ziheng Jiang <https://github.com/ZihengJiang>`_\n\nTVM is a domain specific language for efficient kernel construction.\n\nIn this tutorial, we will show you how to schedule the computation by\nvarious primitives provided by TVM.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from __future__ import absolute_import, print_function\n\nimport tvm\nfrom tvm import te\nimport numpy as np"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "There often exist several methods to compute the same result,\nhowever, different methods will result in different locality and\nperformance. So TVM asks user to provide how to execute the\ncomputation called **Schedule**.\n\nA **Schedule** is a set of transformation of computation that\ntransforms the loop of computations in the program.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# declare some variables for use later\nn = te.var('n')\nm = te.var('m')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "A schedule can be created from a list of ops, by default the\nschedule computes tensor in a serial manner in a row-major order.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# declare a matrix element-wise multiply\nA = te.placeholder((m, n), name='A')\nB = te.placeholder((m, n), name='B')\nC = te.compute((m, n), lambda i, j: A[i, j] * B[i, j], name='C')\n\ns = te.create_schedule([C.op])\n# lower will transform the computation from definition to the real\n# callable function. With argument `simple_mode=True`, it will\n# return you a readable C like statement, we use it here to print the\n# schedule result.\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "One schedule is composed by multiple stages, and one\n**Stage** represents schedule for one operation. We provide various\nmethods to schedule every stage.\n\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "split\n-----\n:code:`split` can split a specified axis into two axises by\n:code:`factor`.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "A = te.placeholder((m,), name='A')\nB = te.compute((m,), lambda i: A[i]*2, name='B')\n\ns = te.create_schedule(B.op)\nxo, xi = s[B].split(B.op.axis[0], factor=32)\nprint(tvm.lower(s, [A, B], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "You can also split a axis by :code:`nparts`, which splits the axis\ncontrary with :code:`factor`.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "A = te.placeholder((m,), name='A')\nB = te.compute((m,), lambda i: A[i], name='B')\n\ns = te.create_schedule(B.op)\nbx, tx = s[B].split(B.op.axis[0], nparts=32)\nprint(tvm.lower(s, [A, B], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "tile\n----\n:code:`tile` help you execute the computation tile by tile over two\naxises.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "A = te.placeholder((m, n), name='A')\nB = te.compute((m, n), lambda i, j: A[i, j], name='B')\n\ns = te.create_schedule(B.op)\nxo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)\nprint(tvm.lower(s, [A, B], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "fuse\n----\n:code:`fuse` can fuse two consecutive axises of one computation.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "A = te.placeholder((m, n), name='A')\nB = te.compute((m, n), lambda i, j: A[i, j], name='B')\n\ns = te.create_schedule(B.op)\n# tile to four axises first: (i.outer, j.outer, i.inner, j.inner)\nxo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)\n# then fuse (i.inner, j.inner) into one axis: (i.inner.j.inner.fused)\nfused = s[B].fuse(xi, yi)\nprint(tvm.lower(s, [A, B], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "reorder\n-------\n:code:`reorder` can reorder the axises in the specified order.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "A = te.placeholder((m, n), name='A')\nB = te.compute((m, n), lambda i, j: A[i, j], name='B')\n\ns = te.create_schedule(B.op)\n# tile to four axises first: (i.outer, j.outer, i.inner, j.inner)\nxo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)\n# then reorder the axises: (i.inner, j.outer, i.outer, j.inner)\ns[B].reorder(xi, yo, xo, yi)\nprint(tvm.lower(s, [A, B], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "bind\n----\n:code:`bind` can bind a specified axis with a thread axis, often used\nin gpu programming.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "A = te.placeholder((n,), name='A')\nB = te.compute(A.shape, lambda i: A[i] * 2, name='B')\n\ns = te.create_schedule(B.op)\nbx, tx = s[B].split(B.op.axis[0], factor=64)\ns[B].bind(bx, te.thread_axis(\"blockIdx.x\"))\ns[B].bind(tx, te.thread_axis(\"threadIdx.x\"))\nprint(tvm.lower(s, [A, B], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "compute_at\n----------\nFor a schedule that consists of multiple operators, TVM will compute\ntensors at the root separately by default.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "A = te.placeholder((m,), name='A')\nB = te.compute((m,), lambda i: A[i]+1, name='B')\nC = te.compute((m,), lambda i: B[i]*2, name='C')\n\ns = te.create_schedule(C.op)\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        ":code:`compute_at` can move computation of `B` into the first axis\nof computation of `C`.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "A = te.placeholder((m,), name='A')\nB = te.compute((m,), lambda i: A[i]+1, name='B')\nC = te.compute((m,), lambda i: B[i]*2, name='C')\n\ns = te.create_schedule(C.op)\ns[B].compute_at(s[C], C.op.axis[0])\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "compute_inline\n--------------\n:code:`compute_inline` can mark one stage as inline, then the body of\ncomputation will be expanded and inserted at the address where the\ntensor is required.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "A = te.placeholder((m,), name='A')\nB = te.compute((m,), lambda i: A[i]+1, name='B')\nC = te.compute((m,), lambda i: B[i]*2, name='C')\n\ns = te.create_schedule(C.op)\ns[B].compute_inline()\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "compute_root\n------------\n:code:`compute_root` can move computation of one stage to the root.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "A = te.placeholder((m,), name='A')\nB = te.compute((m,), lambda i: A[i]+1, name='B')\nC = te.compute((m,), lambda i: B[i]*2, name='C')\n\ns = te.create_schedule(C.op)\ns[B].compute_at(s[C], C.op.axis[0])\ns[B].compute_root()\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Summary\n-------\nThis tutorial provides an introduction to schedule primitives in\ntvm, which permits users schedule the computation easily and\nflexibly.\n\nIn order to get a good performance kernel implementation, the\ngeneral workflow often is:\n\n- Describe your computation via series of operations.\n- Try to schedule the computation with primitives.\n- Compile and run to see the performance difference.\n- Adjust your schedule according the running result.\n\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/65bd9927a152de6eed3444185b24287f/tensorize.ipynb b/docs/_downloads/65bd9927a152de6eed3444185b24287f/tensorize.ipynb
new file mode 100644
index 0000000..49493fe
--- /dev/null
+++ b/docs/_downloads/65bd9927a152de6eed3444185b24287f/tensorize.ipynb
@@ -0,0 +1,241 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n\nUse Tensorize to Leverage Hardware Intrinsics\n=============================================\n**Author**: `Yizhi Liu <https://github.com/yzhliu>`_\n\nThis is an introduction material on how to perform tensorization in TVM.\n\nBy using schedule primitive :code:`tensorize`,\npeople can replace a unit of computation with the corresponding intrinsics,\nmaking it easy to leverage handcrafted micro-kernels,\nas well as extend TVM to support new hardware architectures.\n\nThe purpo [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from __future__ import absolute_import, print_function\n\nimport tvm\nfrom tvm import te\nimport numpy as np"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Define Matrix Multiplication\n----------------------------\nTake matrix multiplication as our example.\nMatmul first multiply the corresponding elements between two matrix,\nthen accumulate across a certain axis.\nThe following lines describe the computation :code:`A * B^T` in TVM.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "N, M, L = 1024, 512, 64\nA = te.placeholder((N, L), name='A')\nB = te.placeholder((M, L), name='B')\nk = te.reduce_axis((0, L), name='k')\nC = te.compute((N, M), lambda i, j:\n                te.sum(A[i, k] * B[j, k], axis=k), name='C')\ns = te.create_schedule(C.op)\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Schedule the Matmul\n-------------------\nNow, suppose we have an accelerator that supports\nmatrix-vector multiplication (GEMV) as a hardware primitive,\nwhich can take arbitrary size of reduce axis,\nbut another axis needs to be no larger than 16.\nThus we break down the matmul loops to make the innermost loops a (16x64) GEMV.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "factor = 16\nx, y = C.op.axis\nz, = C.op.reduce_axis\nyo, yi = s[C].split(y, factor=factor)\ns[C].reorder(x, yo, yi, z)\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "As showed in the IR printed above,\nthe inner loops :code:`j.inner` along with :code:`k` together form a computation of GEMV\n- within the inner most two loops, the index :code:`i` is fixed,\nthe access to the matrix :code:`A` only varies by :code:`k`,\nwhich makes the access pattern of :code:`A` a \"vector\".\nIn order to leverage our hypothetical hardware's GEMV instruction,\nwe can tensorize over :code:`j.inner`.\n\nDefine GEMV Tensorization Intrinsic\n----------------------- [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "def intrin_gemv(m, l):\n    a = te.placeholder((l,), name='a')\n    b = te.placeholder((m, l), name='b')\n    k = te.reduce_axis((0, l), name='k')\n    c = te.compute((m,), lambda i: te.sum(a[k] * b[i, k], axis=k), name='c')\n    Ab = tvm.tir.decl_buffer(a.shape, a.dtype,\n                         name=\"A\",\n                         offset_factor=1,\n                         strides=[1])\n    Bb = tvm.tir.decl_buffer(b.shape, b.dtype,\n                         name=\"B\",\n    [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Here :code:`te.decl_tensor_intrin` declares how to execute the computation :code:`c.op`.\nOur implementation simply takes the inputs and outputs,\nconverts them to pointers and emit an external function call.\nNote that tensorization requires user to specify :code:`offset_factor`,\nwith this information, TVM has knowledge of whether the data is aligned\nbetween the start address of the original data structure\nand the offset being passed to tensorize,\nso that it has chance to o [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "gemv = intrin_gemv(factor, L)\ns[C].tensorize(yi, gemv)\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "By tensorizing over :code:`yi`, the inner most two loops are\nnow replaced by the intrinsic function we defined before.\nIn order to build and run the module, let's define the external function :code:`gemv_update`,\nit is a naive implementation of GEMV, just for demonstration.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "def gemv_impl():\n    cc_code = \"\"\"\n      extern \"C\" int gemv_update(float *cc, float *aa, float *bb, int m, int l, int stride) {\n        for (int i = 0; i < m; ++i) {\n            for (int j = 0; j < l; ++j) {\n                cc[i] += aa[j] * bb[i * stride + j];\n            }\n        }\n        return 0;\n      }\n    \"\"\"\n    from tvm.contrib import util, clang\n    temp = util.tempdir()\n    ll_path = temp.relpath(\"temp.ll\")\n    # Create LLVM ir from c source  [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Now we leverage the pragma attribute :code:`import_llvm` to import llvm asm inline.\nThe importing needs to happen before the tensorized GEMV being executed.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "s[C].pragma(x, \"import_llvm\", gemv_impl())\nprint(tvm.lower(s, [A, B, C], simple_mode=True))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Finally we compare the tensorize version with that :code:`numpy.dot` produces,\nensure our implementation is correct.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "func = tvm.build(s, [A, B, C], target=\"llvm\", name=\"gemv\")\n\nfrom topi.util import get_const_tuple\ndtype = A.dtype\nctx = tvm.context(\"cpu\", 0)\na = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype)\nb = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype)\nc = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx)\nfunc(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c)\ntvm.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Reduce-update for Tensorize\n---------------------------\nSo far you have learned the basic idea of tensorize,\nnow let's move one step forward to a more complicated case.\n\nAssume our accelerator could only multiply a vector by a square matrix,\nin which the vector size needs to be no larger than 16.\nGiven such hardware constrain, now we need to split the reduce axis as following,\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "zo, zi = s[C].split(z, factor=factor)\ns[C].reorder(x, yo, zo, yi, zi)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "However, since the tensorize intrinsic now only covers a part of the reduce axis,\ninstead of using one \"body\" function, TVM requires a :code:`reduce_reset` function,\nwhich will be invoked before the reduce for-loop, and a :code:`reduce_update` function,\nwhich defines the \"update\" computing strategy.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "def gemv_impl():\n    cc_code = \"\"\"\n      extern \"C\" int gemv_update(float *cc, float *aa, float *bb, int m, int l, int stride) {\n        for (int i = 0; i < m; ++i) {\n            for (int j = 0; j < l; ++j) {\n                cc[i] += aa[j] * bb[i * stride + j];\n            }\n        }\n        return 0;\n      }\n      extern \"C\" int gemv_reset(float *cc, int m) {\n        for (int i = 0; i < m; ++i) {\n            cc[i] = 0.0;\n        }\n        return 0;\n       [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Note that :code:`intrin_func` now returns a triplet:\n:code:`(body, reduce_reset, reduce_update)`.\nIf tensorization includes all the reduce axes, function :code:`body()` will be invoked,\notherwise :code:`reduce_reset()` and :code:`reduce_update()` together will be used.\nIn our example :code:`body()` and :code:`reduce_update()`\nshare the same implementation,\nwhile in other cases, hardware may have different instructions for these two functions.\nMoreover, we can see now :cod [...]
+      ]
+    },
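+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Concretely, an :code:`intrin_func` that supports reduce-update can be sketched as below.\nThis is only an illustration of the structure (it assumes the :code:`tvm`/:code:`te` imports\nfrom the beginning of this tutorial); the extern symbols :code:`gemv_update` and\n:code:`gemv_reset` refer to the C kernel defined above, and the declaration used by the\ntutorial itself may differ in its details.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Sketch (for illustration): a tensor intrinsic whose intrin_func returns\n# the (body, reduce_reset, reduce_update) triplet described above.\ndef intrin_gemv_update_sketch(m, l):\n    a = te.placeholder((l,), name=\"a\")\n    b = te.placeholder((m, l), name=\"b\")\n    k = te.reduce_axis((0, l), name=\"k\")\n    c = te.compute((m,), lambda i: te.sum(a[k] * b[i, k], axis=k), name=\"c\")\n    Ab = tvm.tir.decl_buffer(a.shape, a.dtype, name=\"A\", offset_factor=1, strides=[1])\n    Bb = tvm.tir.decl_buffer(b.shape, b.dtype, name=\"B\", offset_factor=1, strides=[te.var(\"s1\"), 1])\n    Cb = tvm.tir.decl_buffer(c.shape, c.dtype, name=\"C\", offset_factor=1, strides=[1])\n\n    def intrin_func(ins, outs):\n        aa, bb = ins\n        cc = outs[0]\n\n        def _body():\n            ib = tvm.tir.ir_builder.create()\n            ib.emit(tvm.tir.call_extern(\"int32\", \"gemv_update\",\n                                        cc.access_ptr(\"w\"), aa.access_ptr(\"r\"),\n                                        bb.access_ptr(\"r\"), m, l, bb.strides[0]))\n            return ib.get()\n\n        def _reduce_reset():\n            ib = tvm.tir.ir_builder.create()\n            ib.emit(tvm.tir.call_extern(\"int32\", \"gemv_reset\", cc.access_ptr(\"w\"), m))\n            return ib.get()\n\n        def _reduce_update():\n            return _body()\n\n        # body covers a full reduction; reset/update cover the split case\n        return _body(), _reduce_reset(), _reduce_update()\n\n    return te.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})"
+      ]
+    },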
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "gemv = intrin_gemv(factor, factor)\ns[C].tensorize(yi, gemv)\ns[C].pragma(yo, \"import_llvm\", gemv_impl())\n\nfunc = tvm.build(s, [A, B, C], target=\"llvm\", name=\"gemv\")\na = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype)\nb = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype)\nc = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx)\nfunc(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c)\ntvm.testing.assert_allclose(c.asnumpy(), np.dot [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Summary\n-------\nThis tutorial demonstrates the usage of tensorize intrinsic in TVM.\nTensorize provides a way for users to get fully optimized schedule via micro-kernels.\nFor example, INT8 quantization on Intel CPUs uses tensorization\nto invoke AVX instruction directly.\nIt also enables TVM to compile to ASICs -\ncheckout `vta-index` for details.\nWe also demonstrates how to use inline assembly importing,\nwhich helps users inject asm easily into the schedule.\n\n\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/66258823cf03d1875c9ea84a82131101/deploy_detection.ipynb b/docs/_downloads/66258823cf03d1875c9ea84a82131101/deploy_detection.ipynb
new file mode 100644
index 0000000..f4718e2
--- /dev/null
+++ b/docs/_downloads/66258823cf03d1875c9ea84a82131101/deploy_detection.ipynb
@@ -0,0 +1,169 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\nDeploy Pretrained Vision Detection Model from Darknet on VTA\n============================================================\n**Author**: `Hua Jiang <https://github.com/huajsj>`_\n\nThis tutorial provides an end-to-end demo, on how to run Darknet YoloV3-tiny\ninference onto the VTA accelerator design to perform Image detection tasks.\nIt showcases Relay as a front end compiler that can perform quantization (VTA\nonly supports int8/32 inference) as well as graph packing (in order [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Install dependencies\n--------------------\nTo use the autotvm package in tvm, we need to install some extra dependencies.\n(change \"3\" to \"2\" if you use python2):\n\n.. code-block:: bash\n\npip3 install \"Pillow<7\"\n\nYOLO-V3-tiny Model with Darknet parsing have dependancy with CFFI and CV2 library,\nwe need to install CFFI and CV2 before executing this script.\n\npip3 install \"Pillow<7\"\n\npip3 install cffi\npip3 install opencv-python\n\nNow return to the python code. I [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from __future__ import absolute_import, print_function\n\nimport sys\nimport os\nimport time\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport tvm\nimport vta\nfrom tvm import rpc, autotvm, relay\nfrom tvm.relay.testing import yolo_detection, darknet\nfrom tvm.relay.testing.darknet import __darknetffi__\nfrom tvm.contrib import graph_runtime, graph_runtime, util\nfrom tvm.contrib.download import download_testdata\nfrom vta.testing import simulator\nfrom vta.top import [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Download yolo net configure file, weight file, darknet library file based on\nModel Name\n----------------------------------------------------------------------------\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "MODEL_NAME = 'yolov3-tiny'\nREPO_URL = 'https://github.com/dmlc/web-data/blob/master/darknet/'\n\ncfg_path = download_testdata('https://github.com/pjreddie/darknet/blob/master/cfg/'\n                             + MODEL_NAME + '.cfg' + '?raw=true',\n                             MODEL_NAME + '.cfg',\n                             module=\"darknet\")\nweights_path = download_testdata('https://pjreddie.com/media/files/'\n                                 + MODEL_NAME + '.weights' + ' [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Download yolo categories and illustration front.\n------------------------------------------------\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "coco_path = download_testdata(REPO_URL + 'data/' + 'coco.names' + '?raw=true',\n                              'coco.names',\n                              module='data')\nfont_path = download_testdata(REPO_URL + 'data/' + 'arial.ttf' + '?raw=true',\n                              'arial.ttf',\n                              module='data')\nwith open(coco_path) as f:\n    content = f.readlines()\nnames = [x.strip() for x in content]"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Define the platform and model targets.\n--------------------------------------\nExecute on CPU vs. VTA, and define the model.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Load VTA parameters from the vta/vta-hw/config/vta_config.json file\nenv = vta.get_env()\n# Set ``device=arm_cpu`` to run inference on the CPU\n# or ``device=vta`` to run inference on the FPGA.\ndevice = \"vta\"\ntarget = env.target if device == \"vta\" else env.target_vta_cpu\n\npack_dict = {\n    \"yolov3-tiny\": [\"nn.max_pool2d\", \"cast\", 4, 185],\n}\n\n# Name of Darknet model to compile\n# The ``start_pack`` and ``stop_pack`` labels indicate where\n# to start and end th [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Obtain an execution remote.\n---------------------------\nWhen target is 'pynq' or other FPGA backend, reconfigure FPGA and runtime.\nOtherwise, if target is 'sim', execute locally.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "if env.TARGET not in [\"sim\", \"tsim\"]:\n    # Get remote from tracker node if environment variable is set.\n    # To set up the tracker, you'll need to follow the \"Auto-tuning\n    # a convolutional network for VTA\" tutorial.\n    tracker_host = os.environ.get(\"TVM_TRACKER_HOST\", None)\n    tracker_port = os.environ.get(\"TVM_TRACKER_PORT\", None)\n    # Otherwise if you have a device you want to program directly from\n    # the host, make sure you've set the variables be [...]
+      ]
+    },
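+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "In outline, the remote setup reduces to the sketch below. The direct-connect path shown\nhere mirrors the RPC setup used in the other VTA tutorials; the host and port values are\nplaceholders read from the environment, and the cell above additionally supports a\ntracker-based connection.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Sketch of the two execution paths (illustrative, simplified)\nif env.TARGET in [\"sim\", \"tsim\"]:\n    # Simulation: everything runs locally\n    remote = rpc.LocalSession()\nelse:\n    # Hardware: connect to the board, reconfigure the runtime,\n    # and program the FPGA with a pre-compiled VTA bitstream\n    host = os.environ.get(\"VTA_RPC_HOST\", \"192.168.2.99\")\n    port = int(os.environ.get(\"VTA_RPC_PORT\", \"9091\"))\n    remote = rpc.connect(host, port)\n    vta.reconfig_runtime(remote)\n    vta.program_fpga(remote, bitstream=None)"
+      ]
+    },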
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Build the inference graph runtime.\n----------------------------------\nUsing Darknet library load downloaded vision model and compile with Relay.\nThe compilation steps are:\n\n1. Front end translation from Darknet into Relay module.\n2. Apply 8-bit quantization: here we skip the first conv layer,\n   and dense layer which will both be executed in fp32 on the CPU.\n3. Perform graph packing to alter the data layout for tensorization.\n4. Perform constant folding to reduce number [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Load pre-configured AutoTVM schedules\nwith autotvm.tophub.context(target):\n    net = __darknetffi__.dlopen(darknet_lib_path).load_network(cfg_path.encode('utf-8'),\n                                                               weights_path.encode('utf-8'),\n                                                               0)\n    dshape = (env.BATCH, net.c, net.h, net.w)\n    dtype = 'float32'\n\n    # Measure build start time\n    build_start = time.time()\n\n    # Start fron [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Perform image detection inference.\n----------------------------------\nWe run detect on an downloaded image\nDownload test image\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "[neth, netw] = dshape[2:]\ntest_image = 'person.jpg'\nimg_url = REPO_URL + 'data/' + test_image + '?raw=true'\nimg_path = download_testdata(img_url, test_image, \"data\")\ndata = darknet.load_image(img_path, neth, netw).transpose(1, 2, 0)\n\n# Prepare test image for inference\nplt.imshow(data)\nplt.show()\ndata = data.transpose((2, 0, 1))\ndata = data[np.newaxis, :]\ndata = np.repeat(data, env.BATCH, axis=0)\n\n# Set the network parameters and inputs\nm.set_input('data', data)\n [...]
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/6748a8fb7e82692825b259c20af8372a/opt_conv_cuda.ipynb b/docs/_downloads/6748a8fb7e82692825b259c20af8372a/opt_conv_cuda.ipynb
new file mode 100644
index 0000000..fc71dcf
--- /dev/null
+++ b/docs/_downloads/6748a8fb7e82692825b259c20af8372a/opt_conv_cuda.ipynb
@@ -0,0 +1,151 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n\nHow to optimize convolution on GPU\n==================================\n**Author**: `Haichen Shen <https://homes.cs.washington.edu/~haichen/>`_\n\nIn this tutorial, we will demonstrate how to write a high performance\nconvolution implementation in TVM. We use square size input tensors and filters\nas an example, and assume the input to convolution has a large batch. In this\nexample, we use a different layout to store the data in order to achieve better\ndata locality. The b [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Preparation and Algorithm\n-------------------------\n\nWe use the fixed size for input tensors with 256 channels and 14 x 14\ndimensions. The batch size is 256. Convolution filters contain 512 filters\nof size 3 x 3.  We use stride size 1 and padding size 1 for the\nconvolution. The following code defines the convolution algorithm in TVM.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import numpy as np\nimport tvm\nfrom tvm import te\n\n# The sizes of inputs and filters\nbatch = 256\nin_channel = 256\nout_channel = 512\nin_size = 14\nkernel = 3\npad = 1\nstride = 1\n\n# Algorithm\nA = te.placeholder((in_size, in_size, in_channel, batch), name='A')\nW = te.placeholder((kernel, kernel, in_channel, out_channel), name='W')\nout_size = (in_size - kernel + 2*pad) // stride + 1\n# Pad input\nApad = te.compute(\n    (in_size + 2*pad, in_size + 2*pad, in_channel, bat [...]
+      ]
+    },
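+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "The output tensor :code:`B` is computed as a reduction over the input channels and the\n3 x 3 kernel window. The following cell is a sketch of that declaration, reusing the names\ndefined above; the exact indexing used by the full tutorial may differ slightly.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Sketch: the convolution output as a reduction over input channels\n# and the 3 x 3 kernel window (names follow the declarations above)\nrc = te.reduce_axis((0, in_channel), name='rc')\nry = te.reduce_axis((0, kernel), name='ry')\nrx = te.reduce_axis((0, kernel), name='rx')\n\nB = te.compute(\n    (out_size, out_size, out_channel, batch),\n    lambda yy, xx, ff, nn: te.sum(\n        Apad[yy * stride + ry, xx * stride + rx, rc, nn] * W[ry, rx, rc, ff],\n        axis=[ry, rx, rc]),\n    name='B')"
+      ]
+    },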
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Memory Hierarchy\n----------------\n\nWe first specify the memory hierarchy for buffers. The figure below shows the\nGPU memory hierarchy. One important difference from CPU memory hierarchy is\nthat GPU provides a cache buffer called shared memory, which is managed by\nprogrammers. Thus how to maximize the data reuse in the shared memory is\ncritical to achieve high performance in GPU kernels.\n\n![](https://github.com/dmlc/web-data/raw/master/tvm/tutorial/gpu_memory_hierarchy.p [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Designate the memory hierarchy\ns = te.create_schedule(B.op)\ns[Apad].compute_inline() # compute Apad inline\nAA = s.cache_read(Apad, 'shared', [B])\nWW = s.cache_read(W, \"shared\", [B])\nAL = s.cache_read(AA, \"local\", [B])\nWL = s.cache_read(WW, \"local\", [B])\nBL = s.cache_write(B, \"local\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Blocking\n--------\n\nThe following code splits the workload into thread blocks and individual\nthreads. We follow the blocking scheme in the matrix multiply. As shown in the\nfigure below, given a pixel coordinate (y, x), a thread block is responsible\nfor computing a region of block_factor x block_factor (64 x 64) for output\nchannels and batch. Due to the limit of shared memory space, we only load step\nx block_factor (8 x 64) data from Apad and B each time to buffers in the\ [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# tile consts\ntile = 8\nnum_thread = 8\nblock_factor = tile * num_thread\nstep = 8\nvthread = 2\n\n# Get the GPU thread indices\nblock_x = te.thread_axis(\"blockIdx.x\")\nblock_y = te.thread_axis(\"blockIdx.y\")\nblock_z = te.thread_axis(\"blockIdx.z\")\nthread_x = te.thread_axis((0, num_thread), \"threadIdx.x\")\nthread_y = te.thread_axis((0, num_thread), \"threadIdx.y\")\nthread_xz = te.thread_axis((0, vthread), \"vthread\", name=\"vx\")\nthread_yz = te.thread_axis((0, vthrea [...]
+      ]
+    },
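+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "The blocking itself amounts to splitting the output-channel and batch axes by\n:code:`block_factor` and binding the outer axes to GPU block indices, roughly as sketched\nbelow (the axis names are chosen to match the cells that follow).\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Sketch: split the output into thread blocks and bind to block indices\nhi, wi, fi, ni = s[B].op.axis\nbz = s[B].fuse(hi, wi)                        # one block index per output pixel\nby, fi = s[B].split(fi, factor=block_factor)  # block over output channels\nbx, ni = s[B].split(ni, factor=block_factor)  # block over batch\n\ns[B].bind(bz, block_z)\ns[B].bind(by, block_y)\ns[B].bind(bx, block_x)"
+      ]
+    },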
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Virtual Thread Split\n--------------------\n\nWe further split the workload from a thread block to individual threads. To\navoid *memory bank conflict*, we use virtual thread to split the area into 4\nparts, and then tile into 8x8 grids. Therefore, shown in the figure below,\neach thread computes 4 strided grids, where size of each grid is 4 x 4.\n\n![](https://github.com/dmlc/web-data/raw/master/tvm/tutorial/conv_gpu_vthread.png)\n\n     :align: center\n     :height: 188px\n    [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "tyz, fi = s[B].split(fi, nparts=vthread)  # virtual thread split\ntxz, ni = s[B].split(ni, nparts=vthread)  # virtual thread split\nty, fi = s[B].split(fi, nparts=num_thread)\ntx, ni = s[B].split(ni, nparts=num_thread)\ns[B].reorder(bz, by, bx, tyz, txz, ty, tx, fi, ni)\n\ns[B].bind(tyz, thread_yz)\ns[B].bind(txz, thread_xz)\ns[B].bind(ty, thread_y)\ns[B].bind(tx, thread_x)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Cooperative Fetching\n--------------------\n\nAs mentioned before, each time step we need to transfer step x block_factor\ndata from GPU global memory to shared memory. In order to reduce the memory\ntransfer per thread, the following code lets threads in the same thread block\ncoopertively fetch dependent data from global memory.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Schedule BL local write\ns[BL].compute_at(s[B], tx)\nyi, xi, fi, ni = s[BL].op.axis\nry, rx, rc = s[BL].op.reduce_axis\nrco, rci = s[BL].split(rc, factor=step)\ns[BL].reorder(rco, ry, rx, rci, fi, ni)\n\n# Attach computation to iteration variables\ns[AA].compute_at(s[BL], rx)\ns[WW].compute_at(s[BL], rx)\ns[AL].compute_at(s[BL], rci)\ns[WL].compute_at(s[BL], rci)\n\n# Schedule for A's shared memory load\nyi, xi, ci, ni = s[AA].op.axis\nty, ci = s[AA].split(ci, nparts=num_threa [...]
+      ]
+    },
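+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "The same split-and-bind pattern applies to the shared-memory load of the weights; a\nsketch of that part is shown below (the axis names are illustrative).\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Sketch: cooperative fetch for W's shared memory load, mirroring the A load\nyi, xi, ci, fi = s[WW].op.axis\nty, ci = s[WW].split(ci, nparts=num_thread)\ntx, fi = s[WW].split(fi, nparts=num_thread)\ns[WW].reorder(ty, tx, yi, xi, ci, fi)\ns[WW].bind(ty, thread_y)\ns[WW].bind(tx, thread_x)\ns[WW].vectorize(fi)  # vectorize the innermost load"
+      ]
+    },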
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Generate CUDA Kernel\n--------------------\n\nFinally we use TVM to generate and compile the CUDA kernel, and evaluate the\nlatency of convolution.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "func = tvm.build(s, [A, W, B], 'cuda')\nctx = tvm.gpu(0)\na_np = np.random.uniform(size=(in_size, in_size, in_channel, batch)).astype(A.dtype)\nw_np = np.random.uniform(size=(kernel, kernel, in_channel, out_channel)).astype(W.dtype)\na = tvm.nd.array(a_np, ctx)\nw = tvm.nd.array(w_np, ctx)\nb = tvm.nd.array(np.zeros((out_size, out_size, out_channel, batch), dtype=B.dtype), ctx)\nfunc(a, w, b)\nevaluator = func.time_evaluator(func.entry_name, ctx, number=1)\nprint('Convolution: % [...]
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/67c18c78b0f12c3be5dc41b22637d719/matrix_multiply_opt.py b/docs/_downloads/67c18c78b0f12c3be5dc41b22637d719/matrix_multiply_opt.py
new file mode 100644
index 0000000..da3b9bb
--- /dev/null
+++ b/docs/_downloads/67c18c78b0f12c3be5dc41b22637d719/matrix_multiply_opt.py
@@ -0,0 +1,391 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+.. _vta-mat-mult-opt:
+
+Matrix Multiply Blocking
+========================
+**Author**: `Thierry Moreau <https://homes.cs.washington.edu/~moreau/>`_
+
+This tutorial provides an overview of how to use TVM to map matrix
+multiplication efficiently onto the VTA design.
+We recommend covering the :ref:`basic-mat-mult` tutorial first.
+
+In this tutorial, we will demonstrate TVM schedule optimizations to break large
+neural network operators down into smaller blocks so that the computation fits
+within the limited resources of the hardware accelerator.
+"""
+
+######################################################################
+# RPC Setup
+# ---------
+# We start by programming the Pynq's FPGA and building its RPC runtime.
+
+from __future__ import absolute_import, print_function
+
+import os
+import tvm
+from tvm import te
+import vta
+import numpy as np
+from tvm import rpc
+from tvm.contrib import util
+from vta.testing import simulator
+
+# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
+env = vta.get_env()
+
+# We read the Pynq RPC host IP address and port number from the OS environment
+host = os.environ.get("VTA_RPC_HOST", "192.168.2.99")
+port = int(os.environ.get("VTA_RPC_PORT", "9091"))
+
+# We configure both the bitstream and the runtime system on the Pynq
+# to match the VTA configuration specified by the vta_config.json file.
+if env.TARGET == "pynq":
+
+    # Make sure that TVM was compiled with RPC=1
+    assert tvm.runtime.enabled("rpc")
+    remote = rpc.connect(host, port)
+
+    # Reconfigure the JIT runtime
+    vta.reconfig_runtime(remote)
+
+    # Program the FPGA with a pre-compiled VTA bitstream.
+    # You can program the FPGA with your own custom bitstream
+    # by passing the path to the bitstream file instead of None.
+    vta.program_fpga(remote, bitstream=None)
+
+# In simulation mode, host the RPC server locally.
+elif env.TARGET in ["sim", "tsim"]:
+    remote = rpc.LocalSession()
+
+######################################################################
+# Computation Declaration
+# -----------------------
+# As a first step, we need to describe our matrix multiplication computation.
+# We define the matrix multiplication as the computation one would find in a
+# fully connected layer, defined by its batch size, input channels, and output
+# channels.
+# These have to be integer multiples of the VTA tensor shape:
+# :code:`BATCH`, :code:`BLOCK_IN`, and :code:`BLOCK_OUT` respectively.
+#
+# We've added extra operators to the matrix multiplication that apply
+# shifting and clipping to the output in order to mimic a fixed-point
+# matrix multiplication followed by a rectified linear activation.
+# We describe the TVM dataflow graph of the fully connected layer below:
+#
+# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/fc_dataflow.png
+#      :align: center
+#
+# This computation is intentionally too large to fit onto VTA's on-chip
+# buffers all at once. Therefore in the scheduling phase we'll
+# rely on computation blocking strategies to break the computation down into
+# manageable chunks.
+
+# Fully connected layer dimensions: 1024 x 1024
+batch_size = 1
+in_channels = 1024
+out_channels = 1024
+assert batch_size % env.BATCH == 0
+assert in_channels % env.BLOCK_IN == 0
+assert out_channels % env.BLOCK_OUT == 0
+
+# Let's derive the tiled input tensor shapes
+data_shape = (batch_size // env.BATCH,
+              in_channels // env.BLOCK_IN,
+              env.BATCH,
+              env.BLOCK_IN)
+weight_shape = (out_channels // env.BLOCK_OUT,
+                in_channels // env.BLOCK_IN,
+                env.BLOCK_OUT,
+                env.BLOCK_IN)
+output_shape = (batch_size // env.BATCH,
+                out_channels // env.BLOCK_OUT,
+                env.BATCH,
+                env.BLOCK_OUT)
+num_ops = in_channels * out_channels * batch_size * 2
+
+# Reduction axes
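+
+######################################################################
+# As a rough sketch (names such as ``ic``, ``data_buf``, and ``res_gemm`` are
+# illustrative, and the dtypes come from the VTA environment), the tiled GEMM
+# declaration following these shapes could look like this:
+#
+# .. code-block:: python
+#
+#   # Reduce over the tiled input-channel dimension (outer and intra-tile)
+#   ic = te.reduce_axis((0, in_channels // env.BLOCK_IN), name="ic")
+#   ic_tns = te.reduce_axis((0, env.BLOCK_IN), name="ic_tns")
+#
+#   # Input placeholders in the tiled layouts derived above
+#   data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
+#   weight = te.placeholder(weight_shape, name="weight", dtype=env.wgt_dtype)
+#
+#   # Copy stages so the operands can be cached in VTA's on-chip buffers
+#   data_buf = te.compute(data_shape, lambda *i: data(*i), "data_buf")
+#   weight_buf = te.compute(weight_shape, lambda *i: weight(*i), "weight_buf")
+#
+#   # Blocked matrix multiply, accumulated in the wider accumulator dtype
+#   res_gemm = te.compute(
+#       output_shape,
+#       lambda bo, co, bi, ci: te.sum(
+#           data_buf[bo, ic, bi, ic_tns].astype(env.acc_dtype) *
+#           weight_buf[co, ic, ci, ic_tns].astype(env.acc_dtype),
+#           axis=[ic, ic_tns]),
+#       name="res_gemm")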
... 154601 lines suppressed ...